#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include <termios.h>
#include <pthread.h>
#include <sched.h>
#include "axi_dma.h"
#include "libaio.h"
#include "stb_image.c"

// Source video frame buffers, one pointer per frame. main() fills them through
// allocate_buffers() and load_images(); the array holds at most 1000 entries.
uint8_t* buffer[1000];
// The two output buffers the software filter thread alternates between.
uint8_t* sw_fb[2];
// I/O vector handed to the AIO writes; the visible code only uses iov[0].
struct iovec iov[1000];
// Single AIO control block, re-prepared for every submitted frame.
struct iocb viocb;
// Completion event filled in by io_getevents().
struct io_event event;
// Terminal attributes: the ones saved at startup and the modified copy that
// is applied while the program runs.
struct termios initial_settings, new_settings;
// Handshake flag: sw_filter_thread sets it to 1 after finishing a frame,
// main() resets it to 0 once it has switched output buffers.
volatile uint32_t buf_switch;
// Number of source frames the filter thread cycles through (set by main()).
uint32_t video_buffers;
// Set to 1 by main() to ask sw_filter_thread to return.
volatile uint32_t end;

/*
 * qsort() comparison callback for arrays of uint8_t.
 *
 * a, b: pointers to the two uint8_t elements being compared.
 * Returns -1 if *a < *b, 0 if they are equal and 1 if *a > *b.
 */
int compare_uint8_t (const void * a, const void * b)
{
    const uint8_t lhs = *(const uint8_t *)a;
    const uint8_t rhs = *(const uint8_t *)b;

    /* One expression covers all three outcomes, so every path returns a
     * value (the if-chain version could formally fall off the end). */
    return (lhs > rhs) - (lhs < rhs);
}

/*
 * Read the pixel at (x, y) from a maxx-by-maxy image stored row by row in
 * data, tolerating coordinates outside the image:
 *   - a negative coordinate is replaced by its absolute value,
 *   - a coordinate at or beyond the limit is clamped to the last column/row.
 */
static inline uint32_t get_val_rgb(uint32_t* data, int x, int y, int maxx, int maxy)
{
    int col = abs(x);
    int row = abs(y);

    col = (col < maxx) ? col : maxx - 1;
    row = (row < maxy) ? row : maxy - 1;

    return data[row*maxx + col];
}

/*
 * Extract one 8-bit colour channel from a pixel packed as 0x00RRGGBB.
 * The argument is parenthesized so that an expression such as RED(a | b)
 * is shifted/masked as a whole instead of being split by operator precedence.
 */
#define RED(n)   (((n) >> 16) & 0x000000FF)
#define GREEN(n) (((n) >> 8) & 0x000000FF)
#define BLUE(n)  ((n) & 0x000000FF)

/*
 * Pack three channel values into one pixel: r goes to bits 16 and up,
 * g to bits 8 and up, b to the low bits. The inputs are not masked.
 */
static inline uint32_t to_rgb(uint32_t r, uint32_t g, uint32_t b)
{
    return (r << 16) | (g << 8) | b;
}

/*
 * Compute the 3x3 median of the pixel at (x, y), independently for the red,
 * green and blue channels.
 *
 * The nine neighbours are fetched through get_val_rgb(), which is what
 * resolves coordinates that fall outside the maxx-by-maxy image. Each
 * channel's nine samples are sorted and the middle one (index 4) is taken;
 * the three medians are packed back into one pixel with to_rgb().
 */
static inline uint32_t median_rgb_9(uint32_t* data, int x, int y, int maxx, int maxy)
{
    uint8_t chan[3][9];     /* [0] = red, [1] = green, [2] = blue */
    uint32_t pixel;
    int dx, dy, c;
    int n = 0;

    /* Gather the neighbourhood row by row, left to right. */
    for (dy = -1; dy <= 1; dy++)
    {
        for (dx = -1; dx <= 1; dx++)
        {
            pixel = get_val_rgb(data, x + dx, y + dy, maxx, maxy);
            chan[0][n] = RED(pixel);
            chan[1][n] = GREEN(pixel);
            chan[2][n] = BLUE(pixel);
            n++;
        }
    }

    for (c = 0; c < 3; c++)
    {
        qsort(chan[c], 9, sizeof(uint8_t), compare_uint8_t);
    }

    return to_rgb(chan[0][4], chan[1][4], chan[2][4]);
}

/*
 * Apply the 3x3 per-channel median filter to a whole image.
 *
 * dst_data: receives maxx * maxy filtered pixels, written row by row.
 * src_data: input image of maxx * maxy pixels.
 * maxx, maxy: image width and height in pixels.
 */
void median_9(uint32_t *dst_data, uint32_t* src_data, int maxx, int maxy)
{
    uint32_t *out = dst_data;
    int row, col;

    for (row = 0; row < maxy; row++)
    {
        for (col = 0; col < maxx; col++)
        {
            *out++ = median_rgb_9(src_data, col, row, maxx, maxy);
        }
    }
}

/*
 * Worker thread running the software median filter.
 *
 * Pins itself to CPU 1, then cycles through the video_buffers source frames
 * in buffer[], filtering each 1280x720 frame into sw_fb[0] / sw_fb[1]
 * alternately. After every frame it sets buf_switch to 1 and busy-waits until
 * another thread resets buf_switch to 0 before starting the next frame.
 *
 * The thread returns NULL once the global `end` flag becomes non-zero. The
 * flag is checked before each frame as well as while waiting, so the thread
 * also terminates when video_buffers is 0 (previously it would spin forever
 * without ever looking at `end`).
 *
 * par: unused.
 */
void *sw_filter_thread(void *par)
{
    uint32_t current_buffer = 0;
    uint32_t i;
    cpu_set_t cpuset;

    (void)par;

    CPU_ZERO(&cpuset);
    CPU_SET(1, &cpuset);
    // Pinning is best effort: keep running unpinned if it is refused.
    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
    {
        fprintf(stderr, "Failed to pin SW filter thread to CPU 1\n");
    }

    while (!end)
    {
        for (i = 0; i < video_buffers && !end; i++)
        {
            fprintf(stderr, "SW Filter start for frame %u\n", (unsigned)i);
            median_9((uint32_t*)sw_fb[current_buffer], (uint32_t*)buffer[i], 1280, 720);
            fprintf(stderr, "SW Filter finish for frame %u\n", (unsigned)i);

            // Make the filtered frame visible before announcing it.
            __sync_synchronize();
            buf_switch = 1;
            current_buffer = (current_buffer == 0) ? 1 : 0;
            __sync_synchronize();

            // Wait for the consumer to acknowledge the frame.
            while (buf_switch != 0)
            {
                if (end)
                    return NULL;
            }
        }
    }
    return NULL;
}

/*
 * Load `cnt` image files and store each one as a 1280x720 frame.
 *
 * File names are built as "<prefix><5-digit index><suffix>" with the index
 * running from `start` to `start + cnt - 1`. Every image is decoded to three
 * bytes per pixel, rewritten as 4-byte pixels (the three decoded bytes in
 * reverse order followed by a zero byte) and then copied into buffer[k],
 * repeating (tiling) the image when it is smaller than 1280x720 and cropping
 * it when it is larger.
 *
 * buffer: array of at least `cnt` destination frames, each at least
 *         1280 * 720 * 4 bytes.
 * Returns 0 on success, 1 on the first failure (a message is printed to
 * stderr). The scratch buffer is released on every path.
 */
int load_images(uint8_t** buffer, int cnt, char* prefix, int start, char* suffix)
{
    char file_name[512];
    uint8_t *reorder = NULL;    // scratch copy of the current image, 4 bytes/pixel
    size_t reorder_cap = 0;     // bytes currently allocated for reorder
    int status = 1;
    int i, j, k;

    for (k = 0; k < cnt; k++)
    {
        unsigned char *data;
        const uint8_t *src;
        uint8_t *dst;
        uint32_t *frame;
        const uint32_t *pixels;
        size_t needed;
        int x, y, n;
        int len;

        // Create file path (bounded: prefix/suffix come from the command line)
        len = snprintf(file_name, sizeof file_name, "%s%05d%s", prefix, start + k, suffix);
        if (len < 0 || (size_t)len >= sizeof file_name)
        {
            fprintf(stderr, "File name too long for frame %d\n", start + k);
            goto out;
        }

        // Load image, forcing 3 bytes per pixel so the loop below can rely
        // on that layout whatever the file actually contains.
        data = stbi_load(file_name, &x, &y, &n, 3);
        if (data == NULL)
        {
            fprintf(stderr, "Failed to load: %s\n", file_name);
            goto out;
        }
        if (x <= 0 || y <= 0)
        {
            // Guards the modulo operations used for tiling.
            fprintf(stderr, "Invalid image dimensions in: %s\n", file_name);
            stbi_image_free(data);
            goto out;
        }

        // Grow the scratch buffer when this image is larger than any
        // previous one (images are not required to share a size).
        needed = (size_t)x * (size_t)y * 4;
        if (needed > reorder_cap)
        {
            uint8_t *grown = realloc(reorder, needed);
            if (grown == NULL)
            {
                fprintf(stderr, "Failed to allocate reorder buffer\n");
                stbi_image_free(data);
                goto out;
            }
            reorder = grown;
            reorder_cap = needed;
        }

        // Reorder data: reverse the three channel bytes and pad to 4 bytes
        dst = reorder;
        src = data;
        for (i = 0; i < y; i++)
            for (j = 0; j < x; j++)
            {
                dst[0] = src[2];
                dst[1] = src[1];
                dst[2] = src[0];
                dst[3] = 0;
                src += 3;
                dst += 4;
            }

        // Tile/crop the image into the fixed-size frame
        frame = (uint32_t *)buffer[k];
        pixels = (const uint32_t *)reorder;
        for (i = 0; i < 720; i++)
            for (j = 0; j < 1280; j++)
            {
                frame[i*1280+j] = pixels[(size_t)(i % y) * (size_t)x + (size_t)(j % x)];
            }

        fprintf(stderr, ".");

        // Free image data
        stbi_image_free(data);
    }
    status = 0;

out:
    free(reorder);
    return status;
}

/*
 * Map `cnt` buffers of 0x384000 bytes (1280 * 720 * 4) from the file
 * descriptor `f` and store the resulting pointers in buffer[0..cnt-1].
 *
 * Every mapping uses the same offset, MMAP_ALLOC_UOFFSET.
 * NOTE(review): presumably the driver hands out a fresh allocation for each
 * mmap() at that offset - confirm against the driver.
 *
 * Returns 0 on success and 1 as soon as one mapping fails; in that case the
 * failing slot is set to NULL and earlier mappings are left in place.
 */
int allocate_buffers(uint8_t** buffer, int cnt, int f)
{
    int i;

    for (i = 0; i < cnt; i++)
    {
        void *mapping = mmap(NULL, 0x384000, PROT_WRITE, MAP_SHARED, f, MMAP_ALLOC_UOFFSET);

        // mmap() reports failure with MAP_FAILED, never with NULL.
        if (mapping == MAP_FAILED)
        {
            buffer[i] = NULL;
            fprintf(stderr, "Failed to mmap memory at %x\n", MMAP_ALLOC_UOFFSET);
            return 1;
        }
        buffer[i] = mapping;
        fprintf(stderr, ".");
    }
    return 0;
}

// Poll stdin for one key without blocking (the terminal is configured with
// VMIN = 0 / VTIME = 0 by main). Returns the character or EOF when no key is
// pending. The stream's EOF indicator is cleared after an empty poll because
// stdio may otherwise keep returning EOF without reading the terminal again.
static int poll_key(void)
{
    int c = getchar();

    if (c == EOF)
        clearerr(stdin);
    return c;
}

// Submit one 0x384000-byte frame to the DMA device `f` as an asynchronous
// vectored write. Uses the global iov[0] and viocb. A submit failure is only
// reported, as before.
static void tx_submit_frame(io_context_t ctx, int f, uint8_t *frame)
{
    struct iocb *pviocb = &viocb;

    iov[0].iov_base = frame;
    iov[0].iov_len = 0x384000;
    io_prep_pwritev(&viocb, f, iov, 1, 0);
    if (io_submit(ctx, 1, &pviocb) < 0)
    {
        fprintf(stderr, "AIO writev failed\n");
    }
}

// Wait for the completion event of the frame submitted last.
static void tx_wait_frame(io_context_t ctx)
{
    if (io_getevents(ctx, 1, 1, &event, NULL) < 1)
    {
        fprintf(stderr, "AIO get event failed\n");
    }
}

// Command line parameters:
// argv[1] - dma device file, such as /dev/sec6net_dma_driver-0
// argv[2] - start video frame index, eg. 0
// argv[3] - Number of video frames, eg. 50 (1 up to the size of buffer[])
// argv[4] - prefix of video frame filename, eg. frame
// argv[5] - suffix of video frame filename, eg _noisy.png
//
// Loads the frames, then streams them to the DMA device in an endless loop.
// Keys while running:
//   q / Q     - quit
//   f / F     - (while streaming source frames) toggle the value written to
//               the register at word offset 1 of the mapped filter block
//               between 0 and 0xffffffff
//   other key - switch between streaming the source frames and streaming the
//               frames produced by the software filter thread
// Returns 0 after a normal quit and 1 on a setup error.
int main(int argc, char **argv)
{
    int f;
    int fd;
    int j;
    int c;
    int ret;
    int n_frames;
    uint32_t i;
    struct dma_settings settings;
    unsigned char key;
    // MMIO register block: volatile so that every store really reaches it.
    volatile uint32_t *sw_filter;
    uint32_t state;
    uint32_t cnt;
    uint32_t start;
    uint32_t leave;
    uint32_t sw = 0;                // 0: stream source frames, 1: stream SW-filtered frames
    uint32_t current_buffer = 1;
    pthread_t thread;
    cpu_set_t cpuset;
    io_context_t ctx;

    if (argc < 6)
    {
        fprintf(stderr, "Usage: %s <dma-device> <start-index> <frame-count> <prefix> <suffix>\n",
                (argc > 0) ? argv[0] : "video");
        return 1;
    }

    // The frame count indexes the fixed-size buffer[] array, so bound it.
    n_frames = atoi(argv[3]);
    if (n_frames < 1 || (size_t)n_frames > sizeof(buffer) / sizeof(buffer[0]))
    {
        fprintf(stderr, "Frame count must be between 1 and %u\n",
                (unsigned)(sizeof(buffer) / sizeof(buffer[0])));
        return 1;
    }
    cnt = (uint32_t)n_frames;
    video_buffers = cnt;
    start = atoi(argv[2]);

    // Open DMA device file
    f = open(argv[1], O_RDWR);
    if (f == -1)
    {
        fprintf(stderr, "Failed to open file %s\n", argv[1]);
        return 1;
    }

    // Open dev mem (dirty, use libhwio later)
    fd = open("/dev/mem", O_RDWR);
    if (fd == -1)
    {
        fprintf(stderr, "Failed to open file /dev/mem\n");
        return 1;
    }

    // MMap median_filter top address space
    sw_filter = (volatile uint32_t *)mmap(0, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0x41220000);
    if (sw_filter == MAP_FAILED)
    {
        fprintf(stderr, "Failed to mmap filter registers\n");
        return 1;
    }

    fprintf(stderr, "Memory allocation started.\n");
    // Allocate buffers
    ret = allocate_buffers(buffer, cnt, f);
    if (ret)
    {
        fprintf(stderr, "Failed to allocate memory.\n");
        return 1;
    }
    ret = allocate_buffers(sw_fb, 2, f);
    if (ret)
    {
        fprintf(stderr, "Failed to allocate memory.\n");
        return 1;
    }
    fprintf(stderr, "Memory allocation done.\n");

    fprintf(stderr, "Video image loading started.\n");
    ret = load_images(buffer, cnt, argv[4], start, argv[5]);
    if (ret)
    {
        fprintf(stderr, "Failed to load video images.\n");
        return 1;
    }
    fprintf(stderr, "Video image loading done.\n");

    // Set state 0
    state = 0;
    end = 0;

    // Init async aio context and queue
    memset(&ctx, 0, sizeof(ctx));
    ret = io_queue_init(1, &ctx);
    if (ret < 0)
    {
        fprintf(stderr, "Failed to init io queue\n");
        return 1;
    }

    // Set DMA driver settings (failures are reported but not fatal)
    if (ioctl(f, GET_SETTINGS, &settings) < 0)
    {
        fprintf(stderr, "Failed to read DMA settings\n");
    }
    settings.tx_act_timeout = 1;
    settings.tx_inact_timeout = 0;
    settings.allocation_size = 0x384000;
    settings.tx_mode = TX_DIRECT;
    if (ioctl(f, SET_SETTINGS, &settings) < 0)
    {
        fprintf(stderr, "Failed to write DMA settings\n");
    }

    // Set console attributes for reading single char and return EOF if no
    // char is in queue
    fprintf(stderr, "Start setting attr\n");
    tcgetattr(0,&initial_settings);

    new_settings = initial_settings;
    new_settings.c_lflag &= ~ICANON;
    new_settings.c_lflag &= ~ECHO;
    new_settings.c_lflag &= ~ISIG;
    new_settings.c_cc[VMIN] = 0;
    new_settings.c_cc[VTIME] = 0;

    tcsetattr(0, TCSANOW, &new_settings);
    fprintf(stderr, "Done setting attr\n");

    // Start sw median filter thread
    ret = pthread_create(&thread, NULL, sw_filter_thread, NULL);
    if (ret)
    {
        fprintf(stderr, "Failed to create thread.\n");
        // Do not leave the user's terminal in raw, no-echo mode.
        tcsetattr(0, TCSANOW, &initial_settings);
        return 1;
    }

    // Keep the streaming loop on CPU 0 (best effort)
    CPU_ZERO(&cpuset);
    CPU_SET(0, &cpuset);
    if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset) != 0)
    {
        fprintf(stderr, "Failed to pin main thread to CPU 0\n");
    }

    // Start dma
    if (ioctl(f, START_TX_DMA, 0) < 0)
    {
        fprintf(stderr, "Failed to start TX DMA\n");
    }

    // Forever (until interrupted) video loop
    for(;;)
    {
        if (sw == 0)
        {
            // Stream the source frames, each one three times in a row.
            leave = 0;
            for (i = 0; i < cnt && !leave; i++)
            {
                for (j = 0; j < 3 && !leave; j++)
                {
                    tx_submit_frame(ctx, f, buffer[i]);
                    c = poll_key();
                    if (c != EOF)
                    {
                        key = c;
                        if (key == 'q' || key == 'Q')
                        {
                            goto jump_out_final;
                        }
                        if (key == 'f' || key == 'F')
                        {
                            state = (state == 0) ? 0xffffffff : 0;
                            *(sw_filter + 1) = state;
                        }
                        else
                        {
                            sw = 1;
                            leave = 1;
                        }
                    }
                    // Reap the completion even when switching modes, so the
                    // single-slot AIO queue is free for the next submit.
                    tx_wait_frame(ctx);
                }
            }
        }
        if (sw == 1)
        {
            // Stream the frames produced by the software filter thread.
            state = 0xffffffff;
            *(sw_filter + 1) = state;
            leave = 0;
            while (!leave)
            {
                // Switch to the other output buffer when the filter thread
                // has announced a finished frame, and acknowledge it.
                __sync_synchronize();
                if (buf_switch == 1)
                {
                    current_buffer = (current_buffer == 0) ? 1 : 0;
                    buf_switch = 0;
                }
                __sync_synchronize();
                tx_submit_frame(ctx, f, sw_fb[current_buffer]);
                c = poll_key();
                if (c != EOF)
                {
                    key = c;
                    if (key == 'q' || key == 'Q')
                    {
                        goto jump_out_final;
                    }
                    sw = 0;
                    leave = 1;
                }
                tx_wait_frame(ctx);
            }
            state = 0;
            *(sw_filter + 1) = state;
        }
    }
jump_out_final:
    // Stop the filter thread and wait for it before unmapping the buffers
    // it reads and writes.
    end = 1;
    __sync_synchronize();
    pthread_join(thread, NULL);

    io_queue_release(ctx);
    close(f);
    for (i = 0; i < cnt; i++)
    {
        if(munmap(buffer[i], 0x384000))
            fprintf(stderr, "Failed to unmap buffer %u\n", (unsigned)i);
    }
    for (i = 0; i < 2; i++)
    {
        if(munmap(sw_fb[i], 0x384000))
            fprintf(stderr, "Failed to unmap buffer %u\n", (unsigned)i);
    }
    munmap((void *)sw_filter, 4096);
    close(fd);
    tcsetattr(0, TCSANOW, &initial_settings);
    return 0;
}
