#ifndef __INT_WORKGROUP_SCAN_AND_REDUCE_INCLUDED
#define __INT_WORKGROUP_SCAN_AND_REDUCE_INCLUDED

#include "../kernel_utils/NVIDIA.h"

// todo - transform this into macros to define the functions, this won't work with pragma once / include once

#ifndef REDUCE_LOCAL_WORK_SIZE
#error "error: REDUCE_LOCAL_WORK_SIZE not defined in IntScanReduce.h"
#endif // !REDUCE_LOCAL_WORK_SIZE

#ifndef SCAN_LOCAL_WORK_SIZE
#error "error: SCAN_LOCAL_WORK_SIZE not defined in IntScanReduce.h"
#endif // !SCAN_LOCAL_WORK_SIZE

#if defined(NVIDIA) && !defined(DISABLE_NV_SHFL)

#include "../kernel_utils/NV30.h"

/**
 *	@brief local memory workspace of n_LocalReduce_Int() (warp shuffle version)
 */
typedef struct {
	int p_storage[WARP_SIZE]; /**< @brief one slot per section of the work-group (written by the last lane of each section) */
} TLocalReduceStorage_Int;

/**
 *	@brief reduces (sums) one value per work-item to a single work-group total
 *
 *	The work-group is split to n_section_num sections of n_section_size consecutive
 *	work-items. Each section is combined using shfl_add_int32_t() and the last lane
 *	of each section stores its value to the shared storage. The first n_section_num
 *	work-items then combine the stored partials the same way.
 *
 *	@param[in] l is local thread id
 *	@param[in] x is the value contributed by this work-item
 *	@param[in,out] p_storage is pointer to shared storage (only the first
 *		n_section_num elements are accessed; that is at most WARP_SIZE)
 *
 *	@return Returns the reduction result (all the work-items read the same slot
 *		after a barrier).
 *
 *	@note The result is left in p_storage->p_storage[0].
 *	@note This contains barriers, it must be called by all the work-items of the
 *		work-group. There is no barrier after the final read; synchronize before
 *		the storage is written again.
 */
inline int n_LocalReduce_Int(const unsigned int l, int x, __local TLocalReduceStorage_Int *p_storage)
{
	__local int *p_sh_mem = p_storage->p_storage; // ...

	enum {
#if b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = WARP_SIZE,
		n_section_size = REDUCE_LOCAL_WORK_SIZE / n_section_num,
#else // !b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = REDUCE_LOCAL_WORK_SIZE / WARP_SIZE,
		n_section_size = WARP_SIZE,
#endif // !b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_size_log2 = n_Log2(n_section_size)
	};
	const int n_lane = l & (n_section_size - 1);
	const int n_section = l >> n_section_size_log2;
	// want to reduce the tile to n_section_num items and then do a single warp-cooperative pass

	#pragma unroll
	for(int offset = 1; offset < n_section_size; offset *= 2)
		x = shfl_add_int32_t(x, offset, n_section_size);
	// combine the values inside of each section (presumably an inclusive
	// scan step, the section total ends up in the last lane - see NV30.h)

	if(n_section_size - 1 == n_lane)
		p_sh_mem[n_section] = x;
	// the last lane of each section publishes the section total

	barrier(CLK_LOCAL_MEM_FENCE);

	if(l < n_section_num) {
		x = p_sh_mem[l];
		#pragma unroll
		for(int offset = 1; offset < n_section_num; offset *= 2)
			x = shfl_add_int32_t(x, offset, n_section_num);
		// combine the section totals

		if(n_section_num - 1 == l)
			p_sh_mem[0] = x;
		// the last lane holds the total; store it in slot 0 where the note above
		// (and the non-shuffle implementation) promise it to be. lane 0 has loaded
		// its partial from slot 0 before taking part in the shuffles above.
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	return p_sh_mem[0];
}

#if b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

/**
 *	@brief local memory workspace of n_LocalExScan_Int() (warp shuffle version)
 *
 *	The scan keeps one partial sum per segment (there are WARP_SIZE segments)
 *	and stores the grand total in one extra element behind them.
 */
typedef struct {
	int p_workspace[WARP_SIZE + 1];
} TLocalScanStorage_Int;

/**
 *	@brief calculates work-group exclusive scan (prefix sum) using warp shuffles
 *
 *	The work-group is split to WARP_SIZE segments of SCAN_LOCAL_WORK_SIZE / WARP_SIZE
 *	consecutive work-items. Each segment is scanned using shfl_add_int32_t(), the
 *	segment totals are scanned by the first WARP_SIZE work-items and the resulting
 *	segment offsets are added back.
 *
 *	@param[in] li is local thread id
 *	@param[in] x is the value contributed by this work-item
 *	@param[out] p_sum is filled with the sum of the values of all the work-items
 *	@param[in,out] p_storage is pointer to shared storage
 *
 *	@return Returns the sum of the values of the work-items with local id lower
 *		than li (assuming shfl_add_int32_t() performs an inclusive scan step -
 *		confirm in NV30.h).
 *
 *	@note This contains barriers, it must be called by all the work-items of the
 *		work-group. There is no barrier after the final reads; synchronize before
 *		the storage is written again.
 */
inline int n_LocalExScan_Int(const int li, int x, int *p_sum, __local TLocalScanStorage_Int *p_storage)
{
	__local int *p_shared = p_storage->p_workspace;

	enum {
		// this function is only compiled if SCAN_LOCAL_WORK_SIZE / WARP_SIZE
		// is a power of two, so the segments can always be laid out this way
		n_segment_num = WARP_SIZE,
		n_segment_size = SCAN_LOCAL_WORK_SIZE / n_segment_num,
		n_segment_size_log2 = n_Log2(n_segment_size)
	};
	// want to reduce the tile to WARP_SIZE items and then do a single warp-cooperative scan
	// define warp_size segments that are nt / warp_size large
	// each segment makes log(segsize) shfl_add calls
	// the spine makes log(warp_size) shfl_add calls

	const int n_lane = li & (n_segment_size - 1);
	const int n_segment = li >> n_segment_size_log2;

	int scan = x;
	#pragma unroll
	for(int offset = 1; offset < n_segment_size; offset *= 2)
		scan = shfl_add_int32_t(scan, offset, n_segment_size);
	// scan each segment

	if(n_lane == n_segment_size - 1)
		p_shared[n_segment] = scan;
	// store the reduction (last element) of each segment into storage

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < n_segment_num) {
		const int n_partial = p_shared[li];
		int n_spine = n_partial;
		#pragma unroll
		for(int offset = 1; offset < n_segment_num; offset *= 2)
			n_spine = shfl_add_int32_t(n_spine, offset, n_segment_num);
		p_shared[li] = n_spine - n_partial; // exclusive offset of this segment
		if(n_segment_num - 1 == li)
			p_shared[n_segment_num] = n_spine; // the grand total
	}
	// the first n_segment_num work-items scan the partials, the total is
	// stored to p_shared[n_segment_num]

	barrier(CLK_LOCAL_MEM_FENCE);

	scan += p_shared[n_segment];
	scan -= x;
	// add the scanned partials back in and convert to exclusive scan

	*p_sum = p_shared[n_segment_num];

	return scan;
}

#endif // b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

#endif // NVIDIA && !DISABLE_NV_SHFL

#if !defined(NVIDIA) || defined(DISABLE_NV_SHFL)

/**
 *	@brief local memory workspace of n_LocalReduce_Int() (version without warp shuffles)
 */
typedef struct {
	int p_storage[REDUCE_LOCAL_WORK_SIZE]; /**< @brief one slot per work-item; the values are summed in place */
} TLocalReduceStorage_Int;

/**
 *	@brief reduces a tile of an array in local memory
 *
 *	Every work-item stores its value to the shared storage, which is then summed
 *	in place by repeatedly adding the upper part of the tile to the lower part.
 *
 *	@param[in] l is local thread id
 *	@param[in] x is the value contributed by this work-item
 *	@param[in,out] p_storage is pointer to shared storage (only REDUCE_LOCAL_WORK_SIZE elements is accessed)
 *
 *	@return Returns the reduction result (all the work-items read it after a barrier).
 *
 *	@note The result is left in p_storage->p_storage[0].
 *	@note This contains barriers, it must be called by all the work-items of the
 *		work-group. There is no barrier after the final read; synchronize before
 *		the storage is written again.
 *	@note NOTE(review): the last steps (the tail of at most 32 / WARP_SIZE work-items)
 *		are separated only by memory fences, which presumably relies on those
 *		work-items running in lockstep; confirm for the targeted devices.
 */
inline int n_LocalReduce_Int(const unsigned int l, int x, __local TLocalReduceStorage_Int *p_storage)
{
	__local int *p_sh_mem = p_storage->p_storage; // ...

	p_sh_mem[l] = x; // store

#if REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
#if REDUCE_LOCAL_WORK_SIZE >= 2048
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 1024)
		p_sh_mem[l] += p_sh_mem[l + 1024];
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 1024
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 512)
		p_sh_mem[l] += p_sh_mem[l + 512];
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256)
		p_sh_mem[l] += p_sh_mem[l + 256];
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128)
		p_sh_mem[l] += p_sh_mem[l + 128];
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64)
		p_sh_mem[l] += p_sh_mem[l + 64];
#endif
	// reduce down to (at most) 64 elements stored in the shared memory

	// the below section is within a single warp, want to avoid divergence
	// even though unnecessary reductions are made no barriers required,
	// just a memory fence to avoid compiler optimization
#if REDUCE_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCE_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCE_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCE_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCE_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCE_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // the storage has just two elements, work-item 1 must not read p_sh_mem[2]
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 64
		p_sh_mem[l] += p_sh_mem[l + 32];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 32
		p_sh_mem[l] += p_sh_mem[l + 16];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 16
		p_sh_mem[l] += p_sh_mem[l + 8];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 8
		p_sh_mem[l] += p_sh_mem[l + 4];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 4
		p_sh_mem[l] += p_sh_mem[l + 2];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 2
		p_sh_mem[l] += p_sh_mem[l + 1];
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
	}
	// reduce results down to one result per work group
#else // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
	enum {
		n_first = n_Make_Lower_POT(REDUCE_LOCAL_WORK_SIZE),
		n_first_warp = (n_first > WARP_SIZE)? WARP_SIZE : n_first
	};

	#pragma unroll
	for(int offset = n_first; offset > n_first_warp; offset /= 2) {
		int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2;
		barrier(CLK_LOCAL_MEM_FENCE);
		if(l < size - offset)
			p_sh_mem[l] += p_sh_mem[l + offset];
	}
	// steps wider than the final tail, separated by barriers; the first step folds
	// the elements above n_first, the following ones halve a power-of-two tile

	if(REDUCE_LOCAL_WORK_SIZE > WARP_SIZE)
		barrier(CLK_LOCAL_MEM_FENCE);

	if(l < n_first_warp) {
		#pragma unroll
		for(int offset = n_first_warp; offset > 0; offset /= 2) {
			int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2; // not "offset == n_first_warp" but rather "offset == n_first_warp && n_first_warp == n_first" or for short "offset == n_first"
			write_mem_fence(CLK_LOCAL_MEM_FENCE);
			if(n_first_warp < n_first || l < size - offset) // skip the condition if the steps above left 2 * n_first_warp valid elements; otherwise this tail also performs the first (partial) step and must not add elements past the tile
				p_sh_mem[l] += p_sh_mem[l + offset];
		}
	}
#endif // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)

	barrier(CLK_LOCAL_MEM_FENCE);
	// the tail above is only executed by the first work-items; the others need
	// to wait for it to finish so that all the work-items read the final value

	return p_sh_mem[0];
}

#endif // !NVIDIA || DISABLE_NV_SHFL

#if !defined(NVIDIA) || defined(DISABLE_NV_SHFL) || !b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

/**
 *	@brief local memory workspace of n_LocalExScan_Int() (version without warp shuffles)
 */
typedef struct {
	int p_workspace[2 * SCAN_LOCAL_WORK_SIZE + 1]; /**< @brief two halves of SCAN_LOCAL_WORK_SIZE elements each, used alternately as the source and the destination of the scan steps (the scan does not access the last element) */
} TLocalScanStorage_Int;

// work-group exclusive scan (prefix sum) of one int per work-item
//
// the workspace is used as two buffers of SCAN_LOCAL_WORK_SIZE elements; each step reads
// the partial sums from one buffer and writes the updated ones to the other one, so that
// a single barrier per step is enough
//
// li is local thread id, x is the value of this work-item, *p_sum receives the sum of all
// the values; returns the sum of the values of the work-items with lower local id
//
// contains barriers, must be called by all the work-items of the work-group
inline int n_LocalExScan_Int(const int li, int x, int *p_sum, __local TLocalScanStorage_Int *p_storage)
{
	__local int *p_buffer = p_storage->p_workspace;

	int n_src = 0; // offset of the buffer holding the current partial sums

	p_buffer[n_src + li] = x;
	barrier(CLK_LOCAL_MEM_FENCE);
	// publish the inputs

	#pragma unroll
	for(int n_stride = 1; n_stride < SCAN_LOCAL_WORK_SIZE; n_stride <<= 1) {
		const int n_dst = SCAN_LOCAL_WORK_SIZE - n_src; // the other buffer

		if(li >= n_stride)
			x += p_buffer[n_src + li - n_stride];
		p_buffer[n_dst + li] = x;
		// add the partial sum lying n_stride elements to the left, write to the other buffer

		n_src = n_dst;
		barrier(CLK_LOCAL_MEM_FENCE);
	}
	// this is not work-efficient but takes fewer steps than Harris' scan;
	// the buffer at n_src now holds the inclusive scan

	*p_sum = p_buffer[n_src + SCAN_LOCAL_WORK_SIZE - 1];
	// the last element of the inclusive scan is the total

	if(li == 0)
		return 0;
	return p_buffer[n_src + li - 1];
	// shift the inclusive scan right by one and zero the first element to make it exclusive
}

#endif // !NVIDIA || DISABLE_NV_SHFL || !b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

#endif // !__INT_WORKGROUP_SCAN_AND_REDUCE_INCLUDED
