
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_nv_pragma_unroll: enable
// want to use doubles and unroll loops

/**
 *	@def TWOWORD
 *	@brief macro which joins two words into a single macro argument
 *
 *	Expands to its two arguments separated by whitespace (<tt>a b</tt>).
 *
 *	@param[in] a is the first word
 *	@param[in] b is the second word
 *
 *	@note Presumably intended for passing two-word type names (such as
 *		<tt>unsigned int</tt>) through the compiler commandline - to be confirmed
 *		against the host code.
 */
#define TWOWORD(a,b) a b

/**
 *	@def __restrict
 *	@brief defined as empty, which removes the __restrict qualifier from
 *		the pointer declarations below
 */
#define __restrict
// this is deliberate: surprisingly, restrict gains consistently lower rates. ugh.

#include "../kernel_utils/StaticAssert.h"
#include "../kernel_utils/VendorDetect.h"
#include "../kernel_utils/Integer.h"
#include "../kernel_utils/NVIDIA.h" // includes them all
#include "../kernel_utils/LoadStore.h"

// ---- scan reduce macros ----

//#define SCAN_SCALAR_TYPE uint32_t
//#define SCAN_SCALAR_SIZE 4
// configured via compiler commandline

/**
 *	@brief scalar type which the reductions and scans in this file operate on
 *	@note This is an alias of SCAN_SCALAR_TYPE, which is configured via compiler commandline.
 */
typedef SCAN_SCALAR_TYPE _TyScalar;

STATIC_ASSERT(SCAN_SCALAR_SIZE == sizeof(_TyScalar), SCAN_SCALAR_SIZE_SCAN_SCALAR_TYPE_MISMATCH); // make sure that the configured size matches the configured type

//#define SCAN_LOCAL_WORK_SIZE 256 // best on GTX-680 (over 74 GB/sec, over 92 GB/sec on GTX-780)
//#define SCAN_TILE_SIZE 1024 // or just (2 * SCAN_LOCAL_WORK_SIZE)
// configured via compiler commandline

//STATIC_ASSERT(b_Is_POT(SCAN_TILE_SIZE), SCAN_TILE_SIZE_MUST_BE_POWER_OF_TWO); // does it need to be? do not think so, not anymore with the workgroup approach

/**
 *	@def SCAN_ITEMS_THREAD
 *	@brief number of tile elements per thread (SCAN_TILE_SIZE / SCAN_LOCAL_WORK_SIZE)
 */
#define SCAN_ITEMS_THREAD  ((SCAN_TILE_SIZE) / (SCAN_LOCAL_WORK_SIZE))
//#define SCAN_ITEMS_THREAD_HALF  ((SCAN_ITEMS_THREAD) / 2) // not needed any more, now we're working with the warp tiles
// could be enums, but preprocessor conditions fail if they are :-/

STATIC_ASSERT(SCAN_TILE_SIZE == SCAN_ITEMS_THREAD * SCAN_LOCAL_WORK_SIZE, SCAN_TILE_SIZE_MUST_BE_MULTIPLE_OF_SCAN_LOCAL_WORK_SIZE);
//STATIC_ASSERT(b_Is_POT(SCAN_ITEMS_THREAD) && SCAN_ITEMS_THREAD >= 2, SCAN_ITEMS_THREAD_MUST_BE_AT_LEAST_TWO); // not any more, now we're working with the warp tiles
// SCAN_TILE_SIZE must be a multiple of SCAN_LOCAL_WORK_SIZE (being at least double is not enforced any more)

/**
 *	@def REQUIRE_WG_SIZE
 *	@brief enables kernel to force a specified workgroup size
 *
 *	Expands to the reqd_work_group_size attribute with the given size
 *	in the first dimension and 1 in the remaining two dimensions.
 *
 *	@param[in] n_size is the required workgroup size (in the first dimension)
 *
 *	@note In case the launch workgroup size doesn't match,
 *		CL_INVALID_WORK_GROUP_SIZE (-54) is returned from clEnqueueNDRangeKernel.
 */
#define REQUIRE_WG_SIZE(n_size) __attribute__((reqd_work_group_size(n_size, 1, 1)))

// reduction parts

#define REDUCE_LOCAL_WORK_SIZE SCAN_LOCAL_WORK_SIZE // number of threads in a reduction workgroup
#define REDUCE_TILE_SIZE SCAN_TILE_SIZE // number of elements in a reduction tile
#define REDUCE_ITEMS_THREAD  ((REDUCE_TILE_SIZE) / (REDUCE_LOCAL_WORK_SIZE)) // number of tile elements per thread
// the reduction uses the same workgroup and tile configuration as the scan

//#define REDUCTION_ELEM_OP ((x))
//#define REDUCTION_REDUCE_OP ((x) + (y))
//#define REDUCTION_REDUCE_OPERATOR '+'
//#define REDUCTION_FINAL_OP ((x))
//#define REDUCTION_IDENTITY ((_TyScalar)0)
// configured via compiler commandline

/**
 *	@brief function wrapper for the REDUCTION_ELEM_OP expression
 *	@param[in] x is the operand (the expression refers to it by this name)
 *	@return Returns the value of REDUCTION_ELEM_OP, converted to _TyScalar.
 */
inline _TyScalar t_ReductionElemOp(_TyScalar x)
{
	const _TyScalar t_value = (REDUCTION_ELEM_OP);
	return t_value;
}

/**
 *	@brief function wrapper for the REDUCTION_REDUCE_OP expression
 *
 *	@param[in] x is the first operand (the expression refers to it by this name)
 *	@param[in] y is the second operand (the expression refers to it by this name)
 *
 *	@return Returns the value of REDUCTION_REDUCE_OP, converted to _TyScalar.
 */
inline _TyScalar t_ReductionReduceOp(_TyScalar x, _TyScalar y)
{
	const _TyScalar t_value = (REDUCTION_REDUCE_OP);
	return t_value;
}

/**
 *	@brief function wrapper for the REDUCTION_FINAL_OP expression
 *	@param[in] x is the operand (the expression refers to it by this name)
 *	@return Returns the value of REDUCTION_FINAL_OP, converted to _TyScalar.
 */
inline _TyScalar t_ReductionFinalOp(_TyScalar x)
{
	const _TyScalar t_value = (REDUCTION_FINAL_OP);
	return t_value;
}
// the wrappers only exist to keep the code below readable

#if defined(NVIDIA) && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' || \
	REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !defined(DISABLE_NV_SHFL)

#pragma message("building the NV SHFL branch")

#include "../kernel_utils/NV30.h"

#if REDUCTION_REDUCE_OPERATOR == '+'

/**
 *	@brief "overloaded" shfl_add function for the current scalar type
 *
 *	Forwards the arguments to the function whose name is pasted together
 *	from shfl_add, an underscore and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value of the calling thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the result of the type-specific shfl_add function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_result = CONCAT(shfl_add, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_result;
}

#elif REDUCTION_REDUCE_OPERATOR == '-'

/**
 *	@brief "overloaded" shfl_sub function for the current scalar type
 *
 *	Forwards the arguments to the function whose name is pasted together
 *	from shfl_sub, an underscore and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value of the calling thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the result of the type-specific shfl_sub function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_result = CONCAT(shfl_sub, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_result;
}

#elif REDUCTION_REDUCE_OPERATOR == '*'

/**
 *	@brief "overloaded" shfl_mul function for the current scalar type
 *
 *	Forwards the arguments to the function whose name is pasted together
 *	from shfl_mul, an underscore and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value of the calling thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the result of the type-specific shfl_mul function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_result = CONCAT(shfl_mul, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_result;
}

#endif // REDUCTION_REDUCE_OPERATOR == '+'

/**
 *	@brief local memory workspace for t_LocalReduce() (shuffle branch)
 */
typedef struct {
	_TyScalar p_storage[WARP_SIZE]; // t_LocalReduce() stores one partial reduction per section in here
} TLocalReduceStorage;

/**
 *	@brief reduces one value per thread to a single value per workgroup (shuffle branch)
 *
 *	The workgroup is split to n_section_num sections of n_section_size consecutive
 *	threads. The values are first combined inside each section using shfl_reduce(),
 *	the last lane of each section stores the section result to local memory and
 *	the first n_section_num threads then combine the section results the same way.
 *
 *	@param[in] l is local thread id
 *	@param[in] x is the value contributed by this thread
 *	@param[in,out] p_storage is pointer to the local memory workspace
 *
 *	@return Returns the value left in the last used workspace element. It is read
 *		after a barrier, so all the threads of the workgroup return the same value.
 *
 *	@note This contains barriers and must be called by all the threads of the workgroup.
 *	@note NOTE(review): this presumably relies on shfl_reduce() leaving the combined value
 *		in the last lane of each section - confirm against NV30.h.
 */
inline _TyScalar t_LocalReduce(const unsigned int l, _TyScalar x, __local TLocalReduceStorage *p_storage)
{
	__local _TyScalar *p_sh_mem = p_storage->p_storage; // ...

	//STATIC_ASSERT(b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE), NVIDIA_SHUFFLE_REDUCE_ONLY_WORKS_WITH_POWER_OF_TWO_WORKGROUPS);

	enum {
#if b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = WARP_SIZE,
		n_section_size = REDUCE_LOCAL_WORK_SIZE / n_section_num,
#else // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = REDUCE_LOCAL_WORK_SIZE / WARP_SIZE,
		n_section_size = WARP_SIZE,
#endif // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_size_log2 = n_Log2(n_section_size)
	};
	const int n_lane = l & (n_section_size - 1); // index of this thread inside its section
	const int n_section = l >> n_section_size_log2; // index of the section of this thread
	// want to reduce the tile to n_section_num items (WARP_SIZE of them in the power-of-two
	// case) and then combine those in the first threads of the workgroup

    #pragma unroll
    for(int offset = 1; offset < n_section_size; offset *= 2)
        x = shfl_reduce(x, offset, n_section_size);
	// combine the values inside each section

	if(n_section_size - 1 == n_lane)
		p_sh_mem[n_section] = x;
	// the last lane of each section stores the section result

	barrier(CLK_LOCAL_MEM_FENCE);

	if(l < n_section_num) {
        x = p_sh_mem[l];
        #pragma unroll
        for(int offset = 1; offset < n_section_num; offset *= 2)
            x = shfl_reduce(x, offset, n_section_num);
        p_sh_mem[l] = x;
    }
	// the first n_section_num threads combine the section results

    barrier(CLK_LOCAL_MEM_FENCE);

    _TyScalar reduction = p_sh_mem[n_section_num - 1]; // all the threads read the same element

    return reduction;
}

#else // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' || \
//	REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL

#pragma message("building the generic reduce branch")

/**
 *	@brief local memory workspace for t_LocalReduce() (generic branch)
 */
typedef struct {
	_TyScalar p_storage[REDUCE_LOCAL_WORK_SIZE]; // one element per thread of the workgroup
} TLocalReduceStorage;

/**
 *	@brief reduces one value per thread to a single value per workgroup, using local memory
 *
 *	@param[in] l is local thread id
 *	@param[in] x is the value contributed by this thread
 *	@param[in,out] p_storage is pointer to the local memory workspace
 *		(REDUCE_LOCAL_WORK_SIZE elements are accessed)
 *
 *	@return Returns the contents of the first workspace element, where the result is accumulated.
 *
 *	@note The result is left in p_storage->p_storage[0].
 *	@note There is no barrier between the last reduction step and the read of the result, only
 *		thread 0 (which writes the final value itself) is guaranteed to return the finished
 *		reduction. The kernels in this file only use the value returned in thread 0.
 *	@note The last steps are carried out by the first threads of the workgroup with only
 *		memory fences (no barriers) in between, which relies on those threads running in lockstep.
 *	@note This contains barriers and must be called by all the threads of the workgroup.
 *	@note This version uses REDUCTION_REDUCE_OP operation and _TyScalar.
 */
inline _TyScalar t_LocalReduce(const unsigned int l, _TyScalar x, __local TLocalReduceStorage *p_storage)
{
	__local _TyScalar *p_sh_mem = p_storage->p_storage; // ...

	p_sh_mem[l] = x; // store

#if REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
#if REDUCE_LOCAL_WORK_SIZE >= 2048
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 1024)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 1024]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 1024
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 512)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 512]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 256]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 128]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 64]);
#endif
	// reduce down to 64 elements stored in the shared memory

	// the below section is within a single warp, want to avoid divergence
	// even though unneccessary reductions are made no barriers required,
	// just a memory fence to avoid compiler optimization
#if REDUCE_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCE_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCE_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCE_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCE_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCE_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // with two threads, only thread 0 may read p_sh_mem[l + 1] (thread 1 would read past the end of the storage)
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 64
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 32]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 32
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 16]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 16
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 8]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 8
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 4]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 4
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 2]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 2
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 1]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE); // !! so that the threads in this section see the same value
#endif
	}
	// reduce results down to one result per work group
#else // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
	enum {
		n_first = n_Make_Lower_POT(REDUCE_LOCAL_WORK_SIZE),
		n_first_warp = (n_first > WARP_SIZE)? WARP_SIZE : n_first
	};
	// n_first is the offset of the first (ragged) reduction step,
	// n_first_warp is the offset of the first step that runs without barriers

	#pragma unroll
	for(int offset = n_first; offset > n_first_warp; offset /= 2) {
		int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2; // number of elements still to be reduced
		barrier(CLK_LOCAL_MEM_FENCE);
		if(l < size - offset)
			p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + offset]);
	}

	if(REDUCE_LOCAL_WORK_SIZE > WARP_SIZE)
		barrier(CLK_LOCAL_MEM_FENCE);

	if(l < n_first_warp) {
		#pragma unroll
		for(int offset = n_first_warp; offset > 0; offset /= 2) {
			int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2; // not "offset == n_first_warp" but rather "offset == n_first_warp && n_first_warp == n_first" or for short "offset == n_first"
			write_mem_fence(CLK_LOCAL_MEM_FENCE);
			if(n_first > n_first_warp || l < size - offset) // the bound check can only be skipped if the ragged first step was already handled by the loop above; in that case the storage holds at least 2 * WARP_SIZE elements so l + offset stays inside it. otherwise (n_first <= WARP_SIZE) the check is needed to stay inside the storage and to keep stale elements out of the result
				p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + offset]);
		}
	}

	write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)

	return p_sh_mem[0]; // no barrier before this read, see the notes in the function documentation
}

#endif // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' ||
	   // REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL

#ifdef BUILD_REDUCE

// computes one reduction per tile of REDUCE_TILE_SIZE elements of p_data and stores it in p_reduce (one element per workgroup)
// launched with workgroups of REDUCE_LOCAL_WORK_SIZE threads, one workgroup per tile; each thread handles REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements
// n_size is the number of elements of p_data; elements past the end are replaced by REDUCTION_IDENTITY
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce(__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size)
{
	const unsigned int n_lid = get_local_id(0);
	const unsigned int n_grp = get_group_id(0);

	const unsigned int n_tile_begin = n_grp * REDUCE_TILE_SIZE;
	const unsigned int n_avail = (n_size > n_tile_begin)? n_size - n_tile_begin : 0;
	// number of elements between the beginning of this tile and the end of the data
	// (zero instead of an underflow if too many workgroups were launched)

	p_data += n_tile_begin;
	// move to the beginning of this tile

	_TyScalar p_regs[REDUCE_ITEMS_THREAD]; // only accessed in unrolled code, so that it can stay in registers
	GLOBAL_TO_REGISTER(p_regs, n_lid, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
		p_data, n_avail, REDUCTION_IDENTITY);
	// load the elements of this thread

	_TyScalar t_accum = t_ReductionElemOp(p_regs[0]);
	#pragma unroll
	for(int n_item = 1; n_item < REDUCE_ITEMS_THREAD; ++ n_item) {
		const _TyScalar t_elem = t_ReductionElemOp(p_regs[n_item]);
		t_accum = t_ReductionReduceOp(t_accum, t_elem);
	}
	// reduce the elements of this thread to a single value

	__local TLocalReduceStorage t_workspace;
	const _TyScalar t_tile_result = t_LocalReduce(n_lid, t_accum, &t_workspace);
	// reduce the values of all the threads in the workgroup

	if(n_lid == 0)
		p_reduce[n_grp] = t_tile_result;
	// the first thread writes the result for this tile
}

// reduces p_data to p_reduce; each workgroup reduces a run of up to n_tiles_per_workgroup consecutive tiles of REDUCE_TILE_SIZE elements to a single value
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements of each tile
// a workgroup whose run begins at or beyond n_size (or which is given n_tiles_per_workgroup = 0) loads no data and only reduces REDUCTION_IDENTITY values
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_Multi(
	__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size, unsigned int n_tiles_per_workgroup)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_span = REDUCE_TILE_SIZE * n_tiles_per_workgroup; // number of elements assigned to each workgroup
	const unsigned int n_start = g * n_span;

	p_data += n_start;

	unsigned int n_remainder = min(max(n_size, n_start), n_start + n_span) - n_start;
	// number of elements this workgroup really has; n_size is clamped from below so that
	// this is zero (rather than an underflow) if the run begins at or beyond n_size

	enum {
		b_pot_tiles = b_Is_POT(REDUCE_TILE_SIZE),
		n_log2_tile_size = n_Log2(REDUCE_TILE_SIZE)
	};
	if(b_pot_tiles) // for some reason this is needed, the compiler won't simplify the division by itself
		n_tiles_per_workgroup -= (n_span - n_remainder) >> n_log2_tile_size; // uses right shift for POT tile sizes
	else
		n_tiles_per_workgroup -= (n_span - n_remainder) / REDUCE_TILE_SIZE;
	// drop the tiles which are entirely past the end of the data; this leaves
	// ceil(n_remainder / REDUCE_TILE_SIZE) tiles, which may be zero

	_TyScalar partial = REDUCTION_IDENTITY;
	for(; n_tiles_per_workgroup != 0; -- n_tiles_per_workgroup) { // the test comes first so that zero tiles do not wrap the counter around
		_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
		GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
			p_data, n_remainder, REDUCTION_IDENTITY);
		// copy data to registers

		#pragma unroll
		for(int i = 0; i < REDUCE_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform per-thread reductions

		p_data += REDUCE_TILE_SIZE;
		n_remainder -= REDUCE_TILE_SIZE; // may wrap around after the last tile, it is not used after that
		// will do another consecutive tile
	}

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = reduction;
	// store the tile reduction
}

#endif // BUILD_REDUCE

#if 0 //def BUILD_ELEMENT_OP_LESS_REDUCE // todo - not sure if this is needed, provided that t_ReductionElemOp() is inverse of t_ReductionFinalOp() - otherwise there would be trouble with scan as well

// reduces p_data to p_reduce, in tiles of REDUCE_TILE_SIZE elements; neither the element operation nor the final operation is applied
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_NoElemOp(__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_start = g * REDUCE_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the beginning of this tile to the end of the data
	// (zero rather than an underflow if too many work-groups were launched)

	p_data += n_start;

	_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
		p_data, n_remainder, REDUCTION_IDENTITY);
	// copy data to registers

	_TyScalar partial = /*t_ReductionElemOp*/(p_my_data[0]); // the first element is taken as is, the same as the remaining ones
	#pragma unroll
	for(int i = 1; i < REDUCE_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, /*t_ReductionElemOp*/(p_my_data[i]));
	// perform per-thread reductions

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = /*t_ReductionFinalOp*/(reduction);
	// store the tile reduction
}

// reduces p_data to p_reduce; each workgroup reduces a run of up to n_tiles_per_workgroup consecutive tiles of REDUCE_TILE_SIZE elements to a single value; neither the element operation nor the final operation is applied
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements of each tile
// a workgroup whose run begins at or beyond n_size (or which is given n_tiles_per_workgroup = 0) loads no data and only reduces REDUCTION_IDENTITY values
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_Multi_NoElemOp(
	__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size, unsigned int n_tiles_per_workgroup)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_span = REDUCE_TILE_SIZE * n_tiles_per_workgroup; // number of elements assigned to each workgroup
	const unsigned int n_start = g * n_span;

	p_data += n_start;

	unsigned int n_remainder = min(max(n_size, n_start), n_start + n_span) - n_start;
	// number of elements this workgroup really has; n_size is clamped from below so that
	// this is zero (rather than an underflow) if the run begins at or beyond n_size

	enum {
		b_pot_tiles = b_Is_POT(REDUCE_TILE_SIZE),
		n_log2_tile_size = n_Log2(REDUCE_TILE_SIZE)
	};
	if(b_pot_tiles) // for some reason this is needed, the compiler won't simplify the division by itself
		n_tiles_per_workgroup -= (n_span - n_remainder) >> n_log2_tile_size; // uses right shift for POT tile sizes
	else
		n_tiles_per_workgroup -= (n_span - n_remainder) / REDUCE_TILE_SIZE;
	// drop the tiles which are entirely past the end of the data; this leaves
	// ceil(n_remainder / REDUCE_TILE_SIZE) tiles, which may be zero

	_TyScalar partial = REDUCTION_IDENTITY;
	for(; n_tiles_per_workgroup != 0; -- n_tiles_per_workgroup) { // the test comes first so that zero tiles do not wrap the counter around
		_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
		GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
			p_data, n_remainder, REDUCTION_IDENTITY);
		// copy data to registers

		#pragma unroll
		for(int i = 0; i < REDUCE_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, /*t_ReductionElemOp*/(p_my_data[i]));
		// perform per-thread reductions

		p_data += REDUCE_TILE_SIZE;
		n_remainder -= REDUCE_TILE_SIZE; // may wrap around after the last tile, it is not used after that
		// will do another consecutive tile
	}

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = /*t_ReductionFinalOp*/(reduction);
	// store the tile reduction
}

#endif // BUILD_ELEMENT_OP_LESS_REDUCE

#ifdef BUILD_FINALIZING_REDUCE

#if 0 //def BUILD_ELEMENT_OP_LESS_REDUCE

// reduces p_data to p_reduce, in tiles of REDUCE_TILE_SIZE elements; the element operation is not applied, the final operation is applied to the result
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_NoElemOp_FinalOp(__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_start = g * REDUCE_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the beginning of this tile to the end of the data
	// (zero rather than an underflow if too many work-groups were launched)

	p_data += n_start;

	_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
		p_data, n_remainder, REDUCTION_IDENTITY);
	// copy data to registers

	_TyScalar partial = /*t_ReductionElemOp*/(p_my_data[0]); // the first element is taken as is, the same as the remaining ones
	#pragma unroll
	for(int i = 1; i < REDUCE_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, /*t_ReductionElemOp*/(p_my_data[i]));
	// perform per-thread reductions

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = t_ReductionFinalOp(reduction);
	// store the finalized tile reduction
}

// reduces p_data to p_reduce; each workgroup reduces a run of up to n_tiles_per_workgroup consecutive tiles of REDUCE_TILE_SIZE elements to a single value; the element operation is not applied, the final operation is applied to the result
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements of each tile
// a workgroup whose run begins at or beyond n_size (or which is given n_tiles_per_workgroup = 0) loads no data and only reduces REDUCTION_IDENTITY values
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_Multi_NoElemOp_FinalOp(
	__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size, unsigned int n_tiles_per_workgroup)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_span = REDUCE_TILE_SIZE * n_tiles_per_workgroup; // number of elements assigned to each workgroup
	const unsigned int n_start = g * n_span;

	p_data += n_start;

	unsigned int n_remainder = min(max(n_size, n_start), n_start + n_span) - n_start;
	// number of elements this workgroup really has; n_size is clamped from below so that
	// this is zero (rather than an underflow) if the run begins at or beyond n_size

	enum {
		b_pot_tiles = b_Is_POT(REDUCE_TILE_SIZE),
		n_log2_tile_size = n_Log2(REDUCE_TILE_SIZE)
	};
	if(b_pot_tiles) // for some reason this is needed, the compiler won't simplify the division by itself
		n_tiles_per_workgroup -= (n_span - n_remainder) >> n_log2_tile_size; // uses right shift for POT tile sizes
	else
		n_tiles_per_workgroup -= (n_span - n_remainder) / REDUCE_TILE_SIZE;
	// drop the tiles which are entirely past the end of the data; this leaves
	// ceil(n_remainder / REDUCE_TILE_SIZE) tiles, which may be zero

	_TyScalar partial = REDUCTION_IDENTITY;
	for(; n_tiles_per_workgroup != 0; -- n_tiles_per_workgroup) { // the test comes first so that zero tiles do not wrap the counter around
		_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
		GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
			p_data, n_remainder, REDUCTION_IDENTITY);
		// copy data to registers

		#pragma unroll
		for(int i = 0; i < REDUCE_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, /*t_ReductionElemOp*/(p_my_data[i]));
		// perform per-thread reductions

		p_data += REDUCE_TILE_SIZE;
		n_remainder -= REDUCE_TILE_SIZE; // may wrap around after the last tile, it is not used after that
		// will do another consecutive tile
	}

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = t_ReductionFinalOp(reduction);
	// store the finalized tile reduction
}

#endif // BUILD_ELEMENT_OP_LESS_REDUCE

// computes one reduction per tile of REDUCE_TILE_SIZE elements of p_data, applies the final operation to it and stores it in p_reduce (one element per workgroup)
// launched with workgroups of REDUCE_LOCAL_WORK_SIZE threads, one workgroup per tile; each thread handles REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements
// n_size is the number of elements of p_data; elements past the end are replaced by REDUCTION_IDENTITY
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_FinalOp(__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size)
{
	const unsigned int n_lid = get_local_id(0);
	const unsigned int n_grp = get_group_id(0);

	const unsigned int n_tile_begin = n_grp * REDUCE_TILE_SIZE;
	const unsigned int n_avail = (n_size > n_tile_begin)? n_size - n_tile_begin : 0;
	// number of elements between the beginning of this tile and the end of the data
	// (zero instead of an underflow if too many workgroups were launched)

	p_data += n_tile_begin;
	// move to the beginning of this tile

	_TyScalar p_regs[REDUCE_ITEMS_THREAD]; // only accessed in unrolled code, so that it can stay in registers
	GLOBAL_TO_REGISTER(p_regs, n_lid, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
		p_data, n_avail, REDUCTION_IDENTITY);
	// load the elements of this thread

	_TyScalar t_accum = t_ReductionElemOp(p_regs[0]);
	#pragma unroll
	for(int n_item = 1; n_item < REDUCE_ITEMS_THREAD; ++ n_item) {
		const _TyScalar t_elem = t_ReductionElemOp(p_regs[n_item]);
		t_accum = t_ReductionReduceOp(t_accum, t_elem);
	}
	// reduce the elements of this thread to a single value

	__local TLocalReduceStorage t_workspace;
	const _TyScalar t_tile_result = t_LocalReduce(n_lid, t_accum, &t_workspace);
	// reduce the values of all the threads in the workgroup

	if(n_lid == 0)
		p_reduce[n_grp] = t_ReductionFinalOp(t_tile_result);
	// the first thread finalizes and writes the result for this tile
}

// reduces p_data to p_reduce; each workgroup reduces a run of up to n_tiles_per_workgroup consecutive tiles of REDUCE_TILE_SIZE elements to a single value and applies the final operation to it
// this runs REDUCE_LOCAL_WORK_SIZE threads workgroups, each thread processes REDUCE_ITEMS_THREAD = REDUCE_TILE_SIZE / REDUCE_LOCAL_WORK_SIZE elements of each tile
// a workgroup whose run begins at or beyond n_size (or which is given n_tiles_per_workgroup = 0) loads no data and only reduces REDUCTION_IDENTITY values
__kernel REQUIRE_WG_SIZE(REDUCE_LOCAL_WORK_SIZE) void TileReduce_Multi_FinalOp(
	__global __write_only _TyScalar *__restrict p_reduce,
	/*const unsigned int n_reduce_off,*/ __global __read_only const _TyScalar *__restrict p_data,
	/*const unsigned int n_data_off,*/ const unsigned int n_size, unsigned int n_tiles_per_workgroup)
{
	/*p_reduce += n_reduce_off;
	p_data += n_data_off;
	// offset the pointers as needed (might screw up the accesses though)*/

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	p_reduce += g;

	const unsigned int n_span = REDUCE_TILE_SIZE * n_tiles_per_workgroup; // number of elements assigned to each workgroup
	const unsigned int n_start = g * n_span;

	p_data += n_start;

	unsigned int n_remainder = min(max(n_size, n_start), n_start + n_span) - n_start;
	// number of elements this workgroup really has; n_size is clamped from below so that
	// this is zero (rather than an underflow) if the run begins at or beyond n_size

	enum {
		b_pot_tiles = b_Is_POT(REDUCE_TILE_SIZE),
		n_log2_tile_size = n_Log2(REDUCE_TILE_SIZE)
	};
	if(b_pot_tiles) // for some reason this is needed, the compiler won't simplify the division by itself
		n_tiles_per_workgroup -= (n_span - n_remainder) >> n_log2_tile_size; // uses right shift for POT tile sizes
	else
		n_tiles_per_workgroup -= (n_span - n_remainder) / REDUCE_TILE_SIZE;
	// drop the tiles which are entirely past the end of the data; this leaves
	// ceil(n_remainder / REDUCE_TILE_SIZE) tiles, which may be zero

	_TyScalar partial = REDUCTION_IDENTITY;
	for(; n_tiles_per_workgroup != 0; -- n_tiles_per_workgroup) { // the test comes first so that zero tiles do not wrap the counter around
		_TyScalar p_my_data[REDUCE_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
		GLOBAL_TO_REGISTER(p_my_data, l, REDUCE_LOCAL_WORK_SIZE, REDUCE_ITEMS_THREAD,
			p_data, n_remainder, REDUCTION_IDENTITY);
		// copy data to registers

		#pragma unroll
		for(int i = 0; i < REDUCE_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform per-thread reductions

		p_data += REDUCE_TILE_SIZE;
		n_remainder -= REDUCE_TILE_SIZE; // may wrap around after the last tile, it is not used after that
		// will do another consecutive tile
	}

	__local TLocalReduceStorage reduce_temp;
	_TyScalar reduction = t_LocalReduce(l, partial, &reduce_temp);
	// perform the workgroup reduction

	if(!l)
		*p_reduce = t_ReductionFinalOp(reduction);
	// store the finalized tile reduction
}

#endif // BUILD_FINALIZING_REDUCE

// ---- scan ----

#if defined(NVIDIA) && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' /*||*/ \
	/*REDUCTION_REDUCE_OPERATOR == '*'*/) && SCAN_SCALAR_SIZE == 4 && !defined(DISABLE_NV_SHFL) && b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE) // conversion to exscan won't work with multiplication - todo

/**
 *	@brief local memory workspace for t_LocalExScan() (shuffle branch)
 */
typedef struct {
	_TyScalar p_workspace[SCAN_LOCAL_WORK_SIZE / WARP_SIZE + 1]; // the segment partials, followed by one more element for the total
} TLocalScanStorage;

/**
 *	@brief calculates workgroup-local exclusive scan using shfl_reduce() (shuffle branch)
 *
 *	The workgroup is split to n_segment_num segments of n_segment_size consecutive
 *	threads. Each segment is scanned using shfl_reduce(), the last lane of each segment
 *	stores the segment total to local memory, the first n_segment_num threads scan
 *	the segment totals and finally each thread combines its own scan with the scanned
 *	total of its segment and removes its own input to get the exclusive scan.
 *
 *	@param[in] li is local thread id
 *	@param[in] x is the value contributed by this thread
 *	@param[out] p_sum is pointer where the value stored in the last workspace element
 *		(the scan of all the segment totals) is written
 *	@param[in,out] p_storage is pointer to the local memory workspace
 *
 *	@return Returns the exclusive scan value for this thread.
 *
 *	@note A simpler mGPU ping pong algorithm; requires a bit more memory but seems to run faster.
 *	@note This contains barriers and must be called by all the threads of the workgroup.
 *	@note NOTE(review): there is no barrier after the last reads of the workspace; callers
 *		which reuse p_storage right away presumably need to synchronize themselves - confirm.
 */
inline _TyScalar t_LocalExScan(const int li, _TyScalar x, _TyScalar *p_sum, __local TLocalScanStorage *p_storage)
{
	__local _TyScalar *p_shared = p_storage->p_workspace;

	STATIC_ASSERT(b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE), NVIDIA_SHUFFLE_SCAN_ONLY_WORKS_WITH_POWER_OF_TWO_WORKGROUPS);

	enum {
		n_segment_num = WARP_SIZE,
		n_segment_size = REDUCE_LOCAL_WORK_SIZE / n_segment_num,
		n_segment_size_log2 = n_Log2(n_segment_size)
	};
	// want to reduce the tile to WARP_SIZE items and then do a single warp-cooperative scan
	// define warp_size segments that are nt / warp_size large
	// each warp makes log(segsize) shfl_reduce calls
	// the spine makes log(warp_size) shfl_reduce calls

#if b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
	const int n_lane = li & (n_segment_size - 1);
	const int n_segment = li >> n_segment_size_log2;
#else // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
	const int n_lane = li % n_segment_size;
	const int n_segment = li / n_segment_size;
#endif // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
	// index of this thread inside its segment and index of that segment

	_TyScalar scan = x;
	#pragma unroll
	for(int offset = 1; offset < n_segment_size; offset *= 2)
		scan = shfl_reduce(scan, offset, n_segment_size);
	// scan each segment using shfl_reduce()

	if(n_lane == n_segment_size - 1)
		p_shared[n_segment] = scan;
	// store the reduction (last element) of each segment into storage

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < n_segment_num) {
		_TyScalar y = p_shared[li];
		_TyScalar scan = y; // note that this shadows the per-thread scan declared above
		#pragma unroll
		for(int offset = 1; offset < n_segment_num; offset *= 2)
			scan = shfl_reduce(scan, offset, n_segment_num);
#if REDUCTION_REDUCE_OPERATOR == '+'
		p_shared[li] = scan - y;
#else
		p_shared[li] = scan + y;
#endif
		// take the own input back out, this leaves the exclusive scan of the segment totals
		if(n_segment_num - 1 == li)
			p_shared[n_segment_num] = scan;
	}
	// warp 0 does a full shfl warp scan on the partials, the total is
	// stored to shared[n_segment_num]

	barrier(CLK_LOCAL_MEM_FENCE);

#if REDUCTION_REDUCE_OPERATOR == '+'
	scan += p_shared[n_segment];
	scan -= x;
#else
	scan -= p_shared[n_segment];
	scan += x;
#endif
	if(REDUCTION_IDENTITY != (_TyScalar)0 && !li)
		scan = REDUCTION_IDENTITY; // the first thread gets the identity if the identity is not zero
	// add the scanned partials back in and convert to exclusive scan

	*p_sum = p_shared[n_segment_num];

	//barrier(CLK_LOCAL_MEM_FENCE);

	return scan;
}

#else // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' || \
//	REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL && b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

#ifdef USE_HARRIS_SCAN

#define WARP_COOP_SCAN_ELEMS_PER_THREAD_HINT 2
// requested number of warp-scan elements per thread of a single warp; subject to tuning,
// 2 seems to be quite efficient; this is only a hint and the effective value may be
// lower - do not use this constant in the code, use WARP_SCAN_SIZE / SCAN_ITEMS_WARPTHREAD instead!

#define WARP_SCAN_SIZE (((SCAN_LOCAL_WORK_SIZE) < ((WARP_SIZE) * (WARP_COOP_SCAN_ELEMS_PER_THREAD_HINT)))? \
	(WARP_SIZE) : ((WARP_SIZE) * (WARP_COOP_SCAN_ELEMS_PER_THREAD_HINT)))
// size of the warp-cooperative scan (number of elements in the spine); this is
// WARP_SIZE times the hint, or just WARP_SIZE if the workgroup is too small for that

//#undef WARP_COOP_SCAN_ELEMS_PER_THREAD_HINT // can't, it is referenced from inside WARP_SCAN_SIZE, can't seem to expand
// would be nice to make sure that the hint does not get used in the code below

#define SCAN_ITEMS_WARPTHREAD ((SCAN_LOCAL_WORK_SIZE) / (WARP_SCAN_SIZE))
// number of consecutive workgroup elements that are serially pre-scanned for each
// element of the warp scan (SCAN_LOCAL_WORK_SIZE = SCAN_ITEMS_WARPTHREAD * WARP_SCAN_SIZE)

STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE == SCAN_ITEMS_WARPTHREAD * WARP_SCAN_SIZE, WARP_SCAN_SIZE_MUST_DIVIDE_SCAN_LOCAL_WORK_SIZE);
// the division above must be exact, otherwise some of the elements would be left out

/**
 *	@brief local memory scratch space for the Harris variant of t_LocalExScan()
 */
typedef struct {
	_TyScalar p_warp_workspace[WARP_SCAN_SIZE + 1 + CONFLICT_FREE_OFFSET(WARP_SCAN_SIZE)]; // if WARP_SCAN_SIZE == LOCAL_MEMORY_BANK_NUM then this array could hide inside p_workspace (in the skipped entries) but this would then make the warp scan nontrivial
	// storage for the warp-cooperative scan of WARP_SCAN_SIZE elements; one more element
	// is needed for the total, which is written to index CONFLICT_FREE_INDEX(WARP_SCAN_SIZE)

	_TyScalar p_workspace[SCAN_LOCAL_WORK_SIZE + CONFLICT_FREE_OFFSET(SCAN_LOCAL_WORK_SIZE - 1)];
	// one element per workgroup thread, accessed through CONFLICT_FREE_INDEX();
	// the padding is there to avoid bank conflicts
} TLocalScanStorage; // sized for the *local* scan (the number of workgroup threads, e.g. 256) rather than the *tile* scan (e.g. 1024)

// todo - use the element / reduce / finalize operators in here (maybe drop the finalize operator? will need two kinds of kernels, one with and one without finalization)

// algorithm due to harris, works in tree-like fashion
// calculates scan of WARP_SCAN_SIZE elements which need to be stored in p_workspace, using CONFLICT_FREE_INDEX()
// leaves the sum inside p_workspace[CONFLICT_FREE_INDEX(n_scan_size)]
#define WARP_COOP_ExSCAN_TEMPLATE(_n_scan_size,_n_thread_num,T,p_workspace,li)										\
	do {																											\
		enum {																										\
			n_scan_size = (_n_scan_size),																			\
			n_thread_num = (_n_thread_num), /* make sure those are compile-time consts */							\
			n_item_per_thread_num = (n_scan_size >= 2 * n_thread_num)? n_scan_size / (2 * n_thread_num) : 1			\
			/* the extra threads will not be used here, they will just get their copy of the sum */					\
		};																											\
		/* number of pairs to swap/add per thread */																\
																													\
		STATIC_ASSERT(n_thread_num <= WARP_SIZE, WARP_SCAN_WITH_TOO_MANY_THREADS);									\
		STATIC_ASSERT(n_scan_size % n_thread_num == 0, HALF_SCAN_SIZE_MUST_DIVIDE_THREAD_COUNT);					\
																													\
		_Pragma("unroll")																							\
		for(int d = n_scan_size >> 1, n_step = 1; d > 0; d >>= 1) { /* surprisingly unrolls */						\
			write_mem_fence(CLK_LOCAL_MEM_FENCE);																	\
																													\
			if(li < d) {																							\
				int n_index0 = n_step * (2 * li + 1) - 1;															\
				int n_index1 = n_step * (2 * li + 2) - 1;															\
				p_workspace[CONFLICT_FREE_INDEX(n_index1)] =														\
					t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(n_index0)],									\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)]);													\
			}																										\
			if(n_item_per_thread_num > 1) {																			\
				int pi = li + n_thread_num;																			\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] =													\
						t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(n_index0)],								\
						p_workspace[CONFLICT_FREE_INDEX(n_index1)]);												\
				}																									\
			}																										\
			if(n_item_per_thread_num > 2) {																			\
				int pi = li + n_thread_num * 2;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] =													\
						t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(n_index0)],								\
						p_workspace[CONFLICT_FREE_INDEX(n_index1)]);												\
				}																									\
			}																										\
			if(n_item_per_thread_num > 3) {																			\
				int pi = li + n_thread_num * 3;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] =													\
						t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(n_index0)],								\
						p_workspace[CONFLICT_FREE_INDEX(n_index1)]);												\
				}																									\
			}																										\
			if(n_item_per_thread_num > 4) {																			\
				_Pragma("unroll")																					\
				for(int i = 4; i < n_item_per_thread_num; ++ i) { /* this sadly does not unroll very well */		\
					int pi = li + n_thread_num * i;																	\
					if(pi < d) {																					\
						int n_index0 = n_step * (2 * pi + 1) - 1;													\
						int n_index1 = n_step * (2 * pi + 2) - 1;													\
						p_workspace[CONFLICT_FREE_INDEX(n_index1)] =												\
							t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(n_index0)],							\
							p_workspace[CONFLICT_FREE_INDEX(n_index1)]);											\
					}																								\
				}																									\
			}																										\
			n_step += n_step;																						\
		}																											\
		/* build sum in place up the tree   */																		\
																													\
		write_mem_fence(CLK_LOCAL_MEM_FENCE); /* required on K40 */													\
		p_workspace[CONFLICT_FREE_INDEX(n_scan_size)] = p_workspace[CONFLICT_FREE_INDEX(n_scan_size - 1)];			\
		read_mem_fence(CLK_LOCAL_MEM_FENCE); /* required on K40 */													\
		/* copy the sum to the last element; keepint it here makes the divergent branch below shorter */			\
																													\
		if(!li)																										\
			p_workspace[CONFLICT_FREE_INDEX(n_scan_size - 1)] = REDUCTION_IDENTITY;									\
		/* clear the last element // note that it will contain the sum of elements */								\
																													\
		_Pragma("unroll")																							\
		for(int d = 1, n_step = n_scan_size; d < n_scan_size; d += d) { /* surprisingly unrolls */					\
			n_step >>= 1;																							\
			write_mem_fence(CLK_LOCAL_MEM_FENCE);																	\
																													\
			if(li < d) {																							\
				int n_index0 = n_step * (2 * li + 1) - 1;															\
				int n_index1 = n_step * (2 * li + 2) - 1;															\
				n_index0 = CONFLICT_FREE_INDEX(n_index0);															\
				n_index1 = CONFLICT_FREE_INDEX(n_index1);															\
				T n_temp = p_workspace[n_index0];																	\
				p_workspace[n_index1] = t_ReductionReduceOp(														\
					p_workspace[n_index0] = p_workspace[n_index1], n_temp);											\
			}																										\
			if(n_item_per_thread_num > 1) {																			\
				int pi = li + n_thread_num;																			\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index1] = t_ReductionReduceOp(													\
						p_workspace[n_index0] = p_workspace[n_index1], n_temp);										\
				}																									\
			}																										\
			if(n_item_per_thread_num > 2) {																			\
				int pi = li + n_thread_num * 2;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index1] = t_ReductionReduceOp(													\
						p_workspace[n_index0] = p_workspace[n_index1], n_temp);										\
				}																									\
			}																										\
			if(n_item_per_thread_num > 3) {																			\
				int pi = li + n_thread_num * 3;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index1] = t_ReductionReduceOp(													\
						p_workspace[n_index0] = p_workspace[n_index1], n_temp);										\
				}																									\
			}																										\
			if(n_item_per_thread_num > 4) {																			\
				_Pragma("unroll")																					\
				for(int i = 4; i < n_item_per_thread_num; ++ i) { /* this sadly does not unroll very well */		\
					int pi = li + n_thread_num * i;																	\
					if(pi < d) {																					\
						int n_index0 = n_step * (2 * pi + 1) - 1;													\
						int n_index1 = n_step * (2 * pi + 2) - 1;													\
						n_index0 = CONFLICT_FREE_INDEX(n_index0);													\
						n_index1 = CONFLICT_FREE_INDEX(n_index1);													\
						T n_temp = p_workspace[n_index0];															\
						p_workspace[n_index1] = t_ReductionReduceOp(												\
							p_workspace[n_index0] = p_workspace[n_index1], n_temp);									\
					}																								\
				}																									\
			}																										\
		}																											\
		/* traverse down the tree, build scan */																	\
	} while(0)

/**
 *	@brief calculates workgroup-wide exclusive scan using only t_ReductionReduceOp()
 *
 *	Uses the tree-like algorithm of Harris: the values are stored to local memory,
 *	the first WARP_SCAN_SIZE threads each serially scan SCAN_ITEMS_WARPTHREAD
 *	consecutive values, the totals of those runs are scanned using
 *	WARP_COOP_ExSCAN_TEMPLATE and finally combined with the serial scans.
 *
 *	@param[in] li is local thread id (0 to SCAN_LOCAL_WORK_SIZE - 1)
 *	@param[in] x is the value contributed by this thread
 *	@param[out] p_sum is filled with the total of all the values in the workgroup (all threads get it)
 *	@param[in,out] p_storage is local memory scratch space
 *
 *	@return Returns the exclusive scan of x for this thread.
 *
 *	@note This must be called by all the threads in the workgroup (contains barriers).
 */
inline _TyScalar t_LocalExScan(const int li, _TyScalar x, _TyScalar *p_sum, __local TLocalScanStorage *p_storage)
{
	__local _TyScalar *p_warp_workspace = p_storage->p_warp_workspace;
	// only enough storage to calculate the warp-sized scan (plus the total)

	__local _TyScalar *p_workspace = p_storage->p_workspace;
	// one (padded) element per thread

	p_workspace[CONFLICT_FREE_INDEX(li)] = x;
	// copy the data from the array to local memory

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < WARP_SCAN_SIZE) {
		_TyScalar n_local_sum = REDUCTION_IDENTITY;
		// must start with the identity of the reduction operator rather than with
		// zero, otherwise this only works for operators where the identity is zero

		#pragma unroll
		for(int j = 0, n_src = li * SCAN_ITEMS_WARPTHREAD; j < SCAN_ITEMS_WARPTHREAD; ++ j, ++ n_src) {
			const _TyScalar n_add = p_workspace[n_src + CONFLICT_FREE_OFFSET(n_src)];
			p_workspace[n_src + CONFLICT_FREE_OFFSET(n_src)] = n_local_sum;
			n_local_sum = t_ReductionReduceOp(n_local_sum, n_add);
		}
		// calculate thread-local exclusive scan (e.g. of 8 elements), this saves greatly on synchronization

		p_warp_workspace[CONFLICT_FREE_INDEX(li)] = n_local_sum;
		// the total of the run goes to the spine
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < WARP_SIZE) // could put this inside of Warp_ExScan_SingleTile() and then all the threads would have sum; or make n_sum __local
		WARP_COOP_ExSCAN_TEMPLATE(WARP_SCAN_SIZE, WARP_SIZE, _TyScalar, p_warp_workspace, li); // this actually only uses 16 threads to reduce 32 elems or 32 threads to reduce 64 elems
	// perform spine scan using a single warp

	barrier(CLK_LOCAL_MEM_FENCE);

	*p_sum = p_warp_workspace[CONFLICT_FREE_INDEX(WARP_SCAN_SIZE)];
	// all threads must get the sum (not only the first warp)!

	return t_ReductionReduceOp(p_workspace[CONFLICT_FREE_INDEX(li)],
		p_warp_workspace[CONFLICT_FREE_INDEX(li / SCAN_ITEMS_WARPTHREAD)]);
	// add spine scan to the thread scan
	// no bank conflicts, resolved using broadcast
	// NOTE(review): the operands are in (thread scan, spine scan) order, which only
	// matters for non-commutative operators - confirm whether those need to be supported
}

#else // USE_HARRIS_SCAN

/**
 *	@brief local memory scratch space for the ping-pong variant of t_LocalExScan()
 */
typedef struct {
	_TyScalar p_workspace[2 * SCAN_LOCAL_WORK_SIZE + 1]; // two buffers of SCAN_LOCAL_WORK_SIZE elements each, which t_LocalExScan() alternates between (NOTE(review): the last element does not seem to be accessed by it - confirm)
} TLocalScanStorage;

/**
 *	@brief calculates workgroup-wide exclusive scan using only t_ReductionReduceOp()
 *
 *	A simpler mGPU ping-pong algorithm; requires a bit more memory than the
 *	Harris' scan but seems to run slightly faster. In each step, every thread
 *	combines its value with the value of the thread which is a power-of-two
 *	distance to the left and writes the result to the other half of the workspace.
 *
 *	@param[in] li is local thread id (0 to SCAN_LOCAL_WORK_SIZE - 1)
 *	@param[in] x is the value contributed by this thread
 *	@param[out] p_sum is filled with the total of all the values in the workgroup (all threads get it)
 *	@param[in,out] p_storage is local memory scratch space
 *
 *	@return Returns the exclusive scan of x for this thread.
 *
 *	@note This must be called by all the threads in the workgroup (contains barriers).
 */
inline _TyScalar t_LocalExScan(const int li, _TyScalar x, _TyScalar *p_sum, __local TLocalScanStorage *p_storage)
{
	__local _TyScalar *p_buffer = p_storage->p_workspace;

	p_buffer[li] = x;
	// publish the inputs in the first half of the workspace

	barrier(CLK_LOCAL_MEM_FENCE);

	int n_read_base = 0; // offset of the half which holds the results of the previous step
	#pragma unroll
	for(int n_stride = 1; n_stride < SCAN_LOCAL_WORK_SIZE; n_stride += n_stride) {
		if(li >= n_stride)
			x = t_ReductionReduceOp(p_buffer[n_read_base + li - n_stride], x);
		// combine with the partial result n_stride elements to the left

		const int n_write_base = SCAN_LOCAL_WORK_SIZE - n_read_base;
		p_buffer[n_write_base + li] = x;
		n_read_base = n_write_base;
		// write to the other half so that no thread overwrites data others still read

		barrier(CLK_LOCAL_MEM_FENCE);
	}
	// note that this is not work-efficient but performs the reduction in fewer steps than Harris' scan

	*p_sum = p_buffer[n_read_base + SCAN_LOCAL_WORK_SIZE - 1];
	// the last element of the inclusive scan is the total

	_TyScalar t_result;
	if(li)
		t_result = p_buffer[n_read_base + li - 1];
	else
		t_result = REDUCTION_IDENTITY;
	// the above calculated inclusive scan; shift right and put identity in the first element

	//barrier(CLK_LOCAL_MEM_FENCE);

	return t_result;
}

#endif // USE_HARRIS_SCAN

#endif // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' ||
	   // REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL &&
	   // b_Is_POT(SCAN_LOCAL_WORK_SIZE / WARP_SIZE)

#if !(SCAN_ITEMS_THREAD & 1) // even SCAN_ITEMS_THREAD, must be 2, 4, ...
#define USE_CFI
#define CFI_FLAG 1
// with an even number of items per thread, pass 1 as the first argument of the ordered load / store
// macros (NOTE(review): presumably this enables conflict-free indexing of the staging buffer - confirm);
// for some reason, without CFI the scan crashes on K40, almost as if it couldn't handle the bank conflicts
#else // !(SCAN_ITEMS_THREAD & 1)
#define CFI_FLAG 0
// odd number of items per thread, the flag is disabled
#endif // !(SCAN_ITEMS_THREAD & 1)

#ifdef BUILD_SCAN

/**
 *	@brief calculates exclusive scan of p_data to p_scan, independently in each tile of SCAN_TILE_SIZE elements
 *
 *	This runs workgroups of SCAN_LOCAL_WORK_SIZE threads, each thread
 *	processes SCAN_ITEMS_THREAD = SCAN_TILE_SIZE / SCAN_LOCAL_WORK_SIZE elements.
 *	The element operator is applied to the inputs, the finalization operator is not applied.
 *
 *	@param[out] p_scan is the output array (the same layout as p_data)
 *	@param[in] p_data is the input array
 *	@param[in] n_size is the number of elements in p_data
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileExScan(__global __write_only _TyScalar *__restrict p_scan,
	__global __read_only const _TyScalar *__restrict p_data, const unsigned int n_size)
{
	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements between the start of this tile and the end of the data; max() avoids
	// unsigned underflow if too many workgroups were launched; note that this is not clamped
	// to SCAN_TILE_SIZE (NOTE(review): the load / store macros below are assumed to handle that)

	p_scan += n_start;
	p_data += n_start;
	// offset the pointers to the beginning of the tile

	__local union {
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalScanStorage scan_temp;
	} temp;
	// the load / store staging buffer and the scan workspace share the same local memory

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// copy data to registers

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i)
		p_my_data[i] = t_ReductionElemOp(p_my_data[i]);
	// apply the element operator once, so that the per-thread reduction and the
	// per-thread scan below work with the same values (the scan used the raw ones)

	_TyScalar partial = p_my_data[0];
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, p_my_data[i]);
	// perform per-thread reductions

	if(SCAN_ITEMS_THREAD > 1) // compile-time const
		barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar sum; // tile total, not needed here
	_TyScalar scan = t_LocalExScan(l, partial, &sum, &temp.scan_temp);
	// perform the workgroup scan

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		const _TyScalar elem = p_my_data[i];
		p_my_data[i] = scan; // exclusive: store before accumulating
		scan = t_ReductionReduceOp(scan, elem);
	}
	// perform per-thread scan, seeded by the workgroup scan

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_data, temp.p_shared_data);
	// write out scan to memory
}

/**
 *	@brief calculates inclusive scan of p_data to p_scan, independently in each tile of SCAN_TILE_SIZE elements
 *
 *	This runs workgroups of SCAN_LOCAL_WORK_SIZE threads, each thread
 *	processes SCAN_ITEMS_THREAD = SCAN_TILE_SIZE / SCAN_LOCAL_WORK_SIZE elements.
 *	The element operator is applied to the inputs, the finalization operator is not applied.
 *
 *	@param[out] p_scan is the output array (the same layout as p_data)
 *	@param[in] p_data is the input array
 *	@param[in] n_size is the number of elements in p_data
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileInScan(__global __write_only _TyScalar *__restrict p_scan,
	__global __read_only const _TyScalar *__restrict p_data, const unsigned int n_size)
{
	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements between the start of this tile and the end of the data; max() avoids
	// unsigned underflow if too many workgroups were launched; note that this is not clamped
	// to SCAN_TILE_SIZE (NOTE(review): the load / store macros below are assumed to handle that)

	p_scan += n_start;
	p_data += n_start;
	// offset the pointers to the beginning of the tile

	__local union {
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalScanStorage scan_temp;
	} temp;
	// the load / store staging buffer and the scan workspace share the same local memory

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// copy data to registers

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i)
		p_my_data[i] = t_ReductionElemOp(p_my_data[i]);
	// apply the element operator once, so that the per-thread reduction and the
	// per-thread scan below work with the same values (the scan used the raw ones)

	_TyScalar partial = p_my_data[0];
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, p_my_data[i]);
	// perform per-thread reductions

	if(SCAN_ITEMS_THREAD > 1) // compile-time const
		barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar sum; // tile total, not needed here
	_TyScalar scan = t_LocalExScan(l, partial, &sum, &temp.scan_temp);
	// perform the workgroup scan

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		scan = t_ReductionReduceOp(scan, p_my_data[i]);
		p_my_data[i] = scan; // inclusive: store after accumulating
	}
	// perform per-thread scan, seeded by the workgroup scan

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_data, temp.p_shared_data);
	// write out scan to memory
}

/**
 *	@brief calculates exclusive scan of p_data to p_scan in tiles of SCAN_TILE_SIZE
 *		elements, offset by the per-tile values in p_spine
 *
 *	This runs workgroups of SCAN_LOCAL_WORK_SIZE threads, each thread
 *	processes SCAN_ITEMS_THREAD = SCAN_TILE_SIZE / SCAN_LOCAL_WORK_SIZE elements.
 *	The element operator is applied to the inputs and the finalization
 *	operator is applied to the outputs.
 *
 *	@param[out] p_scan is the output array (the same layout as p_data)
 *	@param[in] p_spine is array of per-tile offsets; element g is combined with all
 *		the results in tile g (NOTE(review): presumably the exclusive scan of tile totals - confirm against the caller)
 *	@param[in] p_data is the input array
 *	@param[in] n_size is the number of elements in p_data
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileExScan_Downsweep(__global __write_only _TyScalar *__restrict p_scan,
	__global __read_only const _TyScalar *__restrict p_spine,
	__global __read_only const _TyScalar *__restrict p_data, const unsigned int n_size)
{
	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements between the start of this tile and the end of the data; max() avoids
	// unsigned underflow if too many workgroups were launched; note that this is not clamped
	// to SCAN_TILE_SIZE (NOTE(review): the load / store macros below are assumed to handle that)

	p_scan += n_start;
	p_data += n_start;
	// offset the pointers to the beginning of the tile

	__local union {
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalScanStorage scan_temp;
	} temp;
	// the load / store staging buffer and the scan workspace share the same local memory

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// copy data to registers

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i)
		p_my_data[i] = t_ReductionElemOp(p_my_data[i]);
	// apply the element operator once, so that the per-thread reduction and the
	// per-thread scan below work with the same values (the scan used the raw ones)

	_TyScalar partial = p_my_data[0];
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, p_my_data[i]);
	// perform per-thread reductions

	if(SCAN_ITEMS_THREAD > 1) // compile-time const
		barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar sum; // tile total, not needed here
	_TyScalar scan = t_LocalExScan(l, partial, &sum, &temp.scan_temp);
	// perform the workgroup scan

	scan = t_ReductionReduceOp(scan, p_spine[g]);
	// all threads, broadcast, save a branch

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		const _TyScalar elem = p_my_data[i];
		p_my_data[i] = t_ReductionFinalOp(scan); // exclusive: store before accumulating
		scan = t_ReductionReduceOp(scan, elem);
	}
	// perform per-thread scan, seeded by the workgroup scan and the spine

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_data, temp.p_shared_data);
	// write out scan to memory
}

/**
 *	@brief calculates inclusive scan of p_data to p_scan in tiles of SCAN_TILE_SIZE
 *		elements, offset by the per-tile values in p_spine
 *
 *	This runs workgroups of SCAN_LOCAL_WORK_SIZE threads, each thread
 *	processes SCAN_ITEMS_THREAD = SCAN_TILE_SIZE / SCAN_LOCAL_WORK_SIZE elements.
 *	The element operator is applied to the inputs and the finalization
 *	operator is applied to the outputs.
 *
 *	@param[out] p_scan is the output array (the same layout as p_data)
 *	@param[in] p_spine is array of per-tile offsets; element g is combined with all
 *		the results in tile g (NOTE(review): presumably the exclusive scan of tile totals - confirm against the caller)
 *	@param[in] p_data is the input array
 *	@param[in] n_size is the number of elements in p_data
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileInScan_Downsweep(__global __write_only _TyScalar *__restrict p_scan,
	__global __read_only const _TyScalar *__restrict p_spine,
	__global __read_only const _TyScalar *__restrict p_data, const unsigned int n_size)
{
	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements between the start of this tile and the end of the data; max() avoids
	// unsigned underflow if too many workgroups were launched; note that this is not clamped
	// to SCAN_TILE_SIZE (NOTE(review): the load / store macros below are assumed to handle that)

	p_scan += n_start;
	p_data += n_start;
	// offset the pointers to the beginning of the tile

	__local union {
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalScanStorage scan_temp;
	} temp;
	// the load / store staging buffer and the scan workspace share the same local memory

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// copy data to registers

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i)
		p_my_data[i] = t_ReductionElemOp(p_my_data[i]);
	// apply the element operator once, so that the per-thread reduction and the
	// per-thread scan below work with the same values (the scan used the raw ones)

	_TyScalar partial = p_my_data[0];
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
		partial = t_ReductionReduceOp(partial, p_my_data[i]);
	// perform per-thread reductions

	if(SCAN_ITEMS_THREAD > 1) // compile-time const
		barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar sum; // tile total, not needed here
	_TyScalar scan = t_LocalExScan(l, partial, &sum, &temp.scan_temp);
	// perform the workgroup scan

	scan = t_ReductionReduceOp(scan, p_spine[g]);
	// all threads, broadcast, save a branch

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		scan = t_ReductionReduceOp(scan, p_my_data[i]);
		p_my_data[i] = t_ReductionFinalOp(scan); // inclusive: store after accumulating
	}
	// perform per-thread scan, seeded by the workgroup scan and the spine

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_data, temp.p_shared_data);
	// write out scan to memory
}

#endif // BUILD_SCAN

/*
 *	end-of-file
 */
