/*
								+--------------------------------+
								|                                |
								| *** OpenCL scan kernels v2 *** |
								|                                |
								|  Copyright  -tHE SWINe- 2014  |
								|                                |
								|        ScanKernelsv2.h         |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_SCAN_KERNELS_v2_INCLUDED
#define __OPENCL_SCAN_KERNELS_v2_INCLUDED

/**
 *	@file ScanKernelsv2.h
 *	@author -tHE SWINe-
 *	@brief OpenCL scan kernels class
 *	@date 2014
 *
 *	@date 2014-11-24
 *
 *	Added inclusive scan routines, exposed blocked scan interface.
 *
 *	@date 2015-12-02
 *
 *	Fixed inclusive scan synchronization bug that occurred on small GPUs
 *	where the whole block was unable to run simultaneously, due to faster
 *	threads overwriting values of the slower threads in the skewed store
 *	to global memory.
 *
 */

#include "../NewFix.h"
#include "../CallStack.h"
#include "ClUtils.h"
#include "../Integer.h"
#include "../Timer.h"

#include "kernels/ScanKernelsv2Src.h"

/**
 *	@brief OpenCL scan primitive implementation
 */
class CCLScanKernels {
public:
	/**
	 *	@brief compile-time limits of the hierarchical (blocked) scan
	 */
	enum {
		max_ReductionLevel_Num = 4, /**< @brief number of reduction levels @note For blocks of 1024, reducing 3x covers 1024^3 items, amounting to 4GB of memory with 32-bit integers or floats. */
		max_ExtraReduction_Num = max_ReductionLevel_Num - 1 /**< @brief number of extra reductions, which need temporary storage */
	};

protected:
	bool m_b_status; /**< @brief flag whether the programs were compiled */

	const size_t m_n_block_size; /**< @brief size of a local scan block, in elements (set by the constructor, passed to the kernels as SCAN_BLOCK_SIZE) */
	const size_t m_n_thread_num; /**< @brief number of threads working on a single block (set by the constructor, passed to the kernels as SCAN_LOCAL_WORK_SIZE) */
	int m_n_scalar_type_size; /**< @brief size of the scanned data type, in bytes (see Set_DataTypes()) */
	const char *m_p_s_scalar_type; /**< @brief name of the scanned data type, passed to the kernels as SCAN_SCALAR_TYPE (the pointer is stored, the string is not copied) */
	size_t m_p_block_sum_buffer_size[max_ExtraReduction_Num]; /**< @brief sizes of the temporary block sum buffers, zero means not allocated @note NOTE(review): n_TempBuffers_Size() documents these as bytes while InclusiveScan() passes them as element counts - confirm against the allocation code. */
	CCLUniqueMem m_dp_block_sums[max_ExtraReduction_Num]; /**< @brief temporary buffers holding block sums, one per extra reduction level */

	size_t m_n_SM_num; /**< @brief number of streaming multiprocessors on the device */
	size_t m_n_local_memory_size; /**< @brief amount of shared memory on the device, in bytes (read from the sharedMemPerBlock device property in Compile()) */

	CCLUniqueProgram m_program; /**< @brief the compiled OpenCL program with all the scan kernels */
	CCLUniqueKernel m_ex_scan_kernel; /**< @brief exclusive scan kernel ("LocalScan_v0") */
	CCLUniqueKernel m_ex_scan_noloop_kernel; /**< @brief exclusive scan kernel ("LocalScan_NoLoop_v0") */
	CCLUniqueKernel m_ex_scan_single_kernel; /**< @brief exclusive scan kernel launched as a single workgroup ("LocalScan_Single_v0") */
	CCLUniqueKernel m_ex_scan_sums_kernel; /**< @brief exclusive scan kernel that also writes block sums ("LocalScan_Sums_v0") */
	CCLUniqueKernel m_ex_scan_sums_noloop_kernel; /**< @brief exclusive scan kernel that also writes block sums ("LocalScan_Sums_NoLoop_v0") */
	//CCLUniqueKernel m_ex_scan_sums_single_kernel; // unused
	CCLUniqueKernel m_in_scan_kernel; /**< @brief inclusive scan kernel ("LocalInScan_v0") */
	CCLUniqueKernel m_in_scan_noloop_kernel; /**< @brief inclusive scan kernel ("LocalInScan_NoLoop_v0") */
	CCLUniqueKernel m_in_scan_single_kernel; /**< @brief inclusive scan kernel launched as a single workgroup ("LocalInScan_Single_v0") */
	CCLUniqueKernel m_in_scan_sums_kernel; /**< @brief inclusive scan kernel that also writes block sums ("LocalInScan_Sums_v0") */
	CCLUniqueKernel m_in_scan_sums_noloop_kernel; /**< @brief inclusive scan kernel that also writes block sums ("LocalInScan_Sums_NoLoop_v0") */
	CCLUniqueKernel m_offset_kernel; /**< @brief kernel adding scanned block sums to the blocks ("GlobalScan_Offset_v3") */
	CCLUniqueKernel m_offset_single_kernel; /**< @brief block offset kernel launched as a single workgroup ("GlobalScan_Offset_Single_v2") */

public:
	/**
	 *	@brief default constructor; specifies block size and number of threads
	 *
	 *	@param[in] n_block_size is size of local scan blocks, in elements (must be power
	 *		of two and not more than four times maximum number of threads per block)
	 *	@param[in] n_local_work_size is number of threads, working on a single block
	 *		(must divide n_block_size and must be no higher than half n_block_size)
	 */
// with v0 "old" block reduce:
	//CCLScanKernels(size_t n_block_size = 1024, size_t n_local_work_size = 256) // 680 peaks at 99 GB/s
//	CCLScanKernels(size_t n_block_size = 512, size_t n_local_work_size = 128) // 680 peaks at 106 GB/s, K40 at 116 GB/s
	//CCLScanKernels(size_t n_block_size = 1024, size_t n_local_work_size = 128) // 680 peaks at 75 GB/s
	//CCLScanKernels(size_t n_block_size = 256, size_t n_local_work_size = 64) // crash
// with v1 serial / warp-64 block reduce:
//	CCLScanKernels(size_t n_block_size = 1024, size_t n_local_work_size = 128) // 680 peaks at 135 GB/s, K40 at 185 GB/s, 780 at 190 GB/s
// with v2 warp hierarchy block reduce:
	CCLScanKernels(size_t n_block_size = 1024, size_t n_local_work_size = 128)
		:m_b_status(false),
		m_n_block_size(n_block_size),
		m_n_thread_num(n_local_work_size),
		m_n_scalar_type_size(4),
		m_p_s_scalar_type("int"),
		m_n_SM_num(0),
		m_n_local_memory_size(0)
	{
		// original tuning note for the defaults:
		// 1024 / 64 = 16 items / small block, 512 / 64 = 8 threads cooperating on each

		_ASSERTE(n_block_size > 0 && b_Is_POT(n_block_size));
		// block size must be power of two

		_ASSERTE(n_local_work_size > 0 && !(n_block_size % n_local_work_size));
		// number of threads must divide block size

		_ASSERTE(2 <= n_block_size / n_local_work_size);
		// the reduction scheme permits at most 1 thread per every 2 block elements

		size_t *p_size = m_p_block_sum_buffer_size;
		for(size_t *p_end = p_size + max_ExtraReduction_Num; p_size != p_end; ++ p_size)
			*p_size = 0;
		// no temporary buffers are allocated yet
	}

	/**
	 *	@brief gets the size of a local scan block
	 *	@return Returns the block size this object was constructed with, in elements.
	 */
	inline size_t n_Block_Size() const
	{
		const size_t n_elems_per_block = m_n_block_size;
		return n_elems_per_block;
	}

	/**
	 *	@brief gets the total size of the temporary block sum buffers
	 *
	 *	@return Returns the sum of the recorded sizes of all the temporary buffers
	 *		(zero if none are allocated); originally documented as being in bytes.
	 *
	 *	@note The size of temporary buffers is bounded by the maximal permitted array.
	 *		For a 4 GB array, buffers will use at most <tt>4 / 1024 + 4 / 1024^2 +
	 *		4 / 1024^3 + ... = 4 MB</tt>, regardless of data type used.
	 */
	inline size_t n_TempBuffers_Size() const
	{
		size_t n_total = 0;
		const size_t *p_size = m_p_block_sum_buffer_size;
		const size_t *const p_end = p_size + max_ExtraReduction_Num;
		while(p_size != p_end) {
			n_total += *p_size;
			++ p_size;
		}
		// accumulate over all the extra reduction levels

		return n_total;
	}

	/**
	 *	@brief releases the temporary block sum buffers, if any are allocated
	 *	@note The buffers may still be in use by the device unless all the
	 *		scheduled kernels have finished.
	 */
	void Free_TempBuffers()
	{
		int n_level = 0;
		while(n_level < max_ExtraReduction_Num) {
			const int n_cur = n_level ++;
			if(!m_p_block_sum_buffer_size[n_cur])
				continue;
			// nothing recorded for this level

			m_p_block_sum_buffer_size[n_cur] = 0;
			m_dp_block_sums[n_cur] = 0;
			// mark as not allocated and drop the buffer handle
		}
	}

	/**
	 *	@brief configures the scanned data type (must be called before Compile())
	 *
	 *	@param[in] p_s_data_type is data type name (must not contain spaces; use
	 *		e.g. "TWOWORD(unsigned,int)" rather than "unsigned int")
	 *	@param[in] n_size_of_data_type is size of the given data type, in bytes
	 *
	 *	@note Only the pointer to the type name is stored, the string is not copied;
	 *		it needs to remain valid until Compile() reads it.
	 */
	void Set_DataTypes(const char *p_s_data_type = "int", int n_size_of_data_type = 4)
	{
		_ASSERTE(p_s_data_type != NULL && strchr(p_s_data_type, ' ') == NULL);
		// no spaces; can use e.g. "TWOWORD(unsigned,int)"

		m_p_s_scalar_type = p_s_data_type;
		m_n_scalar_type_size = n_size_of_data_type;
		// just remember the settings, they are applied in Compile()
	}

	/*void Configure_ScanOp(const char *UNUSED(p_s_read_op) = "(x)")
	{
		// need to read keys from a second type (a different scan operation), but will write them always as an int, i think
		// could configure scan operator; could be any of + - * / (just the operator itself, no need to )

		// not sure what to configure at the moment
		// need to read and write keys from the data type
	}*/

	/**
	 *	@brief compiles the kernels with the current settings
	 *
	 *	Builds the program with the configured block size, thread count and scalar
	 *	type passed as preprocessor definitions, reads the device parameters, looks
	 *	up all the scan kernels and finally checks that the configuration fits the
	 *	device (shared memory size and number of threads per block).
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_verbose is verbosity flag (set to enable verbose)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Once the kernels are compiled, this function has no effect and
	 *		always returns true.
	 *	@note On failure the object stays in the "not compiled" state, although the
	 *		program and some of the kernels may already have been assigned.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device, bool b_verbose = false)
	{
		if(m_b_status)
			return true;
		// already compiled

		_ASSERTE(b_Is_POT(m_n_block_size)); // must be a power of two

		std::string s_preprocessor;
		if(!stl_ut::Format(s_preprocessor,
		   "-D SCAN_LOCAL_WORK_SIZE=" PRIsize " "
		   "-D SCAN_BLOCK_SIZE=" PRIsize " "
		   "-D SCAN_SCALAR_TYPE=%s ",
		   m_n_thread_num,
		   m_n_block_size,
		   m_p_s_scalar_type))
			return false;
		// build the preprocessor definitions for the kernels

		if(b_verbose)
			printf("loading \'%s\' ... ", "compressed kernels");
		m_program = CCLUniqueProgram(h_context, CScanKernels_Src(),
			CCLUniqueProgram::from_compressed, s_preprocessor.c_str(), "%temp_default%ScanKernels");
		CLresult n_result = m_program.n_Status();
		if(b_verbose)
			m_program.Dump_StatusWord(); // see the result
		if(n_result != CL_SUCCESS) {
			if(b_verbose)
				fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		// compile program

		CCLDeviceParams device_params(h_device);
		m_n_SM_num = device_params.n_Multiprocessor_Num();
		m_n_local_memory_size = device_params.t_Properties().sharedMemPerBlock;
		// get device params

		{
			struct TKernelSlot {
				CCLUniqueKernel CCLScanKernels::*p_kernel; // member receiving the kernel
				const char *p_s_name; // name of the kernel function in the program
			};
			const TKernelSlot p_kernel_list[] = {
				{&CCLScanKernels::m_ex_scan_kernel, "LocalScan_v0"},
				{&CCLScanKernels::m_ex_scan_noloop_kernel, "LocalScan_NoLoop_v0"},
				{&CCLScanKernels::m_ex_scan_single_kernel, "LocalScan_Single_v0"},
				{&CCLScanKernels::m_ex_scan_sums_kernel, "LocalScan_Sums_v0"},
				{&CCLScanKernels::m_ex_scan_sums_noloop_kernel, "LocalScan_Sums_NoLoop_v0"},
				// "LocalScan_Sums_Single_v0" is not loaded (useless kernel at the moment)
				{&CCLScanKernels::m_in_scan_kernel, "LocalInScan_v0"},
				{&CCLScanKernels::m_in_scan_noloop_kernel, "LocalInScan_NoLoop_v0"},
				{&CCLScanKernels::m_in_scan_single_kernel, "LocalInScan_Single_v0"},
				{&CCLScanKernels::m_in_scan_sums_kernel, "LocalInScan_Sums_v0"},
				{&CCLScanKernels::m_in_scan_sums_noloop_kernel, "LocalInScan_Sums_NoLoop_v0"},
				{&CCLScanKernels::m_offset_kernel, "GlobalScan_Offset_v3"},
				{&CCLScanKernels::m_offset_single_kernel, "GlobalScan_Offset_Single_v2"}
			};
			const size_t n_kernel_num = sizeof(p_kernel_list) / sizeof(p_kernel_list[0]);
			// pointers to members are used so that the table does not depend
			// on how the kernel wrapper class handles address-of

			for(size_t i = 0; i < n_kernel_num; ++ i) {
				const TKernelSlot &r_slot = p_kernel_list[i];
				(this->*(r_slot.p_kernel)) = m_program.h_Get_Kernel(r_slot.p_s_name, n_result);
				if(n_result != CL_SUCCESS) {
					if(b_verbose)
						fprintf(stderr, "error: failed to create OpenCL kernel (%d)\n", n_result);
					return false;
				}
			}
		}
		// get OpenCL kernel(s)

		if(m_n_local_memory_size < size_t(m_n_block_size * m_n_scalar_type_size)) {
			if(b_verbose) {
				fprintf(stderr, "error: block size set too high (" PRIsize
					"B), will not fit in shared memory\n",
					(m_n_block_size * m_n_scalar_type_size));
			}
			return false;
		}
		// a whole block of scalars needs to fit in the shared memory

		size_t n_max_threads = min(device_params.t_Properties().maxThreadsDim[0],
			device_params.t_Properties().maxThreadsPerBlock);
		if(m_n_thread_num > n_max_threads) {
			if(b_verbose) {
				fprintf(stderr, "error: the device can only execute " PRIsize " "
					"threads in a block (" PRIsize " configured to run)\n",
					n_max_threads, m_n_thread_num); // size_t must not be printed using %d
			}
			return false;
		}
		if(m_n_block_size / 4 > n_max_threads) { // ApplyBlockOffset() launches m_n_block_size / 4 threads per workgroup
			if(b_verbose) {
				fprintf(stderr, "error: block size too high; the device can"
					" only execute " PRIsize " threads in a block (" PRIsize " required)\n",
					n_max_threads, m_n_block_size / 4); // size_t must not be printed using %d
			}
			return false;
		}
		// some basic sanity checks

		m_b_status = true;
		// compiled

		return m_b_status;
	}

	/**
	 *	@brief calculates global (exclusive) scan of elements in the given buffer
	 *
	 *	This calculates cumulative sum, e.g. scan([1 6 7 1 1 9]) = [0 1 7 14 15 16].
	 *	Internally, it calculates several local scans in blocks and then applies
	 *	offsets to calculate a global scan. This means that usually all the data
	 *	needs to be read twice and written twice (first time for local scan, second
	 *	time for offset).
	 *	This implementation only works with the "+" operator (the original author
	 *	notes that it could be adapted to other operators as well).
	 *
	 *	@param[in] h_context is context (may need to alloc auxiliary buffers)
	 *	@param[in] h_cmd_queue is command queue where the kernels are scheduled
	 *	@param[in] dp_src_buffer is source buffer (must be aligned to block size)
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer where the scan is stored
	 *		(note that it can be the same as dp_src_buffer, can work inplace)
	 *
	 *	@return Returns true on success, false on failure (the kernels are not compiled,
	 *		the sizes are invalid, temporary buffers could not be allocated or a kernel
	 *		failed to launch).
	 *
	 *	@note This function requires some temporary buffers and is not reentrant
	 *		(it is reentrant if processing block size elements or less).
	 *	@note This implementation is tuned for Kepler. Runs over 120 GB/s on GTX 780.
	 *	@note Compared to intel i5, the GPU code starts being faster at approximately
	 *		10k elements.
	 */
	bool ExclusiveScan(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems, cl_mem dp_dest_buffer)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than letting the kernels
		// work with more elements than the buffer is declared to hold

		if(n_buffer_size_elems % m_n_block_size)
			return false;
		// the buffers must be allocated to a multiple of block size

		if(n_data_size_elems <= m_n_block_size) {
			return ExclusiveBlockScan_Single(h_context, h_cmd_queue, dp_src_buffer,
				n_data_size_elems, n_buffer_size_elems, dp_dest_buffer);
		}
		// no temp buffers required for m_n_block_size items or less

		size_t p_scan_size[max_ReductionLevel_Num + 1];
		int n_level_num;
		if((n_level_num = n_TempBuffer_Sizes(p_scan_size, n_data_size_elems)) < 0 ||
		   !Allocate_TempBuffers(n_level_num, p_scan_size, h_context))
			return false;
		_ASSERTE(n_level_num > 1); // otherwise single scan would handle it
		// determine sizes of scans, allocate temp buffers

		if(!ExclusiveBlockScan(h_context, h_cmd_queue, dp_src_buffer,
		   n_data_size_elems, n_buffer_size_elems, dp_dest_buffer,
		   p_scan_size[1], m_dp_block_sums[0]))
			return false;
		// in the first pass, scan src -> dest and calculate block sums to m_dp_block_sums[0]

		return Finalize_GlobalScan(h_context, h_cmd_queue, n_level_num, p_scan_size, dp_dest_buffer);
		// the remaining levels and the offset fixup are handled by the shared routine
	}

	/**
	 *	@brief calculates global (inclusive) scan of elements in the given buffer
	 *
	 *	This calculates cumulative sum, e.g. scan([1 6 7 1 1 9]) = [1 7 14 15 16 25].
	 *	Internally, it calculates several local scans in blocks and then applies
	 *	offsets to calculate a global scan. This means that usually all the data
	 *	needs to be read twice and written twice (first time for local scan, second
	 *	time for offset). Only the first level scan is inclusive, the scans of the
	 *	block sums are exclusive.
	 *	This implementation only works with the "+" operator (the original author
	 *	notes that it could be adapted to other operators as well).
	 *
	 *	@param[in] h_context is context (may need to alloc auxiliary buffers)
	 *	@param[in] h_cmd_queue is command queue where the kernels are scheduled
	 *	@param[in] dp_src_buffer is source buffer (must be aligned to block size)
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer where the scan is stored
	 *		(note that it can be the same as dp_src_buffer, can work inplace)
	 *
	 *	@return Returns true on success, false on failure (the kernels are not compiled,
	 *		the sizes are invalid, temporary buffers could not be allocated or a kernel
	 *		failed to launch).
	 *
	 *	@note This function requires some temporary buffers and is not reentrant
	 *		(it is reentrant if processing block size elements or less).
	 *	@note This implementation is tuned for Kepler. Runs over 120 GB/s on GTX 780.
	 *	@note Compared to intel i5, the GPU code starts being faster at approximately
	 *		10k elements.
	 */
	bool InclusiveScan(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems, cl_mem dp_dest_buffer)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than letting the kernels
		// work with more elements than the buffer is declared to hold

		if(n_buffer_size_elems % m_n_block_size)
			return false;
		// the buffers must be allocated to a multiple of block size

		if(n_data_size_elems <= m_n_block_size) {
			return InclusiveBlockScan_Single(h_context, h_cmd_queue, dp_src_buffer,
				n_data_size_elems, n_buffer_size_elems, dp_dest_buffer);
		}
		// no temp buffers required for m_n_block_size items or less

		size_t p_scan_size[max_ReductionLevel_Num + 1];
		int n_level_num;
		if((n_level_num = n_TempBuffer_Sizes(p_scan_size, n_data_size_elems)) < 0 ||
		   !Allocate_TempBuffers(n_level_num, p_scan_size, h_context))
			return false;
		_ASSERTE(n_level_num > 1); // otherwise single scan would handle it
		if(n_level_num < 2)
			return false;
		// determine sizes of scans, allocate temp buffers (fewer than two levels
		// would make the code below index m_dp_block_sums[-1])

		if(!InclusiveBlockScan(h_context, h_cmd_queue, dp_src_buffer,
		   n_data_size_elems, n_buffer_size_elems, dp_dest_buffer,
		   p_scan_size[1], m_dp_block_sums[0]))
			return false;
		// in the first pass, scan src -> dest and calculate block sums to m_dp_block_sums[0]

		// note that the rest of the scans are exclusive

		for(int i = 1; i < n_level_num - 1; ++ i) {
			if(!ExclusiveBlockScan(h_context, h_cmd_queue, m_dp_block_sums[i - 1],
			   p_scan_size[i], m_p_block_sum_buffer_size[i - 1], m_dp_block_sums[i - 1],
			   p_scan_size[i + 1], m_dp_block_sums[i]))
				return false;
		}
		// for all passes except the first and the last one,
		// reduce block sums of the previous pass to itself
		// and save block sums as result of the current pass

		{
			int i = n_level_num - 1;
			size_t n_pass_size = p_scan_size[i];
			if(!ExclusiveBlockScan_Single(h_context, h_cmd_queue, m_dp_block_sums[i - 1],
			   n_pass_size, m_p_block_sum_buffer_size[i - 1], m_dp_block_sums[i - 1]))
				return false;
		}
		// in the last pass, just scan the last block

		for(int i = n_level_num - 1; i > 0;) {
			-- i;
			// i points at the pass being adjusted (level 0 is the destination buffer)

			if(!ApplyBlockOffset(h_context, h_cmd_queue, p_scan_size[i],
			   (i)? m_dp_block_sums[i - 1] : dp_dest_buffer,
			   p_scan_size[i + 1], m_dp_block_sums[i]))
				return false;
		}
		// fixup for the blocking of the scans, from the second lowest level up to the full scan

		return true;
	}

	/**
	 *	@brief runs the single-workgroup exclusive scan kernel on the given buffer
	 *
	 *	@param[in] h_context is OpenCL context (unused here)
	 *	@param[in] h_cmd_queue is command queue where the kernel is scheduled
	 *	@param[in] dp_src_buffer is source buffer
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems or UINT_MAX)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer (may be the same
	 *		as dp_src_buffer, can work inplace)
	 *
	 *	@return Returns true on success, false on failure (the kernels are not
	 *		compiled, the sizes are invalid or the kernel failed to launch).
	 *
	 *	@note A single workgroup of threads is launched; ExclusiveScan() only
	 *		uses this for block size elements or less.
	 */
	bool ExclusiveBlockScan_Single(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems,
		cl_mem dp_dest_buffer)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than launching the kernel
		// with more elements than the buffer is declared to hold

		if(n_buffer_size_elems % m_n_block_size)
			return false;
		// the buffers must be allocated to a multiple of block size

		_ASSERTE(n_data_size_elems <= UINT_MAX); // it is unsigned on the GPU side
		if(n_data_size_elems > UINT_MAX)
			return false;
		// the size is passed as a 32-bit value, refuse to silently truncate it

		size_t n_local_work_size = m_n_thread_num;
		size_t n_global_work_size = 1 * n_local_work_size; // reduce a single block

		if(clCall1D3(h_cmd_queue, m_ex_scan_single_kernel, n_global_work_size, n_local_work_size,
		   dp_src_buffer, int(n_data_size_elems), dp_dest_buffer) != CL_SUCCESS)
			return false;
		// just run the kernel and we're done

		return true;
	}

	/**
	 *	@brief runs the blocked exclusive scan kernel, which scans the individual
	 *		blocks of the buffer and outputs the block sums
	 *
	 *	@param[in] h_context is OpenCL context (unused here)
	 *	@param[in] h_cmd_queue is command queue where the kernel is scheduled
	 *	@param[in] dp_src_buffer is source buffer
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems or UINT_MAX)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer (may be the same
	 *		as dp_src_buffer, can work inplace)
	 *	@param[in] n_block_sum_num is number of block sums; must equal the number
	 *		of blocks, i.e. n_data_size_elems divided by block size, rounded up
	 *	@param[out] dp_block_sums is buffer where the block sums are written
	 *
	 *	@return Returns true on success, false on failure (the kernels are not
	 *		compiled, the sizes are invalid or the kernel failed to launch).
	 *
	 *	@note The results are scans local to each block; use ApplyBlockOffset()
	 *		with scanned block sums to turn them into a global scan.
	 */
	bool ExclusiveBlockScan(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems,
		cl_mem dp_dest_buffer, size_t n_block_sum_num, cl_mem dp_block_sums)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than launching the kernel
		// with more elements than the buffer is declared to hold

		size_t n_pass_size = n_data_size_elems;
		size_t n_block_num = (n_pass_size + m_n_block_size - 1) / m_n_block_size;
		if(n_buffer_size_elems % m_n_block_size || n_block_num != n_block_sum_num) // !!
			return false;
		// the buffers must be allocated to a multiple of block size
		// and there must be exactly one block sum per block
		_ASSERTE(n_block_num > 1);

		_ASSERTE(n_pass_size <= UINT_MAX); // it is unsigned on the GPU side
		if(n_pass_size > UINT_MAX)
			return false;
		// the size is passed as a 32-bit value, refuse to silently truncate it

		const size_t n_max_workgroup_num = m_n_SM_num * 16;
		size_t n_local_work_size = m_n_thread_num;
		size_t n_global_work_size = n_block_num; // reduce N blocks
		bool b_loop_kernel;
		if((b_loop_kernel = (n_global_work_size > n_max_workgroup_num)))
			n_global_work_size = n_max_workgroup_num; // reduce many blocks, but do not overload SMs
		n_global_work_size *= n_local_work_size;
		// with more than 16 blocks per SM, the number of workgroups is capped
		// and the "sums" kernel is used instead of its "no loop" variant

		cl_kernel h_kernel = (b_loop_kernel)? m_ex_scan_sums_kernel :
			m_ex_scan_sums_noloop_kernel;
		if(clCall1D4(h_cmd_queue, h_kernel, n_global_work_size, n_local_work_size,
		   dp_src_buffer, int(n_pass_size), dp_dest_buffer, dp_block_sums) != CL_SUCCESS)
			return false;
		// scan the blocks and write the block sums

		return true;
	}

	/**
	 *	@brief runs the single-workgroup inclusive scan kernel on the given buffer
	 *
	 *	@param[in] h_context is OpenCL context (unused here)
	 *	@param[in] h_cmd_queue is command queue where the kernel is scheduled
	 *	@param[in] dp_src_buffer is source buffer
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems or UINT_MAX)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer (may be the same
	 *		as dp_src_buffer, can work inplace)
	 *
	 *	@return Returns true on success, false on failure (the kernels are not
	 *		compiled, the sizes are invalid or the kernel failed to launch).
	 *
	 *	@note A single workgroup of threads is launched; InclusiveScan() only
	 *		uses this for block size elements or less.
	 */
	bool InclusiveBlockScan_Single(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems,
		cl_mem dp_dest_buffer)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than launching the kernel
		// with more elements than the buffer is declared to hold

		if(n_buffer_size_elems % m_n_block_size)
			return false;
		// the buffers must be allocated to a multiple of block size

		_ASSERTE(n_data_size_elems <= UINT_MAX); // it is unsigned on the GPU side
		if(n_data_size_elems > UINT_MAX)
			return false;
		// the size is passed as a 32-bit value, refuse to silently truncate it

		size_t n_local_work_size = m_n_thread_num;
		size_t n_global_work_size = 1 * n_local_work_size; // reduce a single block

		if(clCall1D3(h_cmd_queue, m_in_scan_single_kernel, n_global_work_size, n_local_work_size,
		   dp_src_buffer, int(n_data_size_elems), dp_dest_buffer) != CL_SUCCESS)
			return false;
		// just run the kernel and we're done

		return true;
	}

	/**
	 *	@brief runs the blocked inclusive scan kernel, which scans the individual
	 *		blocks of the buffer and outputs the block sums
	 *
	 *	@param[in] h_context is OpenCL context (unused here)
	 *	@param[in] h_cmd_queue is command queue where the kernel is scheduled
	 *	@param[in] dp_src_buffer is source buffer
	 *	@param[in] n_data_size_elems is size of data in the buffer, in elements
	 *		(must not exceed n_buffer_size_elems or UINT_MAX)
	 *	@param[in] n_buffer_size_elems is size to which the buffer is allocated,
	 *		in elements (must be a multiple of block size)
	 *	@param[in] dp_dest_buffer is the destination buffer (may be the same
	 *		as dp_src_buffer, can work inplace)
	 *	@param[in] n_block_sum_num is number of block sums; must equal the number
	 *		of blocks, i.e. n_data_size_elems divided by block size, rounded up
	 *	@param[out] dp_block_sums is buffer where the block sums are written
	 *
	 *	@return Returns true on success, false on failure (the kernels are not
	 *		compiled, the sizes are invalid or the kernel failed to launch).
	 *
	 *	@note The results are scans local to each block; use ApplyBlockOffset()
	 *		with scanned block sums to turn them into a global scan.
	 */
	bool InclusiveBlockScan(cl_context h_context, cl_command_queue h_cmd_queue,
		cl_mem dp_src_buffer, size_t n_data_size_elems, size_t n_buffer_size_elems,
		cl_mem dp_dest_buffer, size_t n_block_sum_num, cl_mem dp_block_sums)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_data_size_elems <= n_buffer_size_elems); // more data than buffer size? nonsense.
		if(n_data_size_elems > n_buffer_size_elems)
			return false;
		// enforce this also in release builds, rather than launching the kernel
		// with more elements than the buffer is declared to hold

		size_t n_pass_size = n_data_size_elems;
		size_t n_block_num = (n_pass_size + m_n_block_size - 1) / m_n_block_size;
		if(n_buffer_size_elems % m_n_block_size || n_block_num != n_block_sum_num) // !!
			return false;
		// the buffers must be allocated to a multiple of block size
		// and there must be exactly one block sum per block
		_ASSERTE(n_block_num > 1);

		_ASSERTE(n_pass_size <= UINT_MAX); // it is unsigned on the GPU side
		if(n_pass_size > UINT_MAX)
			return false;
		// the size is passed as a 32-bit value, refuse to silently truncate it

		const size_t n_max_workgroup_num = m_n_SM_num * 16;
		size_t n_local_work_size = m_n_thread_num;
		size_t n_global_work_size = n_block_num; // reduce N blocks
		bool b_loop_kernel;
		if((b_loop_kernel = (n_global_work_size > n_max_workgroup_num)))
			n_global_work_size = n_max_workgroup_num; // reduce many blocks, but do not overload SMs
		n_global_work_size *= n_local_work_size;
		// with more than 16 blocks per SM, the number of workgroups is capped
		// and the "sums" kernel is used instead of its "no loop" variant

		cl_kernel h_kernel = (b_loop_kernel)? m_in_scan_sums_kernel :
			m_in_scan_sums_noloop_kernel;
		if(clCall1D4(h_cmd_queue, h_kernel, n_global_work_size, n_local_work_size,
		   dp_src_buffer, int(n_pass_size), dp_dest_buffer, dp_block_sums) != CL_SUCCESS)
			return false;
		// scan the blocks and write the block sums

		return true;
	}

	/**
	 *	@brief adds scanned block sums to the blocks of a blocked scan,
	 *		turning local scans of the blocks into a global scan
	 *
	 *	@param[in] h_context is OpenCL context (unused here)
	 *	@param[in] h_cmd_queue is command queue where the kernel is scheduled
	 *	@param[in] n_scan_size is size of the scan, in elements (must be greater
	 *		than block size and must not exceed UINT_MAX)
	 *	@param[in,out] dp_block_scan is buffer with the blocked scan to be adjusted
	 *	@param[in] n_block_sum_num is number of block sums (n_scan_size divided
	 *		by block size, rounded up; only checked in debug builds)
	 *	@param[in] dp_block_sums is buffer with the scanned block sums
	 *
	 *	@return Returns true on success, false on failure (the kernels are not
	 *		compiled, n_scan_size is out of range or the kernel failed to launch).
	 *
	 *	@note The first block is skipped as the scan of block sums starts with
	 *		zero; this is why scans of a single block or less are rejected.
	 */
	bool ApplyBlockOffset(cl_context h_context, cl_command_queue h_cmd_queue,
		size_t n_scan_size, cl_mem dp_block_scan, size_t n_block_sum_num, cl_mem dp_block_sums)
	{
		if(!m_b_status)
			return false;
		// the kernels must be compiled first

		_ASSERTE(n_scan_size > m_n_block_size);
		if(n_scan_size <= m_n_block_size)
			return false;
		// must be more than 1 block, otherwise why offset? (also enforced in release
		// builds; the unsigned subtraction below would otherwise wrap around and
		// yield zero or an enormous number of workgroups)

		_ASSERTE(n_block_sum_num == (n_scan_size + m_n_block_size - 1) / m_n_block_size);
		// make sure that the number of block sums is correct

		_ASSERTE(n_scan_size <= UINT_MAX); // it is unsigned on the GPU side
		if(n_scan_size > UINT_MAX)
			return false;
		// the size is passed as a 32-bit value, refuse to silently truncate it

		const size_t n_remaining_size = n_scan_size - m_n_block_size;
		// can skip the first block, as scan of block sums starts with 0 - schedule one block less work

		bool b_single_block = n_remaining_size < m_n_block_size;
		size_t n_workgroup_num = (b_single_block)? 1 :
			(n_remaining_size + m_n_block_size - 1) / m_n_block_size; // single block -> single workgroup
		size_t n_local_work_size = m_n_block_size / 4; // todo - make configurable
		size_t n_global_work_size = n_workgroup_num * n_local_work_size;
		// one workgroup per remaining block, each with a quarter of block size threads

		cl_kernel h_kernel = (b_single_block)? m_offset_single_kernel : m_offset_kernel;
		if(clCall1D3(h_cmd_queue, h_kernel, n_global_work_size, n_local_work_size,
		   dp_block_sums, dp_block_scan, int(n_scan_size)) != CL_SUCCESS)
			return false;
		// add scanned sums of blocks from next level to whole blocks in this level

		return true;
	}

	/**
	 *	@brief runs a simple benchmark of running scans of different sizes
	 *		(prints results to stdout)
	 *
	 *	For both the exclusive and the inclusive scan and for each of several array
	 *	sizes, a random permutation of 0 .. n - 1 is uploaded to a device buffer,
	 *	scanned in place repeatedly while the time is measured, and the result of
	 *	the last pass is compared to a scan calculated on the CPU.
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] h_cmd_queue is command queue where the kernels are scheduled
	 *
	 *	@return Returns true on success, false on failure (kernels failed to launch
	 *		or give incorrect results, or a device buffer could not be allocated
	 *		or transferred).
	 *
	 *	@note This function throws std::bad_alloc.
	 *	@note The results are also checked against CPU ground truth, this can serve
	 *		also as a simple way to see if the code behaves well.
	 */
	static bool Benchmark(cl_context h_context,
		cl_device_id h_device, cl_command_queue h_cmd_queue) // throw(std::bad_alloc)
	{
		CCLScanKernels scan;

		scan.Set_DataTypes("TWOWORD(unsigned,int)", 4);
		_ASSERTE(sizeof(int) == 4);

		const size_t n_block_size = scan.n_Block_Size();
		if(!scan.Compile(h_context, h_device, true)) {
			fprintf(stderr, "error: failed to compile scan primitives\n");
			return false;
		}
		// compile scans

		//const size_t p_size[] = {1024, 1234, 2048, 12345, 123456, 1048576, 1234567, 1048576 * 2, 12345678, 1048576 * 100};
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200, 1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		const size_t n_size_num = sizeof(p_size) / sizeof(p_size[0]);
		// sizes of the benchmarked arrays, in elements (the failure diagnostics
		// below print elements up to index 64, all the sizes must be greater than that)

		bool b_results_correct = true;
		for(int n_pass2 = 0; n_pass2 < 2; ++ n_pass2) {
			bool b_inclusive = n_pass2 != 0;
			// the first pass benchmarks exclusive scan, the second one inclusive scan

			for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				std::vector<unsigned int> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (unsigned int)i;
				for(size_t i = 0; i < n; ++ i)
					std::swap(scan_data[i], scan_data[i + rand() % (n - i)]);
				// generate some data (a shuffled sequence of 0 to n - 1)

				cl_mem dp_scan_buffer;
				size_t n_buffer_size_elems = n_Align_Up(scan_data.size(), n_block_size);
				{
					cl_int n_result;
					dp_scan_buffer = clCreateBuffer(h_context, CL_MEM_READ_WRITE,
						n_buffer_size_elems * sizeof(int), NULL, &n_result);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: failed to alloc device buffer\n");
						return false;
					}
				}
				// allocate memory (padded to a multiple of the block size)

				printf("running test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				int n_pass_num = 0;
				for(;;) {
					cl_int n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_scan_buffer, true, 0,
						scan_data.size() * sizeof(int), &scan_data[0], 0, 0, 0);
					if(n_result == CL_SUCCESS)
						n_result = clFinish(h_cmd_queue);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: failed to upload data: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						clReleaseMemObject(dp_scan_buffer); // do not leak the buffer on failure
						return false;
					}
					// prepare data ... (the scan runs in place, the input needs to be restored before each pass)

					double f_start_time = test_timer.f_Time();

					//printf("\n=== scan of %d elems ===\n", scan_data.size()); // separate debug outputs

					if(b_inclusive) {
						if(!scan.InclusiveScan(h_context, h_cmd_queue, dp_scan_buffer,
						   scan_data.size(), n_buffer_size_elems, dp_scan_buffer)) {
							fprintf(stderr, "error: InclusiveScan() failed\n");
							b_results_correct = false; // a failed launch is a failure, even if stale data happened to match
						}
					} else {
						if(!scan.ExclusiveScan(h_context, h_cmd_queue, dp_scan_buffer,
						   scan_data.size(), n_buffer_size_elems, dp_scan_buffer)) {
							fprintf(stderr, "error: ExclusiveScan() failed\n");
							b_results_correct = false; // a failed launch is a failure, even if stale data happened to match
						}
					}
					// calculates scan of arbitrary-sized array

					//printf("\n"); // separate debug outputs

					n_result = clFinish(h_cmd_queue);
					if(n_result) {
						fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						clReleaseMemObject(dp_scan_buffer); // do not leak the buffer on failure
						return false;
					}

					double f_pass_time = test_timer.f_Time() - f_start_time;
					//if(n_pass_num)
						f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
				}
				//-- n_pass_num; // the first pass did not count
				// run the thing

				f_time /= n_pass_num; // at least one pass always runs, n_pass_num is nonzero
				size_t n_data = 4/*3*/ * scan_data.size() * sizeof(int); // 3x to match mgpu benchmarks; 4x because data is read and written in scan and later again in offset
				double f_GBps = n_data / f_time * 1e-9;
				printf("on " PRIsize/*B*/ "B, it took %f msec, reaching %f GB/s\n",
					/*PRIsizeBparams*/(scan_data.size() * sizeof(int)), f_time * 1000, f_GBps);
				// print results

				std::vector<unsigned int> global_scan_cpu(n);
				CTimer tcpu;
				if(b_inclusive) {
					global_scan_cpu[0] = scan_data[0];
					for(size_t i = 1; i < n; ++ i)
						global_scan_cpu[i] = global_scan_cpu[i - 1] + scan_data[i];
				} else {
					for(unsigned int i = 0, n_accum = 0; i < n; ++ i) {
						global_scan_cpu[i] = n_accum;
						n_accum += scan_data[i];
					}
				}
				printf("global %sclusive scan takes %f msec on CPU\n", (b_inclusive)? "in" : "ex", tcpu.f_Time() * 1000);
				// perform a global scan (the goal); unsigned arithmetic wraps around the same way on both sides

				cl_int n_read_result = clEnqueueReadBuffer(h_cmd_queue, dp_scan_buffer, true, 0,
					scan_data.size() * sizeof(int), &scan_data[0], 0, 0, 0);
				// copy back to CPU (overwrites the input with the result of the last pass)

				clReleaseMemObject(dp_scan_buffer);
				// release memory

				if(n_read_result != CL_SUCCESS) {
					fprintf(stderr, "error: failed to download data: %d (%s, %d)\n", n_read_result, __FILE__, __LINE__);
					return false;
				}
				// without the data there is nothing to verify

				size_t n_err_num = 0;
				for(size_t i = 0; i < n; ++ i) {
					if(scan_data[i] != global_scan_cpu[i]) {
						if(++ n_err_num < 100) { // limit the amount of output
							fprintf(stderr, "error: scan failed: scan_data[" PRIsize "] = %u (should be %u)\n",
								i, scan_data[i], global_scan_cpu[i]);
						}
					}
				}
				if(!n_err_num)
					printf("done. %sclusive scan of " PRIsize " items succeeded\n", (b_inclusive)? "in" : "ex", n);
				else {
					fprintf(stderr, "error: %sclusive scan failed with " PRIsize " errors\n", (b_inclusive)? "in" : "ex", n_err_num);
					printf("scan[16] = %u\n", global_scan_cpu[16]);
					printf("scan[32] = %u\n", global_scan_cpu[32]);
					printf("scan[40] = %u\n", global_scan_cpu[40]);
					printf("scan[48] = %u\n", global_scan_cpu[48]);
					printf("scan[64] = %u\n", global_scan_cpu[64]);
					b_results_correct = false;
				}
				// make sure it is scanned correctly
			}
		}

		return b_results_correct;
	}

protected:
	/**
	 *	@brief calculates sizes of the reduction levels needed to scan an array
	 *
	 *	Level 0 is the array itself; each following level has one element per
	 *	m_n_block_size elements of the previous level (rounded up). The levels end
	 *	with the first one that fits in a single block, a zero is written after it.
	 *
	 *	@param[out] p_scan_size is array to be filled with the level sizes, in elements
	 *		(entries up to and including index max_ReductionLevel_Num may be written)
	 *	@param[in] n_data_size_elems is size of the scanned array, in elements
	 *
	 *	@return Returns the number of levels (1 to max_ReductionLevel_Num) on success
	 *		or -1 if the array is too big to be reduced in max_ReductionLevel_Num levels.
	 */
	int n_TempBuffer_Sizes(size_t *p_scan_size, size_t n_data_size_elems) const
	{
		p_scan_size[0] = n_data_size_elems;

		int n_last_level = 0; // 0 is correct - it is index, not number of levels
		while(n_last_level < max_ReductionLevel_Num &&
		   p_scan_size[n_last_level] > m_n_block_size) {
			const size_t n_level_size = p_scan_size[n_last_level];
			p_scan_size[n_last_level + 1] = n_level_size / m_n_block_size +
				((n_level_size % m_n_block_size)? 1 : 0);
			// number of blocks, rounded up; written this way rather than as
			// (size + block - 1) / block, which wraps around for sizes close to SIZE_MAX
			++ n_last_level;
		}
		// add levels until the last one fits in a single block

		if(n_last_level == max_ReductionLevel_Num)
			return -1; // the array is too big, not reducible in the given number of steps
		// the loop can only end by running out of levels or by the last level fitting
		// in a single block; no other checks are needed here

		p_scan_size[n_last_level + 1] = 0; // terminate the list of sizes
		return n_last_level + 1; // from zero-based index to number of levels
	}

	bool Allocate_TempBuffers(int n_level_num, const size_t *p_scan_size, cl_context h_context)
	{
		for(int i = n_level_num; i > 0;) {
			-- i;
			// here

			size_t n_size = n_Align_Up_POT(p_scan_size[i + 1], m_n_block_size) * m_n_scalar_type_size;
			// calculate size of temp buffer required at i-th level of reduction

			if(m_p_block_sum_buffer_size[i] >= n_size)
				continue;
			// have a buffer of sufficient size

			if(m_p_block_sum_buffer_size[i]) {
				//clReleaseMemObject(m_dp_block_sums[i]);

				//debug_printf("debug: deleting level %d buffer (%d B)\n", i, m_p_block_sum_buffer_size[i]);

				m_p_block_sum_buffer_size[i] = 0;
				m_dp_block_sums[i] = 0;
			}
			// the buffer is too small, delete it (if it was allocated)

			for(int j = i; j > 0;) {
				-- j; // here
				if(m_p_block_sum_buffer_size[j] >= n_size &&
				   m_p_block_sum_buffer_size[j] < n_size * 2) {
					std::swap(m_p_block_sum_buffer_size[j], m_p_block_sum_buffer_size[i]);
					std::swap(m_dp_block_sums[j], m_dp_block_sums[i]);

					/*debug_printf("debug: reusing buffer of %d B from level %d instead of buffer of %d B for level %d\n",
						m_p_block_sum_buffer_size[i], j, n_size, i); // m_p_block_sum_buffer_size[j] is swapped now*/
					break;
				}
			}
			// see if we can accomodate it in one of the bigger buffers

			if(m_p_block_sum_buffer_size[i] < n_size) {
				_ASSERTE(!m_p_block_sum_buffer_size[i]);

				//debug_printf("debug: allocating level %d buffer (%d B)\n", i, n_size);

				cl_int n_result;
				m_dp_block_sums[i] = clCreateBuffer(h_context, CL_MEM_READ_WRITE, n_size, NULL, &n_result);
				if(n_result != CL_SUCCESS) {
					m_p_block_sum_buffer_size[i] = 0;
					m_dp_block_sums[i] = 0;
					//fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				m_p_block_sum_buffer_size[i] = n_size;
			}
			// if we did not find anything, allocate a new one
		}
		// allocate temp buffers

		return true;
	}

	/**
	 *	@brief finishes a multi-level scan: scans the block sums and applies them as offsets
	 *
	 *	This runs in three stages. First, block sums of the intermediate levels
	 *	(1 to n_level_num - 2) are scanned in place, each writing its own block
	 *	sums to the next level. Then the last level is scanned as a single block.
	 *	Finally, going from the second highest level down to level 0, the scanned
	 *	block sums of each level are applied as offsets to the level below,
	 *	with level 0 being applied to dp_dest_buffer.
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_cmd_queue is command queue where the kernels are scheduled
	 *	@param[in] n_level_num is number of levels (m_dp_block_sums[n_level_num - 2]
	 *		is accessed, so this must be at least 2)
	 *	@param[in] p_scan_size is array of sizes of the levels, in elements
	 *	@param[in] dp_dest_buffer is the buffer with the blockwise scanned data of level 0
	 *
	 *	@return Returns true on success, false on failure (one of the called functions failed).
	 *
	 *	@note This presumably expects the block sums of level 0 to be already stored
	 *		in m_dp_block_sums[0] by the caller - TODO confirm against the callers.
	 */
	bool Finalize_GlobalScan(cl_context h_context, cl_command_queue h_cmd_queue,
		int n_level_num, const size_t *p_scan_size, cl_mem dp_dest_buffer)
	{
		for(int i = 1; i < n_level_num - 1; ++ i) {
#if 1
			if(!ExclusiveBlockScan(h_context, h_cmd_queue, m_dp_block_sums[i - 1],
			   p_scan_size[i], m_p_block_sum_buffer_size[i - 1], m_dp_block_sums[i - 1],
			   p_scan_size[i + 1], m_dp_block_sums[i]))
				return false;
			// use the reusable function; the inline code below is only kept for reference / debugging
#else // 1
			int n_pass_size = p_scan_size[i];
			int n_block_num = (n_pass_size + m_n_block_size - 1) / m_n_block_size;
			_ASSERTE(n_block_num > 1);
			size_t n_local_work_size = m_n_thread_num;
			size_t n_global_work_size = n_block_num; // reduce N blocks
			bool b_loop_kernel;
			if((b_loop_kernel = (n_global_work_size > m_n_SM_num * 16)))
				n_global_work_size = m_n_SM_num * 16; // reduce many blocks, but do not overload SMs
			n_global_work_size *= n_local_work_size;

			/*debug_printf("debug: level %d scan of %d elements (%d threads x %d workgroups, %s kernel)\n",
				i, n_pass_size, n_local_work_size, n_global_work_size / n_local_work_size,
				(b_loop_kernel)? "g" : "nl");*/

			cl_kernel h_kernel = (b_loop_kernel)? m_ex_scan_sums_kernel :
				m_ex_scan_sums_noloop_kernel;
			/*clSetKernelArgs(h_kernel, m_dp_block_sums[i - 1],
				n_pass_size, m_dp_block_sums[i - 1], m_dp_block_sums[i]);
			if(clEnqueueNDRangeKernel(h_cmd_queue, h_kernel, 1, 0, &n_global_work_size,
			   &n_local_work_size, 0, 0, 0) != CL_SUCCESS)*/
			if(clCall1D4(h_cmd_queue, h_kernel, n_global_work_size, n_local_work_size,
			   m_dp_block_sums[i - 1], n_pass_size, m_dp_block_sums[i - 1], m_dp_block_sums[i]) != CL_SUCCESS)
				return false;
#endif // 1
		}
		// for all passes except the first and the last one,
		// reduce block sums of the previous pass to itself
		// and save block sums as result of the current pass

		//clFinish(h_cmd_queue); // t_odo - remove me

		{
			int i = n_level_num - 1; // index of the last level; i - 1 below requires n_level_num >= 2
			size_t n_pass_size = p_scan_size[i];
#if 1
			if(!ExclusiveBlockScan_Single(h_context, h_cmd_queue, m_dp_block_sums[i - 1],
			   n_pass_size, m_p_block_sum_buffer_size[i - 1], m_dp_block_sums[i - 1]))
				return false;
			// use the reusable function; the inline code below is only kept for reference / debugging
#else // 1
			_ASSERTE((n_pass_size + m_n_block_size - 1) / m_n_block_size == 1); // make sure there is only a single block

			// note that if p_scan_size[i] == 2, then the scan is trivial: scan([a b]) = [0 a]
			// this is currently not exploited, although it could save one kernel launch in
			// some cases (probably considerable speedup for small scans, but there the CPU
			// is faster anyway) // todo

			size_t n_local_work_size = m_n_thread_num;
			size_t n_global_work_size = 1 * n_local_work_size; // reduce a single block

			/*debug_printf("debug: last level scan of %d elements (%d threads x %d workgroups, s kernel)\n",
				n_pass_size, n_local_work_size, n_global_work_size / n_local_work_size);*/

			/*clSetKernelArgs(m_ex_scan_single_kernel, m_dp_block_sums[i - 1], n_pass_size, m_dp_block_sums[i - 1]);
			if(clEnqueueNDRangeKernel(h_cmd_queue, m_ex_scan_single_kernel, 1, 0, &n_global_work_size,
			   &n_local_work_size, 0, 0, 0) != CL_SUCCESS)*/
			if(clCall1D3(h_cmd_queue, m_ex_scan_single_kernel, n_global_work_size, n_local_work_size,
			   m_dp_block_sums[i - 1], n_pass_size, m_dp_block_sums[i - 1]) != CL_SUCCESS)
				return false;
#endif // 1
		}
		// in the last pass, just scan the last block

		//clFinish(h_cmd_queue); // t_odo - remove me

		for(int i = n_level_num - 1; i > 0;) {
			-- i;
			// here (decrementing at the beginning of the body makes i go
			// from n_level_num - 2 down to 0, inclusive)

			// i points at the pass being adjusted
#if 1
			if(!ApplyBlockOffset(h_context, h_cmd_queue, p_scan_size[i],
			   (i)? m_dp_block_sums[i - 1] : dp_dest_buffer,
			   p_scan_size[i + 1], m_dp_block_sums[i]))
				return false;
			// use the reusable function; the inline code below is only kept for reference / debugging
			// (level 0 has no block sum buffer below it, its offsets go to the destination buffer)
#else // 1
			_ASSERTE(p_scan_size[i] > m_n_block_size);
			// must be more than 1 block, otherwise why offset?

			bool b_single_block = (p_scan_size[i] - m_n_block_size) < m_n_block_size;

			size_t n_local_work_size = /*(b_single_block)? m_n_block_size :*/ m_n_block_size / 4;// min(512, n_Make_POT(p_scan_size[i])); // todo - make configurable
			size_t n_global_work_size = (b_single_block)? n_local_work_size :
				/*min(m_n_SM_num * 16,*/ size_t(p_scan_size[i] - m_n_block_size +
				m_n_block_size - 1) / m_n_block_size/*)*/ * n_local_work_size; // single block -> single workgroup
			//n_global_work_size = max(size_t(1), n_global_work_size - n_global_work_size % m_n_SM_num); // align it to prevent having idle blocks at the end (probably does not help)
			// can skip the first block, as scan of block sums starts with 0 - schedule one block less work

			/*std::vector<int> offsets(p_scan_size[i + 1]); // number of blocks in the current level
			clEnqueueReadBuffer(h_cmd_queue, m_dp_block_sums[i], true, 0, offsets.size() * sizeof(int),
				&offsets[0], 0, 0, 0);*/
			// debug - see what is being added

			/*debug_printf("debug: sending block sums up to level %d (%d elements, %d threads x %d workgroups)\n",
				i, p_scan_size[i], n_local_work_size, n_global_work_size / n_local_work_size);*/

			cl_kernel h_kernel = (b_single_block)? m_offset_single_kernel : m_offset_kernel;
			/*clSetKernelArgs(h_kernel, m_dp_block_sums[i],
				(i)? m_dp_block_sums[i - 1].h_Get() : dp_dest_buffer, p_scan_size[i]); // todo - note that the first block sum is 0, no need to shift that block
			if(clEnqueueNDRangeKernel(h_cmd_queue, h_kernel, 1, 0,
			   &n_global_work_size, &n_local_work_size, 0, 0, 0) != CL_SUCCESS)*/
			if(clCall1D3(h_cmd_queue, h_kernel, n_global_work_size, n_local_work_size, m_dp_block_sums[i],
			   (i)? m_dp_block_sums[i - 1] : dp_dest_buffer, p_scan_size[i]) != CL_SUCCESS)
				return false;
			// add scanned sums of blocks from next level to whole blocks in this level
#endif // 1
		}
		// fixup for the blocking of the scans, from the second lowest level up to the full scan

		return true;
	}

private:
	/**
	 *	@brief copy-constructor; declared private so that code outside
	 *		of the class cannot copy the object
	 *	@param[in] r_other is the object to copy from (unused)
	 */
	inline CCLScanKernels(const CCLScanKernels &r_other); // this object is not copy-able; use pointers instead

	/**
	 *	@brief copy-operator; declared private so that code outside
	 *		of the class cannot assign the object
	 *	@param[in] r_other is the object to copy from (unused)
	 *	@return Declared to return reference to this.
	 */
	inline CCLScanKernels &operator =(const CCLScanKernels &r_other); // this object is not copy-able; use pointers instead
};

#endif // __OPENCL_SCAN_KERNELS_v2_INCLUDED
