/*
								+--------------------------------+
								|                                |
								| *** OCL tiled scan, reduce *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|       TiledScanReduce.h        |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_TILED_SCAN_REDUCTION_INCLUDED
#define __OPENCL_TILED_SCAN_REDUCTION_INCLUDED

/**
 *	@file gpgpu/TiledScanReduce.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief OpenCL tiled scan and reduce primitives
 */

#include <vector>
#include <algorithm>
#include <numeric>
#include "ClUtils.h"
#include "AutotuneInfo.h"
#include "kernels/ScanReducev3Src.h"
#include "kernels/SegScanReduceSrc.h"

#include "TempBuffer.h" // for testing the segmented scan without having to autotune it first
#include "SegmentedScanReduce_Debug.h" // for testing the segmented scan

/**
 *	@brief reduction or reduction-like operation configuration
 *
 *	This contains configuration for reduction data type and operations.
 *
 *	The reductions are generally performed as \f$f(r(e(x_1), e(x_2), \ldots, e(x_n)))\f$
 *	where \f$e(\cdot)\f$ is element operation, \f$r(\cdot)\f$ is a reduction operation
 *	(binary; the application to more than two elements is tree-like and pretty much
 *	arbitrary) and \f$f(\cdot)\f$ is a finalization operation. There is also an option
 *	to specify the identity value (0 for addition, 1 for multiplication).
 *
 *	For a simple sum, \f$e(x) = x, r(x, y) = x + y, f(x) = x\f$. For Euclidean norm,
 *	\f$e(x) = x^2, r(x, y) = x + y, f(x) = \sqrt{x}\f$.
 */
class CCLReductionConfig {
protected:
	std::string m_s_data_type; // "uint32_t"
	unsigned int m_n_data_type_size; // 4
	std::string m_s_elem_op; // "x"
	std::string m_s_reduce_op; // "x+y"
	char m_n_reduce_op; // '+' (or '?' if indeterminate)
	std::string m_s_final_op; // "x"
	std::string m_s_identity; // "0"

public:
	CCLReductionConfig()
	{
		Set_DataType("uint32_t");
		Set_ReduceOps();
	}

	const char *p_s_DaraType() const
	{
		return m_s_data_type.c_str();
	}

	/**
	 *	@brief gets size of the data type
	 *	@return Returns size of a single element of the data type, in bytes.
	 */
	inline size_t n_DataType_Size() const
	{
		return size_t(m_n_data_type_size);
	}

	/**
	 *	@brief gets the element operation
	 *	@return Returns null-terminated string with the element operation
	 *		expression (e.g. "x"), as stored by Set_ReduceOps().
	 */
	inline const char *p_s_ElementOp() const
	{
		return m_s_elem_op.c_str();
	}

	/**
	 *	@brief gets the reduction operation
	 *	@return Returns null-terminated string with the (binary) reduction operation
	 *		expression (e.g. "x+y"), as stored by Set_ReduceOps().
	 */
	inline const char *p_s_ReduceOp() const
	{
		return m_s_reduce_op.c_str();
	}

	/**
	 *	@brief gets the reduction operator character
	 *	@return Returns the operator character detected in the reduction operation
	 *		by Set_ReduceOps() (e.g. '+'), or '?' if it could not be determined.
	 */
	inline char n_Reduce_Operator() const
	{
		return m_n_reduce_op;
	}

	/**
	 *	@brief determines whether the element operation does something
	 *
	 *	@return Returns false if the element operation is literally "x" or "(x)",
	 *		true for any other string.
	 *
	 *	@note This is a purely textual comparison; other ways of writing
	 *		an identity operation are not recognized.
	 */
	bool b_Has_ElementOp() const
	{
		const bool b_is_identity = m_s_elem_op == "x" || m_s_elem_op == "(x)";
		return !b_is_identity;
	}

	/**
	 *	@brief determines whether the finalization operation does something
	 *
	 *	@return Returns false if the finalization operation is literally "x" or "(x)",
	 *		true for any other string.
	 *
	 *	@note This is a purely textual comparison; other ways of writing
	 *		an identity operation are not recognized.
	 */
	bool b_Has_FinalizeOp() const
	{
		const bool b_is_identity = m_s_final_op == "x" || m_s_final_op == "(x)";
		return !b_is_identity;
	}

	/**
	 *	@brief gets the finalization operation
	 *	@return Returns null-terminated string with the finalization operation
	 *		expression (e.g. "x"), as stored by Set_ReduceOps().
	 */
	inline const char *p_s_FinalizeOp() const
	{
		return m_s_final_op.c_str();
	}

	/**
	 *	@brief gets the identity value of the reduction
	 *	@return Returns null-terminated string with the identity value
	 *		(e.g. "0"), as stored by Set_ReduceOps().
	 */
	inline const char *p_s_IdentityValue() const
	{
		return m_s_identity.c_str();
	}

	bool Set_ReduceOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		const char *p_s_finalize_op = "x", const char *p_s_identity = "0")
	{
		std::string s_elem_op, s_reduce_op, s_final_op, s_identity;
		if(!stl_ut::AssignCStr(s_elem_op, p_s_elem_op) ||
		   !stl_ut::AssignCStr(s_reduce_op, p_s_reduce_op) ||
		   !stl_ut::AssignCStr(s_final_op, p_s_finalize_op) ||
		   !stl_ut::AssignCStr(s_identity, p_s_identity))
			return false;
		if(s_elem_op.find_first_of(" \t\b\r\n") != std::string::npos ||
		   s_reduce_op.find_first_of(" \t\b\r\n") != std::string::npos ||
		   s_final_op.find_first_of(" \t\b\r\n") != std::string::npos ||
		   s_identity.find_first_of(" \t\b\r\n") != std::string::npos)
			return false; // there must be no whitespace so that those could be passed using -D and macros
		if(s_elem_op.find('x') == std::string::npos ||
		   s_reduce_op.find('x') == std::string::npos ||
		   s_reduce_op.find('y') == std::string::npos ||
		   s_final_op.find('x') == std::string::npos)
			return false; // there must be the variables
		// do some sanity checks on the input strings

		m_n_reduce_op = '?';
		if(b_Match("x.x", s_reduce_op))
			m_n_reduce_op = s_reduce_op[1];
		else if(b_Match("(x.x)", s_reduce_op))
			m_n_reduce_op = s_reduce_op[2];
		else if(b_Match("(x).(x)", s_reduce_op))
			m_n_reduce_op = s_reduce_op[3];
		else if(b_Match("((x).(x))", s_reduce_op))
			m_n_reduce_op = s_reduce_op[4];
		// try to figure out what the reduce operator is

		m_s_elem_op.swap(s_elem_op);
		m_s_reduce_op.swap(s_reduce_op);
		m_s_final_op.swap(s_final_op);
		m_s_identity.swap(s_identity);
		// swap all at once to avoid inconsistent states

		//m_b_built = false; // now need to rebuild

		return true;
	}

	/**
	 *	@brief sets the data type of the reduced elements
	 *
	 *	The accepted names are float, double and the fixed-width integer names
	 *	(int8_t through int64_t and uint8_t through uint64_t). A few plain "C" names
	 *	(such as "unsigned int") are accepted as well and are translated
	 *	to the fixed-width names first.
	 *
	 *	@param[in] p_s_data_type is null-terminated string with the type name
	 *
	 *	@return Returns true on success, false on failure (null pointer,
	 *		unknown type name or stl_ut::AssignCStr() failed).
	 *
	 *	@note The "long" family of "C" names is translated to the 32-bit types.
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		if(!p_s_data_type)
			return false;
		// strcmp() below must not be given a null pointer

		static const struct {
			const char *p_s_name, *p_s_standard_name;
		} p_data_type_translation_table[] = {
			{"char", "int8_t"},
			{"signed char", "int8_t"},
			{"unsigned char", "uint8_t"},
			{"short", "int16_t"},
			{"signed short", "int16_t"},
			{"unsigned short", "uint16_t"},
			{"int", "int32_t"},
			{"signed int", "int32_t"},
			{"unsigned int", "uint32_t"},
			{"long", "int32_t"},
			{"signed long", "int32_t"},
			{"unsigned long", "uint32_t"}
		};
		const size_t n_data_type_translation_num =
			sizeof(p_data_type_translation_table) /
			sizeof(p_data_type_translation_table[0]);
		for(size_t i = 0; i < n_data_type_translation_num; ++ i) {
			if(!strcmp(p_s_data_type, p_data_type_translation_table[i].p_s_name)) {
				p_s_data_type = p_data_type_translation_table[i].p_s_standard_name;
				break;
			}
		}
		// replace some easy "C" names by standard names

		static const struct {
			const char *p_s_name;
			int n_size_bytes;
		} p_data_type_table[] = {
			{"float", 4},
			{"double", 8},
			{"uint32_t", 4},
			{"int32_t", 4},
			{"uint64_t", 8},
			{"int64_t", 8},
			{"uint16_t", 2},
			{"int16_t", 2},
			{"uint8_t", 1},
			{"int8_t", 1}
		};
		const size_t n_data_type_num = sizeof(p_data_type_table) /
			sizeof(p_data_type_table[0]);
		for(size_t i = 0; i < n_data_type_num; ++ i) {
			if(!strcmp(p_s_data_type, p_data_type_table[i].p_s_name)) {
				if(!stl_ut::AssignCStr(m_s_data_type, p_s_data_type))
					return false;
				m_n_data_type_size = p_data_type_table[i].n_size_bytes;
				// store the standard name along with the element size

				return true;
			}
		}
		// look the (standard) name up in the table of supported types

		return false; // unknown type
	}

protected:
	/**
	 *	@brief matches a string against a simple pattern
	 *
	 *	In the pattern, '.' matches any single character, 'x' matches a variable
	 *	(either 'x' or 'y') and any other character matches only itself.
	 *	The whole text must match the whole pattern (the lengths must be equal).
	 *
	 *	@param[in] p_s_pattern is null-terminated pattern string
	 *	@param[in] r_s_text is the text to be matched
	 *
	 *	@return Returns true if the text matches the pattern, otherwise returns false.
	 */
	static bool b_Match(const char *p_s_pattern, const std::string &r_s_text)
	{
		const size_t n_length = r_s_text.length();
		for(size_t i = 0; i < n_length; ++ i) {
			const char n_pattern_char = p_s_pattern[i];
			if(!n_pattern_char)
				return false; // pattern is shorter
			if(n_pattern_char == '.')
				continue; // wildcard, anything goes
			if(n_pattern_char == 'x') {
				if(r_s_text[i] != 'x' && r_s_text[i] != 'y')
					return false; // variable mismatch
			} else if(n_pattern_char != r_s_text[i])
				return false; // other pattern mismatch
		}
		// compare the common part (the first n_length characters
		// of the pattern are known to be non-null after this loop)

		return !p_s_pattern[n_length];
		// match only if the pattern ends here as well (otherwise the pattern is longer)
	}
};

class CCLTiled_ReduceScan_Impl {
protected:
	size_t m_n_workgroup_size; // 256
	size_t m_n_tile_size; // 2048
	bool m_b_built;

	CCLUniqueProgram m_program;
	CCLUniqueKernel m_h_tile_reduce_kernel;
	CCLUniqueKernel m_h_multitile_reduce_kernel;
	CCLUniqueKernel m_h_tile_reduce_finalop_kernel; // with final op, if not identity op (then the same kernel as m_h_tile_reduce_kernel)
	CCLUniqueKernel m_h_multitile_reduce_finalop_kernel; // with final op, if not identity op (then the same kernel as m_h_multitile_reduce_kernel)
	CCLUniqueKernel m_h_tile_exscan_kernel;
	CCLUniqueKernel m_h_tile_inscan_kernel;
	CCLUniqueKernel m_h_tile_exscan_ds_kernel; // downsweep
	CCLUniqueKernel m_h_tile_inscan_ds_kernel; // downsweep

public:
	/**
	 *	@brief default constructor; stores the kernel launch configuration
	 *
	 *	@param[in] n_workgroup_size is workgroup size
	 *	@param[in] n_tile_size is tile size, in elements (one workgroup is launched per tile)
	 *
	 *	@note This does not compile the kernels, the object starts in the "not built" state.
	 */
	CCLTiled_ReduceScan_Impl(size_t n_workgroup_size = 128, size_t n_tile_size = 1024)
		:m_n_workgroup_size(n_workgroup_size),
		m_n_tile_size(n_tile_size),
		m_b_built(false)
	{}

	/**
	 *	@brief changes the workgroup size and the tile size
	 *
	 *	@param[in] n_workgroup_size is workgroup size
	 *	@param[in] n_tile_size is tile size, in elements
	 *
	 *	@note Both sizes are passed to the kernels as compile-time macros in Compile(),
	 *		this therefore clears the "built" flag and Compile() needs to be called again.
	 */
	void Set_WorkGroupSize_TileSize(size_t n_workgroup_size = 128, size_t n_tile_size = 1024)
	{
		m_b_built = false; // now need to rebuild

		m_n_tile_size = n_tile_size;
		m_n_workgroup_size = n_workgroup_size;
	}

	/**
	 *	@brief gets the build status
	 *	@return Returns true if the kernels were successfully compiled by Compile()
	 *		(and the configuration did not change since), otherwise returns false.
	 */
	inline bool b_Status() const
	{
		return m_b_built;
	}

	/**
	 *	@brief gets the tile size
	 *	@return Returns the tile size, in elements.
	 */
	size_t n_Tile_Size() const
	{
		return m_n_tile_size;
	}

	/**
	 *	@brief gets the workgroup size
	 *	@return Returns the workgroup size the kernels are launched with.
	 */
	size_t n_WorkGroup_Size() const
	{
		return m_n_workgroup_size;
	}

	/**
	 *	@brief compiles the kernels with the specified reduction settings
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] r_config is reference to scan configuration
	 *	@param[in] b_want_scan is scan kernel build flag
	 *	@param[in] b_want_reduce is reduction kernel build flag
	 *	@param[in] b_verbose is verbosity flag (set to enable verbose)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (e.g. nvcc reports numbers of registers)
	 *	@param[in] b_use_nv_shuffle is NVIDIA specific shuffle instruction enable flag
	 *		(cleared automatically for devices that do not support it)
	 *	@param[in] b_use_Harris_scan is Harris-style workgroup cooperative scan flag
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note If the kernels are already compiled and all the requested kinds of kernels
	 *		(scan and / or reduce) are present, this function has no effect and returns true.
	 *		Otherwise the program is rebuilt and the kernels of the kind that was
	 *		not requested are released.
	 *	@note If the build fails, b_Status() returns false afterwards.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device,
		const CCLReductionConfig &r_config,
		bool b_want_scan, bool b_want_reduce, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true,
		bool b_use_Harris_scan = false)
	{
		bool b_have_scan = m_h_tile_exscan_kernel.h_Get() != 0;
		bool b_have_reduce = m_h_tile_reduce_kernel.h_Get() != 0;
		if(m_b_built && (b_have_scan || !b_want_scan) && (b_have_reduce || !b_want_reduce))
			return true;
		// already compiled and have all the kernels the caller wants

		_ASSERTE(b_want_scan || b_want_reduce);
		// make sure we want something

		m_b_built = false;
		// the program and the kernels are about to be replaced; do not
		// keep reporting success if the rebuild fails halfway through

		//if(!b_Is_POT(m_n_workgroup_size))
		//	b_use_nv_shuffle = false;
		// wouldn't work with NPOT workgroups

		if(b_use_nv_shuffle) {
			CCLDeviceParams dev(h_device);
			if(!dev.b_Is_NVIDIA() || dev.n_NV_ComputeCap_Major() < 3)
				b_use_nv_shuffle = false; // not supported before 3.0 devices
		}

		std::string s_preprocessor;
		if(!stl_ut::Format(s_preprocessor,
		   "-DSCAN_SCALAR_TYPE=%s "
		   "-DSCAN_SCALAR_SIZE=%d "
		   "-DSCAN_LOCAL_WORK_SIZE=" PRIsize " "
		   "-DSCAN_TILE_SIZE=" PRIsize " "
		   "-DREDUCTION_ELEM_OP=(%s) "
		   "-DREDUCTION_REDUCE_OP=(%s) "
		   "-DREDUCTION_REDUCE_OPERATOR=%d " // using \'%c\' does not work on windows 7
		   "-DREDUCTION_FINAL_OP=(%s) "
		   "-DREDUCTION_IDENTITY=%s "
		   "%s " // "-cl-nv-verbose"
		   "%s " // "-DDISABLE_NV_SHFL"
		   "%s " // "-DUSE_HARRIS_SCAN"
		   "%s " // "-DBUILD_SCAN"
		   "%s " // "-DBUILD_REDUCE"
		   "%s " // "-DBUILD_FINALIZING_REDUCE"
		   "%s ", // "-DBUILD_ELEMENT_OP_LESS_REDUCE"
		   r_config.p_s_DaraType(),
		   int(r_config.n_DataType_Size()), // size_t must not be passed for %d
		   m_n_workgroup_size,
		   m_n_tile_size,
		   r_config.p_s_ElementOp(),
		   r_config.p_s_ReduceOp(),
		   int(r_config.n_Reduce_Operator()), // the character code of the operator
		   r_config.p_s_FinalizeOp(),
		   r_config.p_s_IdentityValue(),
		   (b_compiler_verbose)? "-cl-nv-verbose" : "",
		   (!b_use_nv_shuffle)? "-DDISABLE_NV_SHFL" : "",
		   (b_use_Harris_scan)? "-DUSE_HARRIS_SCAN" : "",
		   (b_want_scan)? "-DBUILD_SCAN" : "",
		   (b_want_reduce)? "-DBUILD_REDUCE" : "",
		   (b_want_reduce && r_config.b_Has_FinalizeOp())? "-DBUILD_FINALIZING_REDUCE" : "",
		   (b_want_reduce && r_config.b_Has_ElementOp())? "-DBUILD_ELEMENT_OP_LESS_REDUCE" : ""))
			return false;
		// build the preprocessor string

		if(b_verbose)
			printf("loading \'%s\' ... ", "compressed kernels");
		m_program = CCLUniqueProgram(h_context, CScanReducev3(),
			CCLUniqueProgram::from_compressed, s_preprocessor.c_str(),
			"%temp_default%ScanKernels", 64);
		CLresult n_result = m_program.n_Status();
		if(b_verbose)
			m_program.Dump_StatusWord(); // see the result
		if(n_result != CL_SUCCESS) {
			if(b_verbose)
				fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		// compile program

		{
			const struct TKernelBinding {
				const char *p_s_kernel_name;
				CCLUniqueKernel &r_kernel;
				bool b_is_scan;

				inline TKernelBinding(const char *_p_s_kernel_name,
					CCLUniqueKernel &_r_kernel, bool _b_is_scan)
					:p_s_kernel_name(_p_s_kernel_name), r_kernel(_r_kernel), b_is_scan(_b_is_scan)
				{}
			} p_kernel_bindings[] = {
				TKernelBinding("TileReduce", m_h_tile_reduce_kernel, false),
				TKernelBinding("TileReduce_Multi", m_h_multitile_reduce_kernel, false),
				TKernelBinding((r_config.b_Has_FinalizeOp())? "TileReduce_FinalOp" :
					"TileReduce", m_h_tile_reduce_finalop_kernel, false),
				TKernelBinding((r_config.b_Has_FinalizeOp())? "TileReduce_Multi_FinalOp" :
					"TileReduce_Multi", m_h_multitile_reduce_finalop_kernel, false),
				TKernelBinding("TileExScan", m_h_tile_exscan_kernel, true),
				TKernelBinding("TileInScan", m_h_tile_inscan_kernel, true),
				TKernelBinding("TileExScan_Downsweep", m_h_tile_exscan_ds_kernel, true),
				TKernelBinding("TileInScan_Downsweep", m_h_tile_inscan_ds_kernel, true)
			};
			// without a finalizing op, the "finalop" members get the plain reduction kernels

			for(size_t i = 0, n = sizeof(p_kernel_bindings) / sizeof(p_kernel_bindings[0]); i < n; ++ i) {
				if((p_kernel_bindings[i].b_is_scan && !b_want_scan) ||
				   (!p_kernel_bindings[i].b_is_scan && !b_want_reduce)) {
					p_kernel_bindings[i].r_kernel = cl_kernel(0); // debug
					continue;
				}
				// kernels of the kind that was not requested are not in the program

				CLresult n_kernel_result;
				p_kernel_bindings[i].r_kernel =
					m_program.h_Get_Kernel(p_kernel_bindings[i].p_s_kernel_name, n_kernel_result);
				if(n_kernel_result != CL_SUCCESS) {
					if(b_verbose) {
						fprintf(stderr, "error: failed to create OpenCL kernel \'%s\' (%d)\n",
							p_kernel_bindings[i].p_s_kernel_name, n_kernel_result);
					}
					return false;
				}
			}
		}
		// load kernel bindings

		m_b_built = true;
		return true;
	}

	/**
	 *	@brief enqueues the tile exclusive scan kernel
	 *
	 *	One workgroup is launched for each (possibly incomplete) tile of the input.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer passed as the first kernel argument (the scans)
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be scanned)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with scan enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileExScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_exscan_kernel != cl_kernel(0)); // built for scan?
		_ASSERTE(n_elem_num <= INT_MAX);

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size; // round up
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_num * n_group_size;
		// a workgroup per tile

		return clCall1D3Ex(h_cmd_queue, m_h_tile_exscan_kernel,
			n_total_size, n_group_size, dp_scan, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the tile inclusive scan kernel
	 *
	 *	One workgroup is launched for each (possibly incomplete) tile of the input.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer passed as the first kernel argument (the scans)
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be scanned)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with scan enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileInScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_inscan_kernel != cl_kernel(0)); // built for scan?
		_ASSERTE(n_elem_num <= INT_MAX);

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size; // round up
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_num * n_group_size;
		// a workgroup per tile

		return clCall1D3Ex(h_cmd_queue, m_h_tile_inscan_kernel,
			n_total_size, n_group_size, dp_scan, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the tile exclusive scan downsweep kernel
	 *
	 *	One workgroup is launched for each (possibly incomplete) tile of the input.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer passed as the first kernel argument
	 *	@param[in] dp_spine is buffer passed as the second kernel argument
	 *		(presumably the per-tile carry-in values; confirm against the kernel source)
	 *	@param[in] dp_data is buffer passed as the third kernel argument
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with scan enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileExScan_Downsweep(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_spine, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_exscan_ds_kernel != cl_kernel(0)); // built for scan?
		_ASSERTE(n_elem_num <= INT_MAX);

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size; // round up
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_num * n_group_size;
		// a workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_exscan_ds_kernel,
			n_total_size, n_group_size, dp_scan, dp_spine, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the tile inclusive scan downsweep kernel
	 *
	 *	One workgroup is launched for each (possibly incomplete) tile of the input.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer passed as the first kernel argument
	 *	@param[in] dp_spine is buffer passed as the second kernel argument
	 *		(presumably the per-tile carry-in values; confirm against the kernel source)
	 *	@param[in] dp_data is buffer passed as the third kernel argument
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with scan enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileInScan_Downsweep(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_spine, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_inscan_ds_kernel != cl_kernel(0)); // built for scan?
		_ASSERTE(n_elem_num <= INT_MAX);

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size; // round up
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_num * n_group_size;
		// a workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_inscan_ds_kernel,
			n_total_size, n_group_size, dp_scan, dp_spine, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the tile reduction kernel, one tile per workgroup
	 *
	 *	This uses the kernel without the finalizing operation.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer passed as the first kernel argument (the tile reductions)
	 *	@param[in] n_reduce_size is number of tile reductions; must equal
	 *		the number of tiles (n_elem_num divided by the tile size, rounded up)
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be reduced)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with reduce enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileReduce_NoFinalOp(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_reduce_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX);
		_ASSERTE((n_elem_num + m_n_tile_size - 1) / m_n_tile_size == n_reduce_size); // make sure the reduce array is big enough

		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_group_size * n_reduce_size;
		// a workgroup per reduction

		return clCall1D3Ex(h_cmd_queue, m_h_tile_reduce_kernel, n_total_size,
			n_group_size, dp_reduce, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the tile reduction kernel, several tiles per workgroup
	 *
	 *	This does not call the finalizing operation on the elements before
	 *	writing them out to the results buffer.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer passed as the first kernel argument (the reductions)
	 *	@param[in] n_reduce_size is number of reductions; must equal n_elem_num divided
	 *		by the tile size times n_tiles_per_workgroup, rounded up
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be reduced)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *	@param[in] n_tiles_per_workgroup is number of tiles handled
	 *		by each workgroup (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with reduce enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileReduce_NoFinalOp(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data,
		size_t n_elem_num, size_t n_tiles_per_workgroup)
	{
		_ASSERTE(m_h_multitile_reduce_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX);
		_ASSERTE(n_tiles_per_workgroup <= INT_MAX);
		_ASSERTE((n_elem_num + m_n_tile_size * n_tiles_per_workgroup - 1) /
			(m_n_tile_size * n_tiles_per_workgroup) == n_reduce_size); // make sure the reduce array is big enough

		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_group_size * n_reduce_size;
		// a workgroup per reduction

		return clCall1D4Ex(h_cmd_queue, m_h_multitile_reduce_kernel, n_total_size,
			n_group_size, dp_reduce, dp_data, int(n_elem_num), int(n_tiles_per_workgroup));
	}

	/**
	 *	@brief enqueues the finalizing tile reduction kernel, one tile per workgroup
	 *
	 *	Unlike Enqueue_TileReduce_NoFinalOp(), this launches the kernel built with
	 *	the finalizing operation (if the finalizing operation is an identity,
	 *	Compile() binds the plain reduction kernel here instead).
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer passed as the first kernel argument (the tile reductions)
	 *	@param[in] n_reduce_size is number of tile reductions; must equal
	 *		the number of tiles (n_elem_num divided by the tile size, rounded up)
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be reduced)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with reduce enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileReduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num)
	{
		_ASSERTE(m_h_tile_reduce_finalop_kernel != cl_kernel(0)); // built for reduce? (check the kernel that is actually launched)
		_ASSERTE(n_elem_num <= INT_MAX);
		_ASSERTE(n_reduce_size == (n_elem_num + m_n_tile_size - 1) / m_n_tile_size); // make sure the reduce array is big enough

		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = n_reduce_size * n_local_work_size;
		// a workgroup per reduction

		return clCall1D3Ex(h_cmd_queue, m_h_tile_reduce_finalop_kernel, n_global_work_size,
			n_local_work_size, dp_reduce, dp_data, int(n_elem_num));
	}

	/**
	 *	@brief enqueues the finalizing tile reduction kernel, several tiles per workgroup
	 *
	 *	Unlike Enqueue_TileReduce_NoFinalOp(), this launches the kernel built with
	 *	the finalizing operation (if the finalizing operation is an identity,
	 *	Compile() binds the plain multi-tile reduction kernel here instead).
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer passed as the first kernel argument (the reductions)
	 *	@param[in] n_reduce_size is number of reductions; must equal n_elem_num divided
	 *		by the tile size times n_tiles_per_workgroup, rounded up
	 *	@param[in] dp_data is buffer passed as the second kernel argument (the data to be reduced)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *	@param[in] n_tiles_per_workgroup is number of tiles handled
	 *		by each workgroup (must not exceed INT_MAX)
	 *
	 *	@return Returns the result of the kernel call.
	 *
	 *	@note The kernels must have been compiled with reduce enabled (see Compile()).
	 */
	CCLKernelCall Enqueue_TileReduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data,
		size_t n_elem_num, size_t n_tiles_per_workgroup)
	{
		_ASSERTE(m_h_multitile_reduce_finalop_kernel != cl_kernel(0)); // built for reduce? (check the kernel that is actually launched)
		_ASSERTE(n_elem_num <= INT_MAX);
		_ASSERTE(n_tiles_per_workgroup <= INT_MAX);
		_ASSERTE(n_reduce_size == (n_elem_num + m_n_tile_size * n_tiles_per_workgroup - 1) /
			(m_n_tile_size * n_tiles_per_workgroup)); // make sure the reduce array is big enough

		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = n_reduce_size * n_local_work_size;
		// a workgroup per reduction

		return clCall1D4Ex(h_cmd_queue, m_h_multitile_reduce_finalop_kernel, n_global_work_size,
			n_local_work_size, dp_reduce, dp_data, int(n_elem_num), int(n_tiles_per_workgroup));
	}

	/**
	 *	@brief compiles the kernels with the default settings and benchmarks them
	 *
	 *	This uses a default-constructed reduction configuration and the default
	 *	workgroup and tile sizes, and builds both the scan and the reduce kernels.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device
	 *	@param[in] b_verbose is verbosity flag for the compilation
	 *
	 *	@return Returns false if the compilation failed, otherwise returns
	 *		the result of the non-static Benchmark().
	 */
	static bool Benchmark(cl_command_queue h_cmd_queue, cl_context h_context,
		cl_device_id h_device, bool b_verbose = false)
	{
		CCLReductionConfig default_config;
		CCLTiled_ReduceScan_Impl impl;
		if(!impl.Compile(h_context, h_device, default_config, true, true, b_verbose))
			return false;
		// need the kernels first

		return impl.Benchmark(h_cmd_queue, h_context);
	}

	/**
	 *	@brief benchmarks the compiled tile kernels and verifies their results
	 *
	 *	For a series of input sizes, this fills a buffer with the shuffled numbers
	 *	1 to n (as uint32_t), times the tile inclusive and exclusive scan kernels
	 *	and the tile reduce kernel (whichever were built) and compares their outputs
	 *	to a reference calculated on CPU. The timings are printed to stdout,
	 *	errors are printed to stderr.
	 *
	 *	@param[in] h_cmd_queue is command queue (not released by this function)
	 *	@param[in] h_context is OpenCL context (not released by this function)
	 *
	 *	@return Returns false if the kernels are not compiled or if a buffer allocation,
	 *		a kernel launch or a queue finish failed, otherwise returns true.
	 *
	 *	@note This returns true even if the results were found to be incorrect
	 *		or could not be compared; that is only reported to stderr.
	 *	@note The CPU reference sums uint32_t elements, the kernels are therefore
	 *		expected to be compiled with the default CCLReductionConfig.
	 */
	bool Benchmark(cl_command_queue h_cmd_queue, cl_context h_context)
	{
		if(!m_b_built)
			return false;

		CCLContextWrapper context(h_context);
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		bool b_results_correct = true;
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200,
			1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		const size_t n_size_num = sizeof(p_size) / sizeof(p_size[0]);
		if(m_h_tile_inscan_kernel != cl_kernel(0) && m_h_tile_exscan_kernel != cl_kernel(0)) {
			for(int n_pass = 0; n_pass < 2; ++ n_pass) {
				bool b_inclusive = !n_pass;
				// the first pass tests inclusive scan, the second one exclusive scan

				for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
					size_t n = p_size[n_test];

					printf("preparing data ...\r");

					std::vector<uint32_t> scan_data(n);
					for(size_t i = 0; i < n; ++ i)
						scan_data[i] = (uint32_t)(i + 1);
					for(size_t i = 0; i < n; ++ i)
						std::swap(scan_data[i], scan_data[CUniformIntegerDistribution<size_t>(i, n - 1)(CCLibGenerator<false>())]);
					// generate some data

					CCLUniqueMem dp_data_buffer, dp_scan_buffer;
					if(!(dp_data_buffer = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
					   !(dp_scan_buffer = context.h_CreateBuffer(n * sizeof(uint32_t)))) {
						fprintf(stderr, "error: failed to alloc device buffer\n");
						return false;
					}
					// allocate memory

					printf("running tile %s-scan test ...  \r", (b_inclusive)? "in" : "ex");

					CTimer test_timer;
					double f_time = 0;
					int n_pass_num = 0;
					for(;;) {
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_data_buffer, 0,
							&scan_data[0], scan_data.size() * sizeof(uint32_t));
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_scan_buffer, 0,
							&scan_data[0], scan_data.size() * sizeof(uint32_t)); // overwrite the results of the previous pass as well
						CLresult n_result0 = cmd_queue.n_Finish();
						if(n_result0) {
							fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
							return false;
						}
						// prepare data ...

						double f_start_time = test_timer.f_Time();

						{
							CLresult n_result = (b_inclusive)?
								Enqueue_TileInScan(cmd_queue, dp_scan_buffer, dp_data_buffer, n) :
								Enqueue_TileExScan(cmd_queue, dp_scan_buffer, dp_data_buffer, n);
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
						}

						CLresult n_result = cmd_queue.n_Finish();
						if(n_result) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}

						double f_pass_time = test_timer.f_Time() - f_start_time;
						f_time += f_pass_time;
						++ n_pass_num;

						if((f_time > .5f && n_pass_num > 10) || f_time > 4)
							break;
						// make sure the timing is stable, don't take too long at the same time
					}
					// run the thing

					f_time /= n_pass_num;
					size_t n_data = 2 * scan_data.size() * sizeof(uint32_t); // read data, write scans
					double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
					printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps);
					// print results

					std::vector<uint32_t> tile_scan_cpu(n); // could work inplace but we still need a buffer for getting the GPU result(s)
					CTimer tcpu;
					if(b_inclusive) {
						for(size_t i = 0; i < n;) {
							uint32_t n_partial = 0;
							for(size_t e = std::min(n, i + m_n_tile_size); i < e; ++ i) {
								n_partial += scan_data[i];
								tile_scan_cpu[i] = n_partial; // inclusive
							}
						}
					} else {
						for(size_t i = 0; i < n;) {
							uint32_t n_partial = 0;
							for(size_t e = std::min(n, i + m_n_tile_size); i < e; ++ i) {
								tile_scan_cpu[i] = n_partial; // exclusive
								n_partial += scan_data[i];
							}
						}
					}
					printf("tile %s-scan takes %f msec on CPU\n",
						(b_inclusive)? "in" : "ex", tcpu.f_Time() * 1000);
					// calculate the reference on CPU (the scan restarts at the beginning of each tile)

					bool b_test_correct;
					if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, tile_scan_cpu.begin(),
					   tile_scan_cpu.end(), dp_scan_buffer, 0, (b_inclusive)? "tile in-scan" :
					   "tile ex-scan") != cl_Success) {
						fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
						b_results_correct = false; // could not verify, must not report success below
						break;
					}
					// reusable function, uses clnqueueMapBuffer()

					if(b_test_correct)
						printf("done. tile %s-scan of " PRIsize " items succeeded\n", (b_inclusive)? "in" : "ex", n);
					else {
						b_results_correct = false;
						break;
					}
					// make sure it is scanned correctly
				}
			}
		}
		if(m_h_tile_reduce_kernel) {
			for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)(i + 1);
				for(size_t i = 0; i < n; ++ i)
					std::swap(scan_data[i], scan_data[CUniformIntegerDistribution<size_t>(i, n - 1)(CCLibGenerator<false>())]);
				// generate some data

				CCLUniqueMem dp_data_buffer, dp_reductions_buffer;
				size_t n_buffer_size_elems = n;
				size_t n_reductions_elems = (n + m_n_tile_size - 1) / m_n_tile_size; // one reduction per tile
				if(!(dp_data_buffer = context.h_CreateBuffer(n_buffer_size_elems * sizeof(uint32_t))) ||
				   !(dp_reductions_buffer = context.h_CreateBuffer(n_reductions_elems * sizeof(uint32_t)))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory

				printf("running tile reduce test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_data_buffer, 0, &scan_data[0],
						scan_data.size() * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_reductions_buffer, 0, &scan_data[0],
						n_reductions_elems * sizeof(uint32_t)); // overwrite the results of the previous pass as well
					CLresult n_result0 = cmd_queue.n_Finish();
					if(n_result0) {
						fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
						return false;
					}
					// prepare data ... (checked the same way as in the scan test above)

					double f_start_time = test_timer.f_Time();

					{
						CLresult n_result = Enqueue_TileReduce(cmd_queue,
							dp_reductions_buffer, n_reductions_elems, dp_data_buffer, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					CLresult n_result = cmd_queue.n_Finish();
					if(n_result) {
						fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						return false;
					}

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
				}
				// run the thing

				f_time /= n_pass_num;
				size_t n_data = 1 * scan_data.size() * sizeof(uint32_t); // ignores writing tile reductions
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s\n",
					PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps);
				// print results

				std::vector<uint32_t> tile_reduce_cpu(n_reductions_elems);
				CTimer tcpu;
				for(size_t i = 0, b = 0; i < n; ++ b) {
					uint32_t n_partial = 0;
					for(size_t e = std::min(n, i + m_n_tile_size); i < e; ++ i)
						n_partial += scan_data[i];
					tile_reduce_cpu[b] = n_partial;
				}
				printf("tile reduce takes %f msec on CPU\n", tcpu.f_Time() * 1000);
				// calculate the reference on CPU (a sum of each tile)

				bool b_test_correct;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, tile_reduce_cpu.begin(),
				   tile_reduce_cpu.end(), dp_reductions_buffer, 0, "tile reduce") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // could not verify, must not report success below
					break;
				}
				// reusable function, uses clnqueueMapBuffer()

				if(b_test_correct)
					printf("done. tile reduce of " PRIsize " items succeeded\n", n);
				else {
					b_results_correct = false;
					break;
				}
				// make sure it is reduced correctly
			}
		}
		if(b_results_correct)
			printf("all tests finished correctly\n");
		else
			fprintf(stderr, "error: there were some errors\n");

		return true;
	}
};

class CCLTiled_SegmentedReduceScan_Impl {
protected:
	size_t m_n_workgroup_size; // workgroup size, in work-items, e.g. 256 (the constructor default is 128)
	size_t m_n_tile_size; // number of elements processed by a single workgroup, e.g. 2048 (the constructor default is 1024)
	bool m_b_strided_head_flags; // if set, the kernels are compiled with -DUSE_STRIDED_PACKED_HEAD_FLAGS
	bool m_b_built; // set by a successful Compile(), cleared by Set_WorkGroupSize_TileSize()

	CCLUniqueProgram m_program_seg; // the program all the kernels below are taken from
	CCLUniqueKernel m_h_tile_pack_flags_kernel; // pack head flags ("Pack_HeadFlags")

	// segmented scan kernels, loaded if Compile() is called with b_want_scan
	CCLUniqueKernel m_h_tile_segscan_bc_kernel; // tile carry
	CCLUniqueKernel m_h_tile_segscan_ds_kernel; // downsweep
	CCLUniqueKernel m_h_tile_segscan_bc_phf_kernel; // tile carry, pack head flags
	CCLUniqueKernel m_h_tile_segscan_bc_pphf_kernel; // tile carry, pre-packed head flags
	CCLUniqueKernel m_h_tile_segscan_ds_pphf_kernel; // downsweep, pre-packed head flags
	CCLUniqueKernel m_h_tile_segscan_bc_pphfno_kernel; // tile carry, pre-packed head flags in natural order (equal to m_h_tile_segscan_bc_pphf_kernel unless m_b_strided_head_flags is set)
	CCLUniqueKernel m_h_tile_segscan_ds_pphfno_kernel; // downsweep, pre-packed head flags in natural order (equal to m_h_tile_segscan_ds_pphf_kernel unless m_b_strided_head_flags is set)
	CCLUniqueKernel m_h_tile_segscan_kernel; // single tile
	CCLUniqueKernel m_h_tile_segscan_pphf_kernel; // single tile, pre-packed head flags
	CCLUniqueKernel m_h_tile_segscan_pphfno_kernel; // single tile, pre-packed head flags in natural order (equal to m_h_tile_segscan_pphf_kernel unless m_b_strided_head_flags is set)

	// segmented reduce kernels, loaded if Compile() is called with b_want_reduce
	CCLUniqueKernel m_h_tile_segreduce_bs_kernel; // "TileSegReduce_Bootstrap"
	CCLUniqueKernel m_h_tile_segreduce_bs_phf_kernel; // "TileSegReduce_Bootstrap_Pack"
	CCLUniqueKernel m_h_tile_segreduce_bs_pphf_kernel; // "TileSegReduce_Bootstrap_Packed"
	CCLUniqueKernel m_h_tile_segreduce_bs_pphfno_kernel; // "TileSegReduce_Bootstrap_PackedNatural" if m_b_strided_head_flags is set, otherwise "TileSegReduce_Bootstrap_Packed"

	CCLUniqueKernel m_h_tile_segreduce_kernel; // "TileSegReduce"
	CCLUniqueKernel m_h_tile_segreduce_pphf_kernel; // "TileSegReduce_Packed"
	CCLUniqueKernel m_h_tile_segreduce_pphfno_kernel; // "TileSegReduce_PackedNatural" if m_b_strided_head_flags is set, otherwise "TileSegReduce_Packed"

	CCLUniqueKernel m_h_tile_segreduce_s_kernel; // "TileSegReduceSingle"
	CCLUniqueKernel m_h_tile_segreduce_s_pphf_kernel; // "TileSegReduceSingle_Packed"
	CCLUniqueKernel m_h_tile_segreduce_s_pphfno_kernel; // "TileSegReduceSingle_PackedNatural" if m_b_strided_head_flags is set, otherwise "TileSegReduceSingle_Packed"

	// segmented reduce spine adjust kernels, loaded if Compile() is called with b_want_reduce_spine_adjust
	CCLUniqueKernel m_h_tile_segreduce_spine_kernel; // "TileSegReduce_SpineAdjust"
	CCLUniqueKernel m_h_tile_segreduce_spine_pphf_kernel; // "TileSegReduce_SpineAdjust_Packed"
	CCLUniqueKernel m_h_tile_segreduce_spine_pphfno_kernel; // "TileSegReduce_SpineAdjust_PackedNatural" if m_b_strided_head_flags is set, otherwise "TileSegReduce_SpineAdjust_Packed"

	CCLUniqueKernel m_h_tile_segreduce_spine_ds_pphf_kernel; // "TileSegReduce_SpineAdjust_Downsweep_Packed"
	CCLUniqueKernel m_h_tile_segreduce_spine_ds_pphfno_kernel; // never loaded, its binding in Compile() is commented out

public:
	/**
	 *	@brief default constructor; stores the configuration, does not compile anything
	 *
	 *	@param[in] n_workgroup_size is workgroup size, in work-items
	 *	@param[in] n_tile_size is tile size, in elements
	 *	@param[in] b_strided_head_flags is strided packed head flags flag
	 *
	 *	@note The kernels need to be built using Compile() before use.
	 */
	CCLTiled_SegmentedReduceScan_Impl(size_t n_workgroup_size = 128,
		size_t n_tile_size = 1024, bool b_strided_head_flags = false)
		:m_n_workgroup_size(n_workgroup_size),
		m_n_tile_size(n_tile_size),
		m_b_strided_head_flags(b_strided_head_flags),
		m_b_built(false) // nothing compiled yet
	{}

	/**
	 *	@brief changes the kernel configuration
	 *
	 *	@param[in] n_workgroup_size is workgroup size, in work-items
	 *	@param[in] n_tile_size is tile size, in elements
	 *	@param[in] b_strided_head_flags is strided packed head flags flag
	 *
	 *	@note This clears the built flag, Compile() needs to be called
	 *		again for the new configuration to take effect.
	 */
	void Set_WorkGroupSize_TileSize(size_t n_workgroup_size = 128,
		size_t n_tile_size = 1024, bool b_strided_head_flags = false)
	{
		m_b_built = false;
		// the kernels compiled so far (if any) do not match the new configuration

		m_b_strided_head_flags = b_strided_head_flags;
		m_n_tile_size = n_tile_size;
		m_n_workgroup_size = n_workgroup_size;
		// remember the new configuration
	}

	/**
	 *	@brief gets the built flag
	 *	@return Returns the value of the built flag, which is set by a successful
	 *		Compile() and cleared by Set_WorkGroupSize_TileSize().
	 */
	bool b_Status() const
	{
		return m_b_built;
	}

	/**
	 *	@brief gets the configured tile size
	 *	@return Returns the tile size, in elements (passed to the kernels as SCAN_TILE_SIZE).
	 */
	inline size_t n_Tile_Size() const
	{
		return m_n_tile_size;
	}

	/**
	 *	@brief gets the configured workgroup size
	 *	@return Returns the workgroup size (passed to the kernels as SCAN_LOCAL_WORK_SIZE
	 *		and used as the local work size of all the kernel calls).
	 */
	inline size_t n_WorkGroup_Size() const
	{
		return m_n_workgroup_size;
	}

	/**
	 *	@brief gets the strided packed head flags setting
	 *	@return Returns true if the kernels are (to be) compiled with
	 *		USE_STRIDED_PACKED_HEAD_FLAGS defined, otherwise returns false.
	 */
	inline bool b_Strided_HeadFlags() const
	{
		return m_b_strided_head_flags;
	}

	/**
	 *	@brief compiles the segmented scan / reduce kernels with the specified reduction settings
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] r_config is reference to scan configuration (must not have a finalization
	 *		operation, segmented scan / reduce does not support it)
	 *	@param[in] b_want_pack is head flags packing kernel build flag
	 *	@param[in] b_want_scan is segmented scan kernel build flag
	 *	@param[in] b_want_reduce is segmented reduction kernel build flag
	 *	@param[in] b_want_reduce_spine_adjust is segmented reduction spine adjust kernel build flag
	 *	@param[in] b_verbose is verbosity flag (set to enable verbose)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (e.g. nvcc reports numbers of registers)
	 *	@param[in] b_use_nv_shuffle is NVIDIA specific shuffle instruction enable flag
	 *		(cleared automatically for devices that do not support it)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note If the kernels are already built and all the requested kernel types are
	 *		available, this function has no effect and returns true. Otherwise the program
	 *		is rebuilt with only the requested kernel types; the kernels of the types
	 *		that were not requested are reset to null.
	 *	@note At least one of the b_want_* flags must be set (asserted in debug).
	 *	@note If the rebuild fails, the built flag is cleared (b_Status() returns false).
	 */
	bool Compile(cl_context h_context, cl_device_id h_device,
		const CCLReductionConfig &r_config,
		bool b_want_pack, bool b_want_scan, bool b_want_reduce,
		bool b_want_reduce_spine_adjust, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true)
	{
		bool b_have_pack = m_h_tile_pack_flags_kernel.h_Get() != 0;
		bool b_have_scan = m_h_tile_segscan_kernel.h_Get() != 0;
		bool b_have_reduce = m_h_tile_segreduce_kernel.h_Get() != 0;
		bool b_have_reduce_spine_adjust = m_h_tile_segreduce_spine_kernel.h_Get() != 0;
		if(m_b_built && (b_have_pack || !b_want_pack) && (b_have_scan || !b_want_scan) &&
		   (b_have_reduce || !b_want_reduce) && (b_have_reduce_spine_adjust || !b_want_reduce_spine_adjust))
			return true;
		// already compiled and have all the kernels the caller wants

		_ASSERTE(b_want_pack || b_want_scan || b_want_reduce || b_want_reduce_spine_adjust);
		// make sure we want something

		if(r_config.b_Has_FinalizeOp()) {
			if(b_verbose)
				fprintf(stderr, "error: segmented scan / reduce does not support finalization op\n");
			return false;
		}
		// segmented scan/reduce does not support final op

		//if(!b_Is_POT(m_n_workgroup_size))
		//	b_use_nv_shuffle = false;
		// wouldn't work with NPOT workgroups

		if(b_use_nv_shuffle) {
			CCLDeviceParams dev(h_device);
			if(!dev.b_Is_NVIDIA() || dev.n_NV_ComputeCap_Major() < 3)
				b_use_nv_shuffle = false; // not supported before 3.0 devices
		}

		std::string s_preprocessor;
		if(!stl_ut::Format(s_preprocessor,
		   "-DSCAN_SCALAR_TYPE=%s "
		   "-DSCAN_SCALAR_SIZE=%d "
		   "-DSCAN_LOCAL_WORK_SIZE=" PRIsize " "
		   "-DSCAN_TILE_SIZE=" PRIsize " "
		   "-DREDUCTION_ELEM_OP=(%s) "
		   "-DREDUCTION_REDUCE_OP=(%s) "
		   "-DREDUCTION_REDUCE_OPERATOR=%d " // using \'%c\' does not work on windows 7
		   "-DREDUCTION_FINAL_OP=(%s) "
		   "-DREDUCTION_IDENTITY=%s "
		   "%s " // "-cl-nv-verbose"
		   "%s " // "-DDISABLE_NV_SHFL"
		   "%s " // "-DBUILD_PACK_FLAGS_KERNELS"
		   "%s " // "-DBUILD_SEG_SCAN_KERNELS"
		   "%s " // "-DBUILD_SEG_REDUCE_KERNELS"
		   "%s " // "-DBUILD_SEG_REDUCE_SPINE_ADJUST_KERNELS"
		   "%s ", // "-DUSE_STRIDED_PACKED_HEAD_FLAGS"
		   r_config.p_s_DaraType(),
		   r_config.n_DataType_Size(),
		   m_n_workgroup_size,
		   m_n_tile_size,
		   r_config.p_s_ElementOp(),
		   r_config.p_s_ReduceOp(),
		   (char)r_config.n_Reduce_Operator(),
		   r_config.p_s_FinalizeOp(),
		   r_config.p_s_IdentityValue(),
		   (b_compiler_verbose)? "-cl-nv-verbose" : "",
		   (!b_use_nv_shuffle)? "-DDISABLE_NV_SHFL" : "",
		   (b_want_pack)? "-DBUILD_PACK_FLAGS_KERNELS" : "",
		   (b_want_scan)? "-DBUILD_SEG_SCAN_KERNELS" : "",
		   (b_want_reduce)? "-DBUILD_SEG_REDUCE_KERNELS" : "",
		   (b_want_reduce_spine_adjust)? "-DBUILD_SEG_REDUCE_SPINE_ADJUST_KERNELS" : "",
		   (m_b_strided_head_flags)? "-DUSE_STRIDED_PACKED_HEAD_FLAGS" : ""))
			return false;
		// build the preprocessor string

		m_b_built = false;
		// from now on the program and the kernels are being replaced; should any of the
		// steps below fail, do not report a (partially overwritten) older build as valid

		if(b_verbose)
			printf("loading \'%s\' ... ", "compressed kernels");
		m_program_seg = CCLUniqueProgram(h_context, CSegScanReduce(),
			CCLUniqueProgram::from_compressed, s_preprocessor.c_str(),
			"%temp_default%SegScanKernels", 64);
		CLresult n_result = m_program_seg.n_Status();
		if(b_verbose)
			m_program_seg.Dump_StatusWord(); // see the result
		if(n_result != CL_SUCCESS) {
			if(b_verbose)
				fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		// compile program

		enum {
			flag_Pack = 1,
			flag_Scan = 2,
			flag_Reduce = 4,
			flag_Reduce_SpineAdj = 8
		};

		int n_wanted_flags = ((b_want_pack)? flag_Pack : 0) | ((b_want_scan)? flag_Scan : 0) |
			((b_want_reduce)? flag_Reduce : 0) | ((b_want_reduce_spine_adjust)? flag_Reduce_SpineAdj : 0);
		// which kernel types were compiled into the program

		{
			const struct TKernelBinding {
				const char *p_s_kernel_name; // name of the kernel function in the program
				CCLUniqueKernel &r_kernel; // the member to store the kernel in
				int n_kernel_type_flags; // one of flag_*, tells which build option enables this kernel

				inline TKernelBinding(const char *_p_s_kernel_name,
					CCLUniqueKernel &_r_kernel, int _n_kernel_type_flags)
					:p_s_kernel_name(_p_s_kernel_name), r_kernel(_r_kernel),
					n_kernel_type_flags(_n_kernel_type_flags)
				{}
			} p_kernel_bindings[] = {
				TKernelBinding("Pack_HeadFlags", m_h_tile_pack_flags_kernel, flag_Pack),

				TKernelBinding("TileSegScan_Carry", m_h_tile_segscan_bc_kernel, flag_Scan),
				TKernelBinding("TileSegScan_Carry_Pack", m_h_tile_segscan_bc_phf_kernel, flag_Scan),
				TKernelBinding("TileSegScan_Carry_Packed", m_h_tile_segscan_bc_pphf_kernel, flag_Scan),
				TKernelBinding((m_b_strided_head_flags)? "TileSegScan_Carry_PackedNatural" :
					"TileSegScan_Carry_Packed", m_h_tile_segscan_bc_pphfno_kernel, flag_Scan),

				TKernelBinding("TileSegScan", m_h_tile_segscan_kernel, flag_Scan),
				TKernelBinding("TileSegScan_Packed", m_h_tile_segscan_pphf_kernel, flag_Scan),
				TKernelBinding((m_b_strided_head_flags)? "TileSegScan_PackedNatural" :
					"TileSegScan_Packed", m_h_tile_segscan_pphfno_kernel, flag_Scan),

				TKernelBinding("TileSegScan_Downsweep", m_h_tile_segscan_ds_kernel, flag_Scan),
				TKernelBinding("TileSegScan_Downsweep_Packed", m_h_tile_segscan_ds_pphf_kernel, flag_Scan),
				TKernelBinding((m_b_strided_head_flags)? "TileSegScan_Downsweep_PackedNatural" :
					"TileSegScan_Downsweep_Packed", m_h_tile_segscan_ds_pphfno_kernel, flag_Scan),

				TKernelBinding("TileSegReduce_Bootstrap", m_h_tile_segreduce_bs_kernel, flag_Reduce),
				TKernelBinding("TileSegReduce_Bootstrap_Pack", m_h_tile_segreduce_bs_phf_kernel, flag_Reduce),
				TKernelBinding("TileSegReduce_Bootstrap_Packed", m_h_tile_segreduce_bs_pphf_kernel, flag_Reduce),
				TKernelBinding((m_b_strided_head_flags)? "TileSegReduce_Bootstrap_PackedNatural" :
					"TileSegReduce_Bootstrap_Packed", m_h_tile_segreduce_bs_pphfno_kernel, flag_Reduce),

				TKernelBinding("TileSegReduce", m_h_tile_segreduce_kernel, flag_Reduce),
				TKernelBinding("TileSegReduce_Packed", m_h_tile_segreduce_pphf_kernel, flag_Reduce),
				TKernelBinding((m_b_strided_head_flags)? "TileSegReduce_PackedNatural" :
					"TileSegReduce_Packed", m_h_tile_segreduce_pphfno_kernel, flag_Reduce),

				TKernelBinding("TileSegReduceSingle", m_h_tile_segreduce_s_kernel, flag_Reduce),
				TKernelBinding("TileSegReduceSingle_Packed", m_h_tile_segreduce_s_pphf_kernel, flag_Reduce),
				TKernelBinding((m_b_strided_head_flags)? "TileSegReduceSingle_PackedNatural" :
					"TileSegReduceSingle_Packed", m_h_tile_segreduce_s_pphfno_kernel, flag_Reduce),

				TKernelBinding("TileSegReduce_SpineAdjust", m_h_tile_segreduce_spine_kernel, flag_Reduce_SpineAdj),
				TKernelBinding("TileSegReduce_SpineAdjust_Packed", m_h_tile_segreduce_spine_pphf_kernel, flag_Reduce_SpineAdj),
				TKernelBinding((m_b_strided_head_flags)? "TileSegReduce_SpineAdjust_PackedNatural" :
					"TileSegReduce_SpineAdjust_Packed", m_h_tile_segreduce_spine_pphfno_kernel, flag_Reduce_SpineAdj),

				TKernelBinding("TileSegReduce_SpineAdjust_Downsweep_Packed", m_h_tile_segreduce_spine_ds_pphf_kernel, flag_Reduce_SpineAdj),
				//TKernelBinding((m_b_strided_head_flags)? "TileSegReduce_SpineAdjust_Downsweep_PackedNatural" :
				//	"TileSegReduce_SpineAdjust_Downsweep_Packed", m_h_tile_segreduce_spine_ds_pphfno_kernel, flag_Reduce_SpineAdj), // probably not, the inputs are never given by the "user"
			};
			for(size_t i = 0, n = sizeof(p_kernel_bindings) / sizeof(p_kernel_bindings[0]); i < n; ++ i) {
				if(!(p_kernel_bindings[i].n_kernel_type_flags & n_wanted_flags)) {
					p_kernel_bindings[i].r_kernel = cl_kernel(0); // not compiled in this program, make sure a stale kernel is not used
					continue;
				}
				CLresult n_kernel_result;
				p_kernel_bindings[i].r_kernel =
					m_program_seg.h_Get_Kernel(p_kernel_bindings[i].p_s_kernel_name, n_kernel_result);
				if(n_kernel_result != CL_SUCCESS) {
					if(b_verbose) {
						fprintf(stderr, "error: failed to create OpenCL kernel \'%s\' (%d)\n",
							p_kernel_bindings[i].p_s_kernel_name, n_kernel_result);
					}
					return false;
				}
			}
		}
		// load kernel bindings

		m_b_built = true;
		return true;
	}

	/**
	 *	@brief sets up a 1D call of the "Pack_HeadFlags" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() flags.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (1st kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (2nd kernel argument)
	 *	@param[in] n_flag_num is number of head flags (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D3Ex().
	 */
	CCLKernelCall Enqueue_Pack_HeadFlags(cl_command_queue h_cmd_queue,
		cl_mem dp_packed_head_flags, const cl_mem dp_head_flags, size_t n_flag_num) const
	{
		_ASSERTE(m_h_tile_pack_flags_kernel != cl_kernel(0)); // built with b_want_pack?
		_ASSERTE(n_flag_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_flag_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D3Ex(h_cmd_queue, m_h_tile_pack_flags_kernel,
			n_total_size, n_group_size,
			dp_packed_head_flags, dp_head_flags, int(n_flag_num));
	}

	// ---- segmented ops ----

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Carry" kernel
	 *
	 *	This only calculates tile carry and reduced tile head flags
	 *	(for calculating the segmented scan of the carry).
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_carry is tile carry buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_data is data buffer (3rd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Carry(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_carry, cl_mem dp_tile_head_flags,
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_bc_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_bc_kernel,
			n_total_size, n_group_size, dp_tile_carry,
			dp_tile_head_flags, dp_data, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Carry_Pack" kernel
	 *		(tile carry, pack head flags)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_carry is tile carry buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] dp_data is data buffer (4th kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (5th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D6Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Carry_Pack(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_carry, cl_mem dp_tile_head_flags, cl_mem dp_packed_head_flags,
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_bc_phf_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D6Ex(h_cmd_queue, m_h_tile_segscan_bc_phf_kernel,
			n_total_size, n_group_size, dp_tile_carry,
			dp_tile_head_flags, dp_packed_head_flags, dp_data, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Carry_Packed" kernel
	 *		(tile carry, pre-packed head flags)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_carry is tile carry buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_data is data buffer (3rd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Carry_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_carry, cl_mem dp_tile_head_flags,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_bc_pphf_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_bc_pphf_kernel,
			n_total_size, n_group_size, dp_tile_carry,
			dp_tile_head_flags, dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the tile carry kernel which takes
	 *		pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegScan_Carry_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is the same as in Enqueue_TileSegScan_Carry_Packed().
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_carry is tile carry buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_data is data buffer (3rd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Carry_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_carry, cl_mem dp_tile_head_flags,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_bc_pphfno_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_bc_pphfno_kernel,
			n_total_size, n_group_size, dp_tile_carry,
			dp_tile_head_flags, dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan" kernel (segmented scan of a single tile)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements. No tile carry or tile
	 *	flags are passed to this kernel.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, //cl_mem dp_tile_carry, cl_mem dp_tile_flags, // not needed
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segscan_kernel,
			n_total_size, n_group_size, dp_scan,
			dp_data, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Packed" kernel
	 *		(segmented scan of a single tile, pre-packed head flags)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, //cl_mem dp_tile_carry, cl_mem dp_tile_flags, // not needed
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_pphf_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segscan_pphf_kernel,
			n_total_size, n_group_size, dp_scan,
			dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the single tile segmented scan kernel
	 *		which takes pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegScan_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is the same as in Enqueue_TileSegScan_Packed().
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, //cl_mem dp_tile_carry, cl_mem dp_tile_flags, // not needed
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_pphfno_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segscan_pphfno_kernel,
			n_total_size, n_group_size, dp_scan,
			dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Downsweep" kernel (segmented scan downsweep)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_tile_carry is tile carry buffer (3rd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Downsweep(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, const cl_mem dp_tile_carry,
		const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_ds_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_ds_kernel,
			n_total_size, n_group_size, dp_scan, dp_data,
			dp_tile_carry, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegScan_Downsweep_Packed" kernel
	 *		(segmented scan downsweep, pre-packed head flags)
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_tile_carry is tile carry buffer (3rd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Downsweep_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, const cl_mem dp_tile_carry,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_ds_pphf_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_ds_pphf_kernel,
			n_total_size, n_group_size, dp_scan, dp_data,
			dp_tile_carry, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the segmented scan downsweep kernel
	 *		which takes pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegScan_Downsweep_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is the same as in Enqueue_TileSegScan_Downsweep_Packed().
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_scan is scan buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_tile_carry is tile carry buffer (3rd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegScan_Downsweep_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, const cl_mem dp_tile_carry,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segscan_ds_pphfno_kernel != cl_kernel(0)); // built with b_want_scan?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segscan_ds_pphfno_kernel,
			n_total_size, n_group_size, dp_scan, dp_data,
			dp_tile_carry, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_Bootstrap" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_tail_counts is tile tail counts buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_Bootstrap(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_tail_counts, cl_mem dp_tile_head_flags,
		const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_bs_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_bs_kernel,
			n_total_size, n_group_size, dp_tile_tail_counts,
			dp_tile_head_flags, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_Bootstrap_Pack" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_tail_counts is tile tail counts buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (4th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D5Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_Bootstrap_Pack(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_tail_counts, cl_mem dp_tile_head_flags, cl_mem dp_packed_head_flags,
		const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_bs_phf_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int
		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = ((n_elem_num + m_n_tile_size - 1) / m_n_tile_size) * n_local_work_size;
		// one workgroup per tile

		return clCall1D5Ex(h_cmd_queue, m_h_tile_segreduce_bs_phf_kernel,
			n_global_work_size, n_local_work_size, dp_tile_tail_counts,
			dp_tile_head_flags, dp_packed_head_flags, dp_head_flags, int(n_elem_num));
		// there are five kernel arguments, use the matching call variant
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_Bootstrap_Packed" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_tail_counts is tile tail counts buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_Bootstrap_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_tail_counts, cl_mem dp_tile_head_flags,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_bs_pphf_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_bs_pphf_kernel,
			n_total_size, n_group_size, dp_tile_tail_counts,
			dp_tile_head_flags, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the segmented reduce bootstrap kernel
	 *		which takes pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegReduce_Bootstrap_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is the same as in Enqueue_TileSegReduce_Bootstrap_Packed().
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_tile_tail_counts is tile tail counts buffer (1st kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_Bootstrap_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_tile_tail_counts, cl_mem dp_tile_head_flags,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_bs_pphfno_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_bs_pphfno_kernel,
			n_total_size, n_group_size, dp_tile_tail_counts,
			dp_tile_head_flags, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_tile_sums is tile sums buffer (2nd kernel argument)
	 *	@param[in] dp_tile_tail_counts_scan is tile tail counts scan buffer (3rd kernel argument)
	 *	@param[in] dp_data is data buffer (4th kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (5th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D6Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, cl_mem dp_tile_sums,
		const cl_mem dp_tile_tail_counts_scan, const cl_mem dp_data,
		const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D6Ex(h_cmd_queue, m_h_tile_segreduce_kernel,
			n_total_size, n_group_size, dp_reductions, dp_tile_sums,
			dp_tile_tail_counts_scan, dp_data, dp_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_Packed" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_tile_sums is tile sums buffer (2nd kernel argument)
	 *	@param[in] dp_tile_tail_counts_scan is tile tail counts scan buffer (3rd kernel argument)
	 *	@param[in] dp_data is data buffer (4th kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (5th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D6Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, cl_mem dp_tile_sums,
		const cl_mem dp_tile_tail_counts_scan, const cl_mem dp_data,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_pphf_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D6Ex(h_cmd_queue, m_h_tile_segreduce_pphf_kernel,
			n_total_size, n_group_size, dp_reductions, dp_tile_sums,
			dp_tile_tail_counts_scan, dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the segmented reduce kernel
	 *		which takes pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegReduce_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is the same as in Enqueue_TileSegReduce_Packed().
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_tile_sums is tile sums buffer (2nd kernel argument)
	 *	@param[in] dp_tile_tail_counts_scan is tile tail counts scan buffer (3rd kernel argument)
	 *	@param[in] dp_data is data buffer (4th kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (5th kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D6Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, cl_mem dp_tile_sums,
		const cl_mem dp_tile_tail_counts_scan, const cl_mem dp_data,
		const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_pphfno_kernel != cl_kernel(0)); // built with b_want_reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int

		const size_t n_tile_count = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_tile_count * n_group_size;
		// one workgroup per tile

		return clCall1D6Ex(h_cmd_queue, m_h_tile_segreduce_pphfno_kernel,
			n_total_size, n_group_size, dp_reductions, dp_tile_sums,
			dp_tile_tail_counts_scan, dp_data, dp_packed_head_flags, int(n_elem_num));
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduceSingle" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_head_flags is head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduceSingle(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_s_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int
		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = ((n_elem_num + m_n_tile_size - 1) / m_n_tile_size) * n_local_work_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_s_kernel,
			n_global_work_size, n_local_work_size, dp_reductions,
			dp_data, dp_head_flags, int(n_elem_num));
		// there are four kernel arguments, use the matching call variant
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduceSingle_Packed" kernel
	 *
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduceSingle_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_s_pphf_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int
		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = ((n_elem_num + m_n_tile_size - 1) / m_n_tile_size) * n_local_work_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_s_pphf_kernel,
			n_global_work_size, n_local_work_size, dp_reductions,
			dp_data, dp_packed_head_flags, int(n_elem_num));
		// there are four kernel arguments, use the matching call variant
	}

	/**
	 *	@brief sets up a 1D call of the single tile segmented reduce kernel
	 *		which takes pre-packed head flags in natural order
	 *
	 *	The kernel is "TileSegReduceSingle_PackedNatural" if strided head flags are
	 *	enabled, otherwise it is "TileSegReduceSingle_Packed".
	 *	The global work size amounts to one workgroup of n_WorkGroup_Size() work-items
	 *	per each (started) tile of n_Tile_Size() elements.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_data is data buffer (2nd kernel argument)
	 *	@param[in] dp_packed_head_flags is packed head flags buffer (3rd kernel argument)
	 *	@param[in] n_elem_num is number of elements (must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D4Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduceSingle_PackedNaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num) const
	{
		_ASSERTE(m_h_tile_segreduce_s_pphfno_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_elem_num <= INT_MAX); // passed to the kernel as int
		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = ((n_elem_num + m_n_tile_size - 1) / m_n_tile_size) * n_local_work_size;
		// one workgroup per tile

		return clCall1D4Ex(h_cmd_queue, m_h_tile_segreduce_s_pphfno_kernel,
			n_global_work_size, n_local_work_size, dp_reductions,
			dp_data, dp_packed_head_flags, int(n_elem_num));
		// there are four kernel arguments, use the matching call variant
	}

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_SpineAdjust" kernel
	 *
	 *	The global work size is calculated for n_tile_num - 1 items, with one workgroup
	 *	of n_WorkGroup_Size() work-items per each (started) n_Tile_Size() of them.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_tile_tail_counts_scan is tile tail counts scan buffer (2nd kernel argument)
	 *	@param[in] dp_reduction_head_flags is reduction head flags buffer (3rd kernel argument)
	 *	@param[in] n_reduction_tile_size is reduction tile size (4th kernel argument,
	 *		must not exceed INT_MAX)
	 *	@param[in] b_packed_reduction is packed reduction flag (must be nonzero, the other
	 *		mode is not supported; it is not passed to the kernel)
	 *	@param[in] dp_tile_carry is tile carry buffer (5th kernel argument)
	 *	@param[in] dp_tile_head_flags is tile head flags buffer (6th kernel argument)
	 *	@param[in] n_tile_num is number of tiles (must be greater than one
	 *		and must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D7Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_SpineAdjust(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_tile_tail_counts_scan,
		const cl_mem dp_reduction_head_flags, size_t n_reduction_tile_size, int b_packed_reduction,
		const cl_mem dp_tile_carry, const cl_mem dp_tile_head_flags, size_t n_tile_num) const
	{
		_ASSERTE(m_h_tile_segreduce_spine_kernel != cl_kernel(0)); // built with b_want_reduce_spine_adjust?
		_ASSERTE(n_reduction_tile_size <= INT_MAX);
		_ASSERTE(n_tile_num <= INT_MAX);
		// both are passed to the kernel as int

		_ASSERTE(n_tile_num > 1);
		// needs one less, if n_tile_num <= 1 there would be a zero size launch

		const size_t n_group_count = (n_tile_num - 1 + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_group_count * n_group_size;

		_ASSERTE(b_packed_reduction); // other mode not supported (historical)
		return clCall1D7Ex(h_cmd_queue, m_h_tile_segreduce_spine_kernel,
			n_total_size, n_group_size, dp_reductions, dp_tile_tail_counts_scan,
			dp_reduction_head_flags, int(n_reduction_tile_size), //b_packed_reduction,
			dp_tile_carry, dp_tile_head_flags, int(n_tile_num));
	}

	/*CCLKernelCall Enqueue_TileSegReduce_SpineAdjust_Downsweep(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_tile_tail_counts_scan,
		const cl_mem dp_reduction_head_flags, size_t n_reduction_tile_size, int b_packed_reduction,
		const cl_mem dp_tile_carry, const cl_mem dp_scan_carry,
		const cl_mem dp_tile_head_flags, size_t n_tile_num) const
	{
		_ASSERTE(m_h_tile_segreduce_spine_ds_kernel != cl_kernel(0)); // built for reduce?
		_ASSERTE(n_reduction_tile_size <= INT_MAX);
		_ASSERTE(n_tile_num <= INT_MAX);

		_ASSERTE(n_tile_num > 1); // needs one less, if n_tile_num <= 1 there would be a zero size launch

		size_t n_local_work_size = m_n_workgroup_size;
		size_t n_global_work_size = ((n_tile_num - 1 + m_n_tile_size - 1) / m_n_tile_size) * n_local_work_size;
		//size_t n_tile_num = n_global_work_size / n_local_work_size; // debug

		_ASSERTE(b_packed_reduction); // other mode not supported (historical)
		return clCall1D7Ex(h_cmd_queue, m_h_tile_segreduce_spine_ds_kernel,
			n_global_work_size, n_local_work_size, dp_reductions, dp_tile_tail_counts_scan,
			dp_reduction_head_flags, int(n_reduction_tile_size), //b_packed_reduction,
			dp_tile_carry, dp_scan_carry, dp_tile_head_flags, int(n_tile_num));
	}*/

	/**
	 *	@brief sets up a 1D call of the "TileSegReduce_SpineAdjust_Downsweep_Packed" kernel
	 *
	 *	The global work size is calculated for n_tile_num - 1 items, with one workgroup
	 *	of n_WorkGroup_Size() work-items per each (started) n_Tile_Size() of them.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in] dp_reductions is reductions buffer (1st kernel argument)
	 *	@param[in] dp_tile_tail_counts_scan is tile tail counts scan buffer (2nd kernel argument)
	 *	@param[in] dp_reduction_head_flags is reduction head flags buffer (3rd kernel argument)
	 *	@param[in] n_reduction_tile_size is reduction tile size (4th kernel argument,
	 *		must not exceed INT_MAX)
	 *	@param[in] b_packed_reduction is packed reduction flag (must be nonzero, the other
	 *		mode is not supported; it is not passed to the kernel)
	 *	@param[in] dp_tile_carry is tile carry buffer (5th kernel argument)
	 *	@param[in] dp_scan_carry is scan carry buffer (6th kernel argument)
	 *	@param[in] dp_packed_tile_head_flags is packed tile head flags buffer (7th kernel argument)
	 *	@param[in] n_tile_num is number of tiles (must be greater than one
	 *		and must not exceed INT_MAX)
	 *
	 *	@return Returns the kernel call object, as returned by clCall1D7Ex().
	 */
	CCLKernelCall Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(cl_command_queue h_cmd_queue,
		cl_mem dp_reductions, const cl_mem dp_tile_tail_counts_scan,
		const cl_mem dp_reduction_head_flags, size_t n_reduction_tile_size, int b_packed_reduction,
		const cl_mem dp_tile_carry, const cl_mem dp_scan_carry,
		const cl_mem dp_packed_tile_head_flags, size_t n_tile_num) const
	{
		_ASSERTE(m_h_tile_segreduce_spine_ds_pphf_kernel != cl_kernel(0)); // built with b_want_reduce_spine_adjust?
		_ASSERTE(n_reduction_tile_size <= INT_MAX);
		_ASSERTE(n_tile_num <= INT_MAX);
		// both are passed to the kernel as int

		_ASSERTE(n_tile_num > 1);
		// needs one less, if n_tile_num <= 1 there would be a zero size launch

		const size_t n_group_count = (n_tile_num - 1 + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_group_size = m_n_workgroup_size;
		size_t n_total_size = n_group_count * n_group_size;

		_ASSERTE(b_packed_reduction); // other mode not supported (historical)
		// NOTE(review): eight kernel arguments are passed below while the call variant
		// is named clCall1D7Ex; elsewhere in this file the number in the name matches
		// the argument count - confirm against ClUtils.h whether this is intended
		return clCall1D7Ex(h_cmd_queue, m_h_tile_segreduce_spine_ds_pphf_kernel,
			n_total_size, n_group_size, dp_reductions, dp_tile_tail_counts_scan,
			dp_reduction_head_flags, int(n_reduction_tile_size), //b_packed_reduction,
			dp_tile_carry, dp_scan_carry, dp_packed_tile_head_flags, int(n_tile_num));
	}

	/**
	 *	@brief calculates the size of a packed head flags buffer
	 *
	 *	The number of flags is first aligned up to a multiple of the tile size,
	 *	each flag then takes a single bit of a 32-bit integer.
	 *
	 *	@param[in] n_flag_num is number of head flags (matches the number of elements
	 *		in a segmented scan or reduce)
	 *
	 *	@return Returns the size of packed head flags, in bytes.
	 *
	 *	@note The head flags are assumed to be stored in 32-bit integers, with the least significant
	 *		bit corresponding to the flag with lower index than the most significant bit. In little
	 *		endian, this transparently extends to arbitrary size integers.
	 *	@note In debug, the result is checked against the per-tile formula (number of tiles
	 *		times the number of 32-bit integers per tile); presumably the two only agree
	 *		if the tile size is a multiple of 32 - TODO confirm.
	 */
	size_t n_PackedHeadFlags_Size(size_t n_flag_num) const
	{
		const size_t n_size_bytes = n_Align_Up(n_flag_num, m_n_tile_size) / 32 * sizeof(uint32_t);
		// pad to whole tiles, 32 flags per uint32_t

		_ASSERTE(n_size_bytes == ((n_flag_num + m_n_tile_size - 1) / m_n_tile_size) *
			(m_n_tile_size / 32) * sizeof(uint32_t));
		// the same thing calculated tile by tile, should give the same result

		return n_size_bytes;
	}

protected:
	/**
	 *	@brief enqueues a segmented scan which reads bit-packed head flags (no autotuning)
	 *
	 *	If the input fits in a single tile (n_elem_num <= m_n_tile_size), a single
	 *	kernel is enqueued. Otherwise, three steps are enqueued: the per-tile carry
	 *	and per-tile flags are calculated, the tile carry is segment-scanned in place
	 *	using n_Enqueue_SegmentedScan_NoAutoTune() (the per-tile flags are not packed)
	 *	and finally the downsweep kernel is run on the data with the scanned carry.
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in
	 *	@param[in] dp_segscan is the destination buffer passed to the scan / downsweep kernels
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_packed_head_flags is the buffer with bit-packed head flags
	 *	@param[in] n_elem_num is number of elements to scan
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success or the error code of the first call that failed.
	 *
	 *	@note The temporary per-tile buffers are sized using sizeof(uint32_t) rather
	 *		than the configured data type size (assumes 4-byte elements - TODO confirm).
	 */
	CLresult n_Enqueue_SegmentedScan_Packed_NoAutoTune(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc)
	{
		if(n_elem_num > m_n_tile_size) {
			size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
			CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
			CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
			// the head flags already come bit-packed, no temporary buffer for packing them is needed here

			CLresult n_result = Enqueue_TileSegScan_Carry_Packed(h_cmd_queue, dp_tile_carry,
				dp_tile_flags, dp_data, dp_packed_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// calculate the tile carry and the tile flags

			n_result = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
				dp_tile_carry, dp_tile_carry, dp_tile_flags, n_tile_num, r_memory_alloc);
			if(n_result != cl_Success)
				return n_result;
			// scan the segment carry (in place)

			n_result = Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, dp_segscan,
				dp_data, dp_tile_carry, dp_packed_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// segmented scan downsweep
		} else {
			CLresult n_result = Enqueue_TileSegScan_Packed(h_cmd_queue, dp_segscan,
				dp_data, dp_packed_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// everything fits in a single tile
		}
		return cl_Success;
	}

	/**
	 *	@brief enqueues a segmented scan which reads natural-order bit-packed head flags (no autotuning)
	 *
	 *	An input of up to m_n_tile_size elements is handled by a single kernel. A larger
	 *	input takes three steps: calculation of per-tile carry and per-tile flags, an in-place
	 *	segmented scan of the tile carry (by n_Enqueue_SegmentedScan_NoAutoTune(), using
	 *	the per-tile flags) and the downsweep pass which uses the scanned carry.
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in
	 *	@param[in] dp_segscan is the destination buffer passed to the scan / downsweep kernels
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_packed_head_flags is the buffer with bit-packed head flags
	 *	@param[in] n_elem_num is number of elements to scan
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success or the error code of the first call that failed.
	 */
	CLresult n_Enqueue_SegmentedScan_PackedNaturalOrder_NoAutoTune(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc)
	{
		if(n_elem_num <= m_n_tile_size) {
			CLresult n_status = Enqueue_TileSegScan_PackedNaturalOrder(h_cmd_queue, dp_segscan,
				dp_data, dp_packed_head_flags, n_elem_num);
			return n_status;
		}
		// a single tile, no carry propagation is needed

		const size_t n_spine_size = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size; // number of tiles
		CCLTempBufferReservation dp_tile_flags(n_spine_size * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_spine_size * sizeof(uint32_t), r_memory_alloc);
		// one 32-bit value per tile (not the configured data type size)

		CLresult n_status = Enqueue_TileSegScan_Carry_PackedNaturalOrder(h_cmd_queue, dp_tile_carry,
			dp_tile_flags, dp_data, dp_packed_head_flags, n_elem_num);
		if(n_status != cl_Success)
			return n_status;
		// calculate the tile carry and the tile flags

		n_status = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
			dp_tile_carry, dp_tile_carry, dp_tile_flags, n_spine_size, r_memory_alloc);
		if(n_status != cl_Success)
			return n_status;
		// scan the segment carry (in place)

		n_status = Enqueue_TileSegScan_Downsweep_PackedNaturalOrder(h_cmd_queue, dp_segscan,
			dp_data, dp_tile_carry, dp_packed_head_flags, n_elem_num);
		return n_status;
		// segmented scan downsweep
	}

	/**
	 *	@brief enqueues a segmented scan which reads unpacked head flags (no autotuning)
	 *
	 *	If the input fits in a single tile (n_elem_num <= m_n_tile_size), a single
	 *	kernel is enqueued. Otherwise, three steps are enqueued: the per-tile carry
	 *	and per-tile flags are calculated (the same kernel also writes the bit-packed
	 *	head flags to a temporary buffer), the tile carry is segment-scanned in place
	 *	by a recursive call to this function and finally the downsweep kernel
	 *	is run, reading the packed flags.
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in
	 *	@param[in] dp_segscan is the destination buffer passed to the scan / downsweep kernels
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_head_flags is the buffer with (unpacked) head flags
	 *	@param[in] n_elem_num is number of elements to scan
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success or the error code of the first call that failed.
	 *
	 *	@note The temporary per-tile buffers are sized using sizeof(uint32_t) rather
	 *		than the configured data type size (assumes 4-byte elements - TODO confirm).
	 */
	CLresult n_Enqueue_SegmentedScan_NoAutoTune(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc)
	{
		if(n_elem_num > m_n_tile_size) {
			size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
			CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
			CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);

			CCLTempBufferReservation dp_pack_flags(n_tile_num * m_n_tile_size / 32 * sizeof(uint32_t), r_memory_alloc);
			// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

			CLresult n_result = Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry,
				dp_tile_flags, dp_pack_flags, dp_data, dp_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// calculate the tile carry and the tile flags, pack the head flags on the way

			// note that an earlier (disabled) version used the segmented reduce kernels to get
			// the tile partials; that required allocating and writing reductions which are not
			// needed here, and a CPU - GPU synchronization to read their number. only the tile
			// partials are needed and their number is predictable, so no synchronization is required.

			n_result = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
				dp_tile_carry, dp_tile_carry, dp_tile_flags, n_tile_num, r_memory_alloc);
			if(n_result != cl_Success)
				return n_result;
			// scan the segment carry (in place; recurses until the carry fits in a single tile)

			n_result = Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, dp_segscan,
				dp_data, dp_tile_carry, dp_pack_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// segmented scan downsweep (uses the packed flags written by the first kernel)
		} else {
			CLresult n_result = Enqueue_TileSegScan(h_cmd_queue, dp_segscan,
				dp_data, dp_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// everything fits in a single tile; tile carry and tile flags are not needed
		}
		return cl_Success;
	}

	// scenario A, user knows reduce size (or its upper bound) and has allocated a dest buffer herself
	//		dp_segreduce = allocated buffer, r_n_reduction_num = n, n_max_reduction_num = n		(size not checked, buffer reused)
	// scenario B, user knows reduce size and wants the function to allocate the dest buffer
	//		dp_segreduce = null handle, r_n_reduction_num = n, n_max_reduction_num = 0			(the buffer will be allocated)
	// scenario C, user does not know reduce size but has allocated a buffer herself
	//		dp_segreduce = allocated buffer, r_n_reduction_num = 0, n_max_reduction_num = n		(the buffer will be reallocated if n is not enough)
	// scenario D, user does not know reduce size and wants the function to allocate the dest buffer
	//		dp_segreduce = null handle, r_n_reduction_num = 0, n_max_reduction_num = 0			(the buffer will be allocated)

	/**
	 *	@brief enqueues a segmented reduction which reads unpacked head flags (no autotuning)
	 *
	 *	The bootstrap kernel calculates per-tile tail counts and per-tile flags and
	 *	writes bit-packed head flags to a temporary buffer. With more than one tile,
	 *	the tail counts are exclusive-scanned by r_int_scan. If the number of reductions
	 *	is not known (or always in debug builds), the command queue is finished and the
	 *	total is read back from the device. The destination buffer is (re)allocated
	 *	if it cannot hold the reductions and the reduction kernels are then enqueued,
	 *	followed by a spine adjustment pass in case there is more than one tile.
	 *
	 *	@tparam CCLScan is type of the integer scan object (must provide Enqueue_ExScan())
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in
	 *	@param[in,out] dp_segreduce is the destination buffer or a null handle; if the buffer
	 *		cannot hold the reductions, it is released and a new one is created (of size
	 *		number of reductions times sizeof(uint32_t)); set to null if the allocation fails
	 *	@param[in,out] r_n_reduction_num is number of reductions (or its upper bound) or 0 if
	 *		not known; overwritten by the real number if it was read back from the device
	 *	@param[in] n_max_reduction_num is the number of reductions that dp_segreduce
	 *		can hold, or 0 if dp_segreduce is a null handle
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_head_flags is the buffer with (unpacked) head flags
	 *	@param[in] n_elem_num is number of elements to reduce
	 *	@param[in] r_int_scan is the scan object used to exclusive-scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, the error code of the first call that failed,
	 *		or (CLresult)-123456 if the number of reductions read back from the device is zero.
	 *
	 *	@note The temporary buffers and the destination buffer are sized using sizeof(uint32_t)
	 *		rather than the configured data type size (assumes 4-byte elements - TODO confirm).
	 */
	template <class CCLScan> // "forward" decl
	CLresult n_Enqueue_SegmentedReduce_NoAutoTune(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc)
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce); // either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num); // if you guess, make a sane guess (if this fails then the dp_segreduce buffer has fewer elements than is guessed to be the number of reductions)
		_ASSERTE(r_n_reduction_num <= n_elem_num); // cannot possibly be more than that

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);

		bool b_need_reduction_total = !r_n_reduction_num;
#ifdef _DEBUG
		b_need_reduction_total = true; // always read the total in debug, to verify the caller's number
#endif // _DEBUG

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num +
			((b_need_reduction_total)? 1 : 0)) * sizeof(uint32_t), r_memory_alloc); // +1!
		// one extra element so that the exclusive scan also yields the total

		CCLTempBufferReservation dp_pack_flags((n_tile_num * m_n_tile_size / 32 + 1) * sizeof(uint32_t), r_memory_alloc);
		// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

		CLresult n_result = Enqueue_TileSegReduce_Bootstrap_Pack(h_cmd_queue, dp_tile_tcounts,
			dp_tile_flags, dp_pack_flags, dp_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + ((b_need_reduction_total)? 1 : 0)); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_need_reduction_total) { // the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper q(h_cmd_queue);
			if((n_result = q.n_Finish()) != cl_Success) // ouch :(
				return n_result;
			size_t n_sum_index = (n_tile_num > 1)? n_tile_num : n_tile_num - 1; // if we just exscanned it then +1!
			if((n_result = q.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
			   n_sum_index * sizeof(uint32_t), sizeof(uint32_t))) != cl_Success) // +1!
				return n_result;
			if(n_reds_num < 1)
				return (CLresult)-123456; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num >= n_reds_num);
			// make sure that the caller indeed knows at least the correct upper bound
			// (an upper bound is permitted by scenario A; the packed variants check the same)
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // write out the real number
		}
		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// (re)allocate the destination buffer if it is missing or too small

		if(n_tile_num > 1) {
			n_result = Enqueue_TileSegReduce_Packed(h_cmd_queue, dp_segreduce,
				dp_tile_carry, dp_tile_tcounts, dp_data, /*dp_head_flags*/dp_pack_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// apply the segmented reduction to tile partials

			if(n_tile_num > m_n_tile_size) {
				size_t n_tile_num2 = (n_tile_num + m_n_tile_size - 1) / m_n_tile_size;
				CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
				CCLTempBufferReservation dp_tile_carry2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);

				CCLTempBufferReservation dp_packed_tile_head_flags(n_tile_num2 * m_n_tile_size / 32 * sizeof(uint32_t), r_memory_alloc);
				// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

				n_result = Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry2,
					dp_tile_flags2, dp_packed_tile_head_flags, dp_tile_carry, dp_tile_flags, n_tile_num);
				if(n_result != cl_Success)
					return n_result;
				// calculate the carry of the tile carry, pack the tile flags on the way

				n_result = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
					dp_tile_carry2, dp_tile_carry2, dp_tile_flags2, n_tile_num2, r_memory_alloc);
				if(n_result != cl_Success)
					return n_result;
				// scan the segment carry

				// note that at this point, dp_tile_carry2 contains global segmented scan of tile carry (dp_tile_carry)
				// there are no tile size constraints tying segmented scan to the segmented reduce spine adjust below.

				// note that the tile size argument below is likely a nonsense one,
				// this is not tied to the global scan tile size and can in fact
				// either run at the same granularity as the rest of the segmented reduce kernels
				// or at different one but then it needs more work

				n_result = Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue, dp_segreduce,
					dp_tile_tcounts, dp_pack_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_carry2,
					dp_packed_tile_head_flags, n_tile_num);
				if(n_result != cl_Success)
					return n_result;
				// segmented reduction downsweep
			} else {
				return Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
					dp_pack_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_flags, n_tile_num);
				// the tile carry fits in a single tile, a single spine adjust kernel is sufficient
			}
		} else {
			n_result = Enqueue_TileSegReduceSingle_Packed(h_cmd_queue, dp_segreduce,
				/*dp_tile_carry, dp_tile_tcounts,*/ dp_data, /*dp_head_flags*/dp_pack_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// a single tile, reduce it directly (no tile carry, no spine adjustment)
		}

		return cl_Success;
	}

	/**
	 *	@brief enqueues a segmented reduction which reads bit-packed head flags (no autotuning)
	 *
	 *	The bootstrap kernel calculates per-tile tail counts and per-tile flags. With more
	 *	than one tile, the tail counts are exclusive-scanned by r_int_scan. If the number
	 *	of reductions is not known (or always in debug builds), the command queue is finished
	 *	and the total is read back from the device. The destination buffer is (re)allocated
	 *	if it cannot hold the reductions. Finally, the reduction kernels are enqueued,
	 *	followed by a spine adjustment pass in case there is more than one tile.
	 *
	 *	@tparam CCLScan is type of the integer scan object (must provide Enqueue_ExScan())
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in
	 *	@param[in,out] dp_segreduce is the destination buffer or a null handle; if the buffer
	 *		cannot hold the reductions, it is released and a new one is created (of size
	 *		number of reductions times sizeof(uint32_t)); set to null if the allocation fails
	 *	@param[in,out] r_n_reduction_num is number of reductions (or its upper bound) or 0 if
	 *		not known; overwritten by the real number if it was read back from the device
	 *	@param[in] n_max_reduction_num is the number of reductions that dp_segreduce
	 *		can hold, or 0 if dp_segreduce is a null handle
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_packed_head_flags is the buffer with bit-packed head flags
	 *	@param[in] n_elem_num is number of elements to reduce
	 *	@param[in] r_int_scan is the scan object used to exclusive-scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, the error code of the first call that failed,
	 *		or (CLresult)-123456 if the number of reductions read back from the device is zero.
	 */
	template <class CCLScan> // "forward" decl
	CLresult n_Enqueue_SegmentedReduce_Packed_NoAutoTune(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, /*const cl_mem dp_head_flags,*/
		const cl_mem dp_packed_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc)
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce);
		// the buffer and its capacity must be either both given or both null
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num);
		// the given buffer must be able to hold the number of reductions the caller expects
		_ASSERTE(r_n_reduction_num <= n_elem_num);
		// there can't be more reductions than there are elements

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		// one 32-bit value per tile (not the configured data type size)

#ifdef _DEBUG
		const bool b_read_total = true; // always read the total in debug, to verify the caller's number
#else // _DEBUG
		const bool b_read_total = !r_n_reduction_num;
#endif // _DEBUG
		const size_t n_total_slot_num = (b_read_total)? 1 : 0;
		// one extra element so that the exclusive scan also yields the total

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num + n_total_slot_num) *
			sizeof(uint32_t), r_memory_alloc);

		CLresult n_status = Enqueue_TileSegReduce_Bootstrap_Packed(h_cmd_queue, dp_tile_tcounts,
			dp_tile_flags, dp_packed_head_flags, n_elem_num);
		if(n_status != cl_Success)
			return n_status;
		// bootstrap: count the tails in each tile (the head flags are already packed)

		const bool b_multi_tile = n_tile_num > 1;
		if(b_multi_tile) {
			n_status = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + n_total_slot_num);
			if(n_status != cl_Success)
				return n_status;
			// todo - this must be an integer scan (breaks if the scan data type changes, e.g. to float)
			// todo - use a reentrant version which takes the temp buffer allocator
		}
		// turn the per-tile tail counts to offsets

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_read_total) {
			CCLCommandQueueWrapper queue(h_cmd_queue);
			n_status = queue.n_Finish(); // have to synchronize with the device here
			if(n_status != cl_Success)
				return n_status;

			const size_t n_total_index = (b_multi_tile)? n_tile_num : n_tile_num - 1;
			// after the exclusive scan, the total is in the extra element;
			// with a single tile there was no scan and the count is the total

			n_status = queue.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
				n_total_index * sizeof(uint32_t), sizeof(uint32_t));
			if(n_status != cl_Success)
				return n_status;

			if(!n_reds_num)
				return (CLresult)-123456; // there is always at least one; the tail counting likely failed

#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num >= n_reds_num);
			// the caller's number must be at least a correct upper bound
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // report the real number
		}
		// read the number of reductions back, unless the caller supplied it

		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			n_status = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * sizeof(uint32_t));
			if(n_status != cl_Success) {
				dp_segreduce = cl_mem(0); // make sure no stale handle is left
				return n_status;
			}
		}
		// (re)allocate the destination buffer if it is missing or too small

		if(!b_multi_tile) {
			n_status = Enqueue_TileSegReduceSingle_Packed(h_cmd_queue, dp_segreduce,
				dp_data, dp_packed_head_flags, n_elem_num);
			return n_status;
		}
		// a single tile is reduced directly (no tile carry, no spine adjustment)

		n_status = Enqueue_TileSegReduce_Packed(h_cmd_queue, dp_segreduce,
			dp_tile_carry, dp_tile_tcounts, dp_data, dp_packed_head_flags, n_elem_num);
		if(n_status != cl_Success)
			return n_status;
		// reduce inside the tiles, write the tile partials to dp_tile_carry

		if(n_tile_num <= m_n_tile_size) {
			return Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_flags, n_tile_num);
		}
		// the tile carry fits in a single tile, a single spine adjust kernel is sufficient

		size_t n_spine_tile_num = (n_tile_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_spine_flags(n_spine_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_carry_scan(n_spine_tile_num * sizeof(uint32_t), r_memory_alloc);

		CCLTempBufferReservation dp_packed_tile_head_flags(n_spine_tile_num *
			m_n_tile_size / 32 * sizeof(uint32_t), r_memory_alloc);
		// the tile flags get bit-packed to save memory traffic in the downsweep pass

		n_status = Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_carry_scan,
			dp_spine_flags, dp_packed_tile_head_flags, dp_tile_carry, dp_tile_flags, n_tile_num);
		if(n_status != cl_Success)
			return n_status;
		// calculate the carry of the tile carry, pack the tile flags on the way

		n_status = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
			dp_carry_scan, dp_carry_scan, dp_spine_flags, n_spine_tile_num, r_memory_alloc);
		if(n_status != cl_Success)
			return n_status;
		// scan the segment carry; dp_carry_scan now holds the global segmented scan
		// of the tile carry (dp_tile_carry). no tile size constraints tie this
		// segmented scan to the spine adjust kernel below.

		// the reduction tile size argument below is likely a nonsense one: it is not tied
		// to the global scan tile size and the kernel can run either at the same granularity
		// as the rest of the segmented reduce kernels, or at a different one (needs more work)

		n_status = Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue, dp_segreduce,
			dp_tile_tcounts, dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_carry_scan,
			dp_packed_tile_head_flags, n_tile_num);
		return n_status;
		// segmented reduction downsweep with the spine adjustment
	}

	/**
	 *	@brief enqueues a segmented reduction which reads bit-packed head flags
	 *		and measures the time of its individual stages (no autotuning)
	 *
	 *	Performs the same steps as n_Enqueue_SegmentedReduce_Packed_NoAutoTune(), and
	 *	additionally collects events of the kernels and of markers enqueued around the
	 *	multi-kernel stages. At the end, it waits for the events and reads their profiling
	 *	counters. All the time outputs are set to zero at the beginning; if an error
	 *	occurs, the times which were not measured yet remain zero.
	 *
	 *	@tparam CCLScan is type of the integer scan object (must provide Enqueue_ExScan())
	 *
	 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels in (the profiling
	 *		counters are read from its events; presumably it needs to be created with
	 *		profiling enabled - verify against the caller)
	 *	@param[in,out] dp_segreduce is the destination buffer or a null handle; if the buffer
	 *		cannot hold the reductions, it is released and a new one is created (of size
	 *		number of reductions times sizeof(uint32_t)); set to null if the allocation fails
	 *	@param[in,out] r_n_reduction_num is number of reductions (or its upper bound) or 0 if
	 *		not known; overwritten by the real number if it was read back from the device
	 *	@param[in] n_max_reduction_num is the number of reductions that dp_segreduce
	 *		can hold, or 0 if dp_segreduce is a null handle
	 *	@param[in] dp_data is the buffer with the input data
	 *	@param[in] dp_packed_head_flags is the buffer with bit-packed head flags
	 *	@param[in] n_elem_num is number of elements to reduce
	 *	@param[in] r_int_scan is the scan object used to exclusive-scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is the temporary buffer allocator
	 *	@param[out] r_f_time_bootstrap is the time of the bootstrap kernel
	 *	@param[out] r_f_time_exscan is the time of the tail counts scan (zero for a single tile)
	 *	@param[out] r_f_time_segreduce is the time of the segmented reduce kernel
	 *	@param[out] r_f_time_segscan is the time of the tile carry segmented scan
	 *		(zero unless there are more than m_n_tile_size tiles)
	 *	@param[out] r_f_time_spine is the time of the spine adjust kernel (zero for a single tile)
	 *	@param[out] r_f_time_total is the time between the first and the last marker
	 *
	 *	@return Returns cl_Success on success, the error code of the first call that failed
	 *		(including the queries of the profiling counters), or (CLresult)-123456 if the
	 *		number of reductions read back from the device is zero.
	 *
	 *	@note The results of enqueueing the markers and of waiting for the events are not checked.
	 */
	template <class CCLScan> // "forward" decl
	CLresult n_Enqueue_SegmentedReduce_Packed_NoAutoTune_Profile(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, /*const cl_mem dp_head_flags,*/
		const cl_mem dp_packed_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc, double &r_f_time_bootstrap, double &r_f_time_exscan,
		double &r_f_time_segreduce, double &r_f_time_segscan, double &r_f_time_spine, double &r_f_time_total)
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce); // either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num); // if you guess, make a sane guess (if this fails then the dp_segreduce buffer has fewer elements than is guessed to be the number of reductions)
		_ASSERTE(r_n_reduction_num <= n_elem_num); // cannot possibly be more than that

		r_f_time_bootstrap = 0;
		r_f_time_segreduce = 0;
		r_f_time_total = 0; // just in case of error

		r_f_time_exscan = 0;
		r_f_time_segscan = 0;
		r_f_time_spine = 0; // not always taken

		CCLUniqueEvent ev_bootstrap, ev_segreduce, ev_spine, ev_exscan_start,
			ev_exscan_end, ev_segscan_start, ev_segscan_end, ev_start, ev_end;
		// profiling info

		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);

		cmd_queue.n_Enqueue_Marker(ev_start);

		size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);

		bool b_need_reduction_total = !r_n_reduction_num;
#ifdef _DEBUG
		b_need_reduction_total = true; // always read the total in debug, to verify the caller's number
#endif // _DEBUG

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num +
			((b_need_reduction_total)? 1 : 0)) * sizeof(uint32_t), r_memory_alloc); // +1!
		// one extra element so that the exclusive scan also yields the total

		CLresult n_result = Enqueue_TileSegReduce_Bootstrap_Packed/**/(h_cmd_queue, dp_tile_tcounts,
			dp_tile_flags, /*dp_head_flags*/dp_packed_head_flags, n_elem_num).GetEvent(ev_bootstrap);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		cmd_queue.n_Enqueue_Marker(ev_exscan_start);

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + ((b_need_reduction_total)? 1 : 0)); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		cmd_queue.n_Enqueue_Marker(ev_exscan_end);

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_need_reduction_total) { // the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper q(h_cmd_queue);
			if((n_result = q.n_Finish()) != cl_Success) // ouch :(
				return n_result;
			size_t n_sum_index = (n_tile_num > 1)? n_tile_num : n_tile_num - 1; // if we just exscanned it then +1!
			if((n_result = q.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
			   n_sum_index * sizeof(uint32_t), sizeof(uint32_t))) != cl_Success) // +1!
				return n_result;
			if(n_reds_num < 1)
				return (CLresult)-123456; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num >= n_reds_num); // make sure that the caller indeed knows at least the correct upper bound
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // write out the real number
		}
		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// (re)allocate the destination buffer if it is missing or too small

		if(n_tile_num > 1) {
			n_result = Enqueue_TileSegReduce_Packed(h_cmd_queue, dp_segreduce,
				dp_tile_carry, dp_tile_tcounts, dp_data, dp_packed_head_flags, n_elem_num).GetEvent(ev_segreduce);
			if(n_result != cl_Success)
				return n_result;
			// apply the segmented reduction to tile partials

			if(n_tile_num > m_n_tile_size) {
				cmd_queue.n_Enqueue_Marker(ev_segscan_start);

				size_t n_tile_num2 = (n_tile_num + m_n_tile_size - 1) / m_n_tile_size;
				CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
				CCLTempBufferReservation dp_tile_carry_scan(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);

				CCLTempBufferReservation dp_packed_tile_head_flags(n_tile_num2 * m_n_tile_size / 32 * sizeof(uint32_t), r_memory_alloc);
				// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

				n_result = Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry_scan,
					dp_tile_flags2, dp_packed_tile_head_flags, dp_tile_carry, dp_tile_flags, n_tile_num);
				if(n_result != cl_Success)
					return n_result;
				// calculate the carry of the tile carry, pack the tile flags on the way

				n_result = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
					dp_tile_carry_scan, dp_tile_carry_scan, dp_tile_flags2, n_tile_num2, r_memory_alloc);
				if(n_result != cl_Success)
					return n_result;
				// scan the segment carry

				cmd_queue.n_Enqueue_Marker(ev_segscan_end);

				// note that at this point, dp_tile_carry_scan contains global segmented scan of tile carry (dp_tile_carry)
				// there are no tile size constraints tying segmented scan to the segmented reduce spine adjust below.

				// note that the tile size argument below is likely a nonsense one,
				// this is not tied to the global scan tile size and can in fact
				// either run at the same granularity as the rest of the segmented reduce kernels
				// or at different one but then it needs more work

				n_result = Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue, dp_segreduce,
					dp_tile_tcounts, dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_carry_scan,
					dp_packed_tile_head_flags, n_tile_num).GetEvent(ev_spine);
				if(n_result != cl_Success)
					return n_result;
				// segmented reduction downsweep

				cmd_queue.n_Enqueue_Marker(ev_end);

				ev_segscan_start.n_Wait();
				ev_segscan_end.n_Wait();
				uint64_t n_ss_start = 0, n_ss_end = 0;
				if((n_result = ev_segscan_start.n_GetProfilingCounter(n_ss_start,
				   CL_PROFILING_COMMAND_END)) != cl_Success ||
				   (n_result = ev_segscan_end.n_GetProfilingCounter(n_ss_end,
				   CL_PROFILING_COMMAND_START)) != cl_Success)
					return n_result;
				// do not calculate the time from counters which could not be read
				r_f_time_segscan = CCLUniqueEvent::f_ProfilingCounter_Difference(n_ss_start, n_ss_end);
				// the time between the end of the first marker and the start of the second one
			} else {
				n_result = Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
					dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_flags, n_tile_num).GetEvent(ev_spine);
				if(n_result != cl_Success)
					return n_result;
				// the tile carry fits in a single tile, a single spine adjust kernel is sufficient

				cmd_queue.n_Enqueue_Marker(ev_end);
			}

			ev_spine.n_Wait();
			ev_spine.n_GetProfilingCounter_Difference(r_f_time_spine);
		} else {
			n_result = Enqueue_TileSegReduceSingle_Packed(h_cmd_queue, dp_segreduce,
				/*dp_tile_carry, dp_tile_tcounts,*/ dp_data, dp_packed_head_flags, n_elem_num).GetEvent(ev_segreduce);
			if(n_result != cl_Success)
				return n_result;
			// a single tile, reduce it directly (no tile carry, no spine adjustment)

			cmd_queue.n_Enqueue_Marker(ev_end);
		}

		ev_bootstrap.n_Wait();
		ev_bootstrap.n_GetProfilingCounter_Difference(r_f_time_bootstrap);
		ev_segreduce.n_Wait();
		ev_segreduce.n_GetProfilingCounter_Difference(r_f_time_segreduce);

		ev_exscan_start.n_Wait();
		ev_exscan_end.n_Wait();
		uint64_t n_es_start = 0, n_es_end = 0;
		if((n_result = ev_exscan_start.n_GetProfilingCounter(n_es_start,
		   CL_PROFILING_COMMAND_END)) != cl_Success ||
		   (n_result = ev_exscan_end.n_GetProfilingCounter(n_es_end,
		   CL_PROFILING_COMMAND_START)) != cl_Success)
			return n_result;
		if(n_tile_num > 1) // otherwise no scan was enqueued between the markers
			r_f_time_exscan = CCLUniqueEvent::f_ProfilingCounter_Difference(n_es_start, n_es_end);

		ev_start.n_Wait();
		ev_end.n_Wait();
		uint64_t n_op_start = 0, n_op_end = 0;
		if((n_result = ev_start.n_GetProfilingCounter(n_op_start,
		   CL_PROFILING_COMMAND_END)) != cl_Success ||
		   (n_result = ev_end.n_GetProfilingCounter(n_op_end,
		   CL_PROFILING_COMMAND_START)) != cl_Success)
			return n_result;
		r_f_time_total = CCLUniqueEvent::f_ProfilingCounter_Difference(n_op_start, n_op_end);

		return cl_Success;
	}

	/**
	 *	@brief enqueues a segmented reduction which reads bit-packed head flags
	 *		in natural order (the variant which does not use autotune information)
	 *
	 *	The work is enqueued in stages: a bootstrap kernel counts segment tails per tile,
	 *	the counts are exclusive-scanned using r_int_scan (only if there is more than one
	 *	tile), then the tile reduction runs, followed by a spine adjust pass. If there
	 *	are more tiles than m_n_tile_size, the tile carry is segment-scanned first and
	 *	a downsweep flavor of the spine adjust is used. A single tile is handled by
	 *	a dedicated kernel.
	 *
	 *	@tparam CCLScan is type of the scan object; it needs to have Enqueue_ExScan()
	 *
	 *	@param[in] h_cmd_queue is command queue to enqueue the work to
	 *	@param[in,out] dp_segreduce is the output buffer or 0 to have it allocated here;
	 *		if it holds fewer than the required number of reductions, it is released
	 *		and a new buffer is created in the context of r_memory_alloc
	 *	@param[in,out] r_n_reduction_num is the number of reductions or 0 if not known;
	 *		if the number is read back from the device, it is written here
	 *	@param[in] n_max_reduction_num is the capacity of dp_segreduce in reductions
	 *		(must be 0 if and only if dp_segreduce is 0)
	 *	@param[in] dp_data is the buffer with the data to be reduced
	 *	@param[in] dp_packed_head_flags is the buffer with the bit-packed head flags
	 *	@param[in] n_elem_num is the number of elements in dp_data
	 *	@param[in] r_int_scan is the scan object used to scan the per-tile tail counts
	 *	@param[in] r_memory_alloc is the temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success or the error code of the first call that failed
	 *		((CLresult)-123456 if the number of reductions read back from the device is zero).
	 *
	 *	@note If the number of reductions is not known (and always in debug builds), this
	 *		waits for the queue to finish in order to read the number back.
	 */
	template <class CCLScan> // "forward" decl
	CLresult n_Enqueue_SegmentedReduce_PackedNaturalOrder_NoAutoTune(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, /*const cl_mem dp_head_flags,*/
		const cl_mem dp_packed_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc)
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce);
		// the capacity and the buffer are either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num);
		// a guessed number of reductions must fit in the supplied buffer
		_ASSERTE(r_n_reduction_num <= n_elem_num);
		// there cannot be more reductions than there are elements

		const size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
		// per-tile temporaries

#ifdef _DEBUG
		const bool b_read_back_total = true; // debug always verifies the caller's number
#else // _DEBUG
		const bool b_read_back_total = !r_n_reduction_num;
#endif // _DEBUG

		const size_t n_tcount_num = n_tile_num + ((b_read_back_total)? 1 : 0); // +1!
		CCLTempBufferReservation dp_tile_tcounts(n_tcount_num * sizeof(uint32_t), r_memory_alloc);
		// the extra element receives the total after the exclusive scan

		CLresult n_result = Enqueue_TileSegReduce_Bootstrap_PackedNaturalOrder(h_cmd_queue,
			dp_tile_tcounts, dp_tile_flags, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue,
				dp_tile_tcounts, dp_tile_tcounts, n_tcount_num); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		uint32_t n_reduction_total = (uint32_t)r_n_reduction_num;
		if(b_read_back_total) {
			// the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper queue(h_cmd_queue);
			n_result = queue.n_Finish(); // ouch :(
			if(n_result != cl_Success)
				return n_result;

			size_t n_total_index;
			if(n_tile_num > 1)
				n_total_index = n_tile_num; // just exscanned, the total is in the +1 element
			else
				n_total_index = n_tile_num - 1; // no scan took place, the count of the only tile
			n_result = queue.n_Enqueue_Memcpy_DtoH(&n_reduction_total, dp_tile_tcounts,
				n_total_index * sizeof(uint32_t), sizeof(uint32_t));
			if(n_result != cl_Success)
				return n_result;

			if(n_reduction_total < 1)
				return (CLresult)-123456; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num >= n_reduction_total);
			// make sure that the caller indeed knows at least the correct upper bound
#endif // _DEBUG
			r_n_reduction_num = n_reduction_total; // write out the real number
		}
		// in case there is a lot of data, read the number of reductions, otherwise just use a large temp buffer

		if(n_max_reduction_num < n_reduction_total) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reduction_total * sizeof(uint32_t)/*m_config.n_DataType_Size()*/);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// make sure the output buffer can hold all the reductions

		if(n_tile_num <= 1) {
			return Enqueue_TileSegReduceSingle_PackedNaturalOrder(h_cmd_queue, dp_segreduce,
				/*dp_tile_carry, dp_tile_tcounts,*/ dp_data, dp_packed_head_flags, n_elem_num);
		}
		// a single tile is reduced directly, no tile partials to fix up

		n_result = Enqueue_TileSegReduce_PackedNaturalOrder(h_cmd_queue, dp_segreduce,
			dp_tile_carry, dp_tile_tcounts, dp_data, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// apply the segmented reduction to tile partials

		if(n_tile_num <= m_n_tile_size) {
			return Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry, dp_tile_flags, n_tile_num);
		}
		// the tile carry fits in one tile, the plain spine adjust is enough

		const size_t n_tile_num2 = (n_tile_num + m_n_tile_size - 1) / m_n_tile_size;
		CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry_scan(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, r_memory_alloc);
		CCLTempBufferReservation dp_packed_tile_head_flags(n_tile_num2 * m_n_tile_size / 32 * sizeof(uint32_t), r_memory_alloc);
		// second level temporaries; the tile head flags are bit-packed
		// to save a bunch of traffic in the downsweep pass

		n_result = Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry_scan,
			dp_tile_flags2, dp_packed_tile_head_flags, dp_tile_carry, dp_tile_flags, n_tile_num);
		if(n_result != cl_Success)
			return n_result;

		n_result = n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
			dp_tile_carry_scan, dp_tile_carry_scan, dp_tile_flags2, n_tile_num2, r_memory_alloc);
		if(n_result != cl_Success)
			return n_result;
		// scan the segment carry

		// note that at this point, dp_tile_carry_scan contains global segmented scan of tile
		// carry (dp_tile_carry); there are no tile size constraints tying segmented scan
		// to the segmented reduce spine adjust below.

		// note that the tile size argument below is likely a nonsense one,
		// this is not tied to the global scan tile size and can in fact
		// either run at the same granularity as the rest of the segmented reduce kernels
		// or at different one but then it needs more work

		return Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue, dp_segreduce,
			dp_tile_tcounts, dp_packed_head_flags, m_n_tile_size / 32, 1, dp_tile_carry,
			dp_tile_carry_scan, dp_packed_tile_head_flags, n_tile_num);
		// segmented reduction downsweep
	}

	// ---- ~segmented ops ----

public:
	/**
	 *	@brief compiles a segmented reduce / scan object with default
	 *		configuration and runs the benchmark on it
	 *
	 *	@tparam CCLIntScan is type of the scan object passed on to the benchmark
	 *
	 *	@param[in] h_cmd_queue is command queue to run the benchmark in
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] r_int_scan is the scan object passed on to the benchmark
	 *	@param[in] h_device is the device to compile the kernels for
	 *	@param[in] b_verbose is verbosity flag, passed to Compile()
	 *
	 *	@return Returns false if the compilation failed, otherwise returns
	 *		the result of the (non-static) Benchmark().
	 */
	template <class CCLIntScan>
	static bool Benchmark(cl_command_queue h_cmd_queue, cl_context h_context,
		CCLIntScan &r_int_scan, cl_device_id h_device, bool b_verbose = false)
	{
		CCLReductionConfig default_config;
		CCLTiled_SegmentedReduceScan_Impl benchmarked;
		// default-constructed configuration and implementation object

		if(!benchmarked.Compile(h_context, h_device, default_config, true, true, b_verbose))
			return false;
		// the benchmark is not attempted unless the kernels compiled

		return benchmarked.Benchmark(h_cmd_queue, h_context, r_int_scan);
	}

	/**
	 *	@brief benchmarks and verifies the segmented reduce and the segmented scan
	 *
	 *	For each of a fixed set of problem sizes, this generates shuffled data and random
	 *	head flags, repeatedly runs the unpacked, packed and (if m_b_strided_head_flags
	 *	is set) packed natural order variants, prints the timing and compares the device
	 *	results with a CPU reference. The segmented reduce tests only run if the tile
	 *	segmented reduce kernel is available, the segmented scan tests only run if the
	 *	tile segmented scan kernel is available.
	 *
	 *	@tparam CCLIntScan is type of the scan object used by the segmented reduce
	 *
	 *	@param[in] h_cmd_queue is command queue to run the benchmark in
	 *	@param[in] h_context is OpenCL context to allocate the buffers in
	 *	@param[in] r_int_scan is the scan object passed to the segmented reduce functions
	 *
	 *	@return Returns false if the kernels were not built, if a buffer allocation failed
	 *		or if enqueueing or finishing any of the operations failed.
	 *		Returns true otherwise.
	 *
	 *	@note This returns true even if the results did not match the reference or could
	 *		not be compared; that is only reported in stderr.
	 */
	template <class CCLIntScan>
	bool Benchmark(cl_command_queue h_cmd_queue, cl_context h_context, CCLIntScan &r_int_scan)
	{
		if(!m_b_built)
			return false;

		CCLContextWrapper context(h_context);
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		CCLTempBufferStack mem_alloc(context);
		// temp buffers, reuse between passes and benchmarks (otherwise prints a lot of verbose)

		bool b_results_correct = true;
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200,
			1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		//const size_t p_size[] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
		if(m_h_tile_segreduce_kernel != cl_kernel(0)) {
			for(size_t n_test = 0; n_test < sizeof(p_size) / sizeof(p_size[0]); ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				seg_debug::THeadFlag_DebugInfo hf;
				//
				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)(i + 1);
				std::random_shuffle(scan_data.begin(), scan_data.end());
				// generate some data

				const size_t n_avg_seg_size = 500;
				// given as an input

				size_t n_head_flags_size_bytes = n_PackedHeadFlags_Size(n);
				// size of the packed head flags

				const size_t n_seg_num = n / n_avg_seg_size + 1; // there must be at least one
				// number of segments (and so also of reductions)

				CCLUniqueMem dp_data, dp_reduce, dp_reduce_pphf, dp_reduce_pphfno,
					dp_head_flags, dp_packed_natural_hf, dp_packed_interleaved_hf;
				if(!(dp_data = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_reduce = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t))) ||
				   !(dp_reduce_pphf = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t))) ||
				   (m_b_strided_head_flags && // only if m_b_strided_head_flags is set
				   !(dp_reduce_pphfno = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t)))) ||
				   !(dp_head_flags = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_packed_natural_hf = context.h_CreateBuffer(n_head_flags_size_bytes)) ||
				   !(dp_packed_interleaved_hf = context.h_CreateBuffer(n_head_flags_size_bytes))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory

				_ASSERTE(!(n_head_flags_size_bytes % sizeof(uint32_t)));
				std::vector<uint32_t> packed_head_flags((m_b_strided_head_flags)?
					n_head_flags_size_bytes / sizeof(uint32_t) : 0, 0);
				// allocate this ahead of time too

				printf("running global segmented reduce test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				double f_time_pack = 0;
				double f_time_pphf = 0;
				double f_time_pphf_spine = 0;
				double f_time_pphf_bootstrap = 0;
				double f_time_pphf_exscan = 0;
				double f_time_pphf_segscan = 0;
				double f_time_pphf_segreduce = 0;
				double f_time_pphfno = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_data, 0, &scan_data[0], n * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce_pphf, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					if(m_b_strided_head_flags)
						cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce_pphfno, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_packed_interleaved_hf, 0, &scan_data[0], n_head_flags_size_bytes); // and this buffer as well

					seg_debug::CSegmentedOp_Random_Benchmark().n_Generate_HeadFlags(hf, n, n_seg_num - 1);
					// generate head flags

					if(m_b_strided_head_flags) {
						packed_head_flags.assign(n_head_flags_size_bytes / sizeof(uint32_t), 0);
						for(size_t i = 0; i < n; ++ i)
							packed_head_flags[i / 32] |= uint32_t(hf.head_flags[i] != 0) << (i & 31);
						// pack the head flags on the CPU (shift an unsigned value,
						// bit 31 must not be shifted into the sign bit of an int)
					}

					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_head_flags, 0, &hf.head_flags[0], n * sizeof(uint32_t));
					if(m_b_strided_head_flags) {
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_packed_natural_hf, 0,
							&packed_head_flags[0], n_head_flags_size_bytes); // !!
					}
					CLresult n_result0 = cmd_queue.n_Finish();
					if(n_result0) {
						fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
						return false;
					}
					// prepare data ...

					double f_pack_start_time = test_timer.f_Time();

					{
						CLresult n_result = Enqueue_Pack_HeadFlags(cmd_queue,
							dp_packed_interleaved_hf, dp_head_flags, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					f_time_pack += test_timer.f_Time() - f_pack_start_time;

					double f_pphf_start_time = test_timer.f_Time();

					double f_bootstrap_time = 0;
					double f_exscan_time = 0;
					double f_segreduce_time = 0;
					double f_segscan_time = 0;
					double f_spine_time = 0;
					double f_total_time = 0;

					{
						size_t n_seg_num1 = n_seg_num;
						CLresult n_result = n_Enqueue_SegmentedReduce_Packed_NoAutoTune_Profile(cmd_queue,
							*const_cast<cl_mem*>(&dp_reduce_pphf), n_seg_num1, n_seg_num1, dp_data, //dp_head_flags, // debug
							dp_packed_interleaved_hf, n, r_int_scan, mem_alloc, f_bootstrap_time,
							f_exscan_time, f_segreduce_time, f_segscan_time, f_spine_time, f_total_time);
						_ASSERTE(n_seg_num1 == n_seg_num);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					f_time_pphf += f_total_time;//test_timer.f_Time() - f_pphf_start_time;
					f_time_pphf_bootstrap += f_bootstrap_time;
					f_time_pphf_exscan += f_exscan_time;
					f_time_pphf_segreduce += f_segreduce_time;
					f_time_pphf_segscan += f_segscan_time;
					f_time_pphf_spine += f_spine_time;
					// the packed variant is timed using the profiling results rather than the host timer

					if(m_b_strided_head_flags) {
						double f_pphfno_start_time = test_timer.f_Time();

						{
							CCLUniqueEvent ev_start, ev_end;

							//cmd_queue.n_Enqueue_Marker(ev_start);

							size_t n_seg_num1 = n_seg_num;
							CLresult n_result = n_Enqueue_SegmentedReduce_PackedNaturalOrder_NoAutoTune(cmd_queue,
								*const_cast<cl_mem*>(&dp_reduce_pphfno), n_seg_num1, n_seg_num1, dp_data, //dp_head_flags, // debug
								dp_packed_natural_hf, n, r_int_scan, mem_alloc);
							_ASSERTE(n_seg_num1 == n_seg_num);
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}

							/*cmd_queue.n_Enqueue_Marker(ev_end);

							ev_start.n_Wait();
							ev_end.n_Wait();
							uint64_t n_start, n_end;
							n_result = ev_start.n_GetProfilingCounter(n_start, CL_PROFILING_COMMAND_START);
							n_result = ev_end.n_GetProfilingCounter(n_end, CL_PROFILING_COMMAND_END);
							f_time_pphfno += CCLUniqueEvent::f_ProfilingCounter_Difference(n_start, n_end);*/

							n_result = cmd_queue.n_Finish();
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
						}

						f_time_pphfno += test_timer.f_Time() - f_pphfno_start_time;
					}

					double f_start_time = test_timer.f_Time();

					{
						size_t n_seg_num1 = n_seg_num;
						CLresult n_result = n_Enqueue_SegmentedReduce_NoAutoTune(cmd_queue,
							*const_cast<cl_mem*>(&dp_reduce), n_seg_num1, n_seg_num1, dp_data, dp_head_flags,
							n, r_int_scan, mem_alloc);
						_ASSERTE(n_seg_num1 == n_seg_num);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
				}
				//-- n_pass_num; // the first pass did not count
				// run the thing

				double f_ratio_bootstrap = f_time_pphf_bootstrap / f_time_pphf;
				double f_ratio_exscan = f_time_pphf_exscan / f_time_pphf;
				double f_ratio_segreduce = f_time_pphf_segreduce / f_time_pphf;
				double f_ratio_segscan = f_time_pphf_segscan / f_time_pphf;
				double f_ratio_spine = f_time_pphf_spine / f_time_pphf;
				double f_sum_time = f_ratio_bootstrap + f_ratio_exscan + f_ratio_segreduce + f_ratio_segscan + f_ratio_spine;
				// fractions of the total packed time spent in the individual stages
				// (calculated before f_time_pphf is averaged, both are sums over the passes)

				f_time /= n_pass_num;
				f_time_pack /= n_pass_num;
				f_time_pphf /= n_pass_num;
				f_time_pphfno /= n_pass_num;
				size_t n_data = 1 * scan_data.size() * sizeof(uint32_t); // only read the data, written reductions much smaller
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				if(m_b_strided_head_flags) {
					printf("" PRIsizeB "B: %f ms -> %f*1e9 B/s (%.4f w/hf, %.4f ilv, %.4f nat, %.4f pck, %.1f%%bs %.1f%%es %.1f%%sr %.1f%%ss %.1f%%sp %.1f)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 2, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pphfno * 1e-9, n_data / f_time_pack * 1e-9,
						f_ratio_bootstrap * 100,
						f_ratio_exscan * 100, f_ratio_segreduce * 100, f_ratio_segscan * 100, f_ratio_spine * 100, f_sum_time); // only read head flags
				} else {
					printf("" PRIsizeB "B: %f ms -> %f*1e9 B/s (%.4f w/hf, %.4f nat, %.4f pck, %.1f%%bs %.1f%%es %.1f%%sr %.1f%%ss %.1f%%sp %.1f)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 2, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pack * 1e-9, f_ratio_bootstrap * 100,
						f_ratio_exscan * 100, f_ratio_segreduce * 100, f_ratio_segscan * 100, f_ratio_spine * 100, f_sum_time); // only read head flags
				}
				// print results

				std::vector<uint32_t> seg_reduction_cpu(n); // could work inplace but we still need a buffer for getting the GPU result(s)
				CTimer tcpu;
				seg_debug::CReference::Segmented_Reduce(seg_reduction_cpu, scan_data, hf.head_flags);
				printf("global segmented reduction takes %f msec on CPU\n", tcpu.f_Time() * 1000);
				// perform a global segmented reduction on CPU (the reference for the head flags of the last pass)

				bool b_test_correct;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, seg_reduction_cpu.begin(),
				   seg_reduction_cpu.end(), dp_reduce, 0, "global segmented reduction") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // not verified, do not report success below
					break;
				}
				bool b_test_correct_pphf;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphf, seg_reduction_cpu.begin(),
				   seg_reduction_cpu.end(), dp_reduce_pphf, 0, "global segmented reduction pphf") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // not verified, do not report success below
					break;
				}
				bool b_test_correct_pphfno = true;
				if(m_b_strided_head_flags) {
					if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphfno, seg_reduction_cpu.begin(),
					   seg_reduction_cpu.end(), dp_reduce_pphfno, 0, "global segmented reduction pphfno") != cl_Success) {
						fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
						b_results_correct = false; // not verified, do not report success below
						break;
					}
				}
				// reusable function, uses clnqueueMapBuffer()

				if(b_test_correct && b_test_correct_pphf && b_test_correct_pphfno)
					printf("done. global segmented reduction of %lu items succeeded\n", (unsigned long)n);
				else {
					b_results_correct = false;
					break;
				}
				// make sure it is reduced correctly
			}
		}
		if(//m_h_tile_segscan_bc_kernel != cl_kernel(0) && m_h_tile_segscan_ds_kernel != cl_kernel(0) &&
		   m_h_tile_segscan_kernel != cl_kernel(0)) {
			for(size_t n_test = 0; n_test < sizeof(p_size) / sizeof(p_size[0]); ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				seg_debug::THeadFlag_DebugInfo hf;
				//
				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)(i + 1);
				std::random_shuffle(scan_data.begin(), scan_data.end());
				// generate some data

				const size_t n_avg_seg_size = 500;
				// given as an input

				size_t n_head_flags_size_bytes = n_PackedHeadFlags_Size(n);
				// size of the packed head flags

				CCLUniqueMem dp_data, dp_scan, dp_scan_pphf, dp_scan_pphfno,
					dp_head_flags, dp_packed_natural_hf, dp_packed_interleaved_hf;
				if(!(dp_data = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_scan = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_scan_pphf = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   (m_b_strided_head_flags && // only if m_b_strided_head_flags is set
				   !(dp_scan_pphfno = context.h_CreateBuffer(n * sizeof(uint32_t)))) ||
				   !(dp_head_flags = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_packed_natural_hf = context.h_CreateBuffer(n_head_flags_size_bytes)) ||
				   !(dp_packed_interleaved_hf = context.h_CreateBuffer(n_head_flags_size_bytes))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory

				_ASSERTE(!(n_head_flags_size_bytes % sizeof(uint32_t)));
				std::vector<uint32_t> packed_head_flags((m_b_strided_head_flags)?
					n_head_flags_size_bytes / sizeof(uint32_t) : 0, 0);
				// allocate this ahead of time too

				printf("running global segmented scan test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				double f_time_pack = 0;
				double f_time_pphf = 0;
				double f_time_pphfno = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_data, 0, &scan_data[0], n * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan_pphf, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					if(m_b_strided_head_flags)
						cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan_pphfno, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_packed_interleaved_hf, 0, &scan_data[0], n_head_flags_size_bytes); // and this buffer as well

					size_t n_seg_num = n / n_avg_seg_size + 1; // there must be at least one
					seg_debug::CSegmentedOp_Random_Benchmark().n_Generate_HeadFlags(hf, n, n_seg_num - 1);
					// generate head flags

					if(m_b_strided_head_flags) {
						packed_head_flags.assign(n_head_flags_size_bytes / sizeof(uint32_t), 0);
						for(size_t i = 0; i < n; ++ i)
							packed_head_flags[i / 32] |= uint32_t(hf.head_flags[i] != 0) << (i & 31);
						// pack the head flags on the CPU (shift an unsigned value,
						// bit 31 must not be shifted into the sign bit of an int)
					}

					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_head_flags, 0, &hf.head_flags[0], n * sizeof(uint32_t));
					if(m_b_strided_head_flags) {
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_packed_natural_hf, 0,
							&packed_head_flags[0], n_head_flags_size_bytes); // !!
					}
					CLresult n_result0 = cmd_queue.n_Finish();
					if(n_result0) {
						fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
						return false;
					}
					// prepare data ...

					double f_pack_start_time = test_timer.f_Time();

					{
						CLresult n_result = Enqueue_Pack_HeadFlags(cmd_queue,
							dp_packed_interleaved_hf, dp_head_flags, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					f_time_pack += test_timer.f_Time() - f_pack_start_time;

					double f_pphf_start_time = test_timer.f_Time();

					{
						CLresult n_result = n_Enqueue_SegmentedScan_Packed_NoAutoTune(cmd_queue,
							dp_scan_pphf, dp_data, dp_packed_interleaved_hf, n, mem_alloc);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					f_time_pphf += test_timer.f_Time() - f_pphf_start_time;

					if(m_b_strided_head_flags) {
						double f_pphfno_start_time = test_timer.f_Time();

						{
							CLresult n_result = n_Enqueue_SegmentedScan_PackedNaturalOrder_NoAutoTune(cmd_queue,
								dp_scan_pphfno, dp_data, dp_packed_natural_hf, n, mem_alloc);
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
							n_result = cmd_queue.n_Finish();
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
						}

						f_time_pphfno += test_timer.f_Time() - f_pphfno_start_time;
					}

					double f_start_time = test_timer.f_Time();

					{
						CLresult n_result = n_Enqueue_SegmentedScan_NoAutoTune(cmd_queue,
							dp_scan, dp_data, dp_head_flags, n, mem_alloc);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
				}
				//-- n_pass_num; // the first pass did not count
				// run the thing

				f_time /= n_pass_num;
				f_time_pack /= n_pass_num;
				f_time_pphf /= n_pass_num;
				f_time_pphfno /= n_pass_num;
				size_t n_data = 3 * scan_data.size() * sizeof(uint32_t); // read data, write tile carry and compressed head flags, read data, write scans
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				if(m_b_strided_head_flags) {
					printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s (%.4f w/hf, %.4f ilv, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 4 / 3, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pphfno * 1e-9, n_data / f_time_pack * 1e-9 * 1 / 3); // only read head flags
				} else {
					printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s (%.4f w/hf, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 4 / 3, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pack * 1e-9 * 1 / 3); // only read head flags
				}
				// print results

				std::vector<uint32_t> seg_scan_cpu(n); // could work inplace but we still need a buffer for getting the GPU result(s)
				CTimer tcpu;
				seg_debug::CReference::Segmented_Scan(seg_scan_cpu, scan_data, hf.head_flags);
				printf("global segmented scan takes %f msec on CPU\n", tcpu.f_Time() * 1000);
				// perform a global segmented scan on CPU (the reference for the head flags of the last pass)

				bool b_test_correct;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, seg_scan_cpu.begin(),
				   seg_scan_cpu.end(), dp_scan, 0, "global segmented scan") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // not verified, do not report success below
					break;
				}
				bool b_test_correct_pphf;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphf, seg_scan_cpu.begin(),
				   seg_scan_cpu.end(), dp_scan_pphf, 0, "global segmented scan pphf") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // not verified, do not report success below
					break;
				}
				bool b_test_correct_pphfno = true;
				if(m_b_strided_head_flags) {
					if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphfno, seg_scan_cpu.begin(),
					   seg_scan_cpu.end(), dp_scan_pphfno, 0, "global segmented scan pphfno") != cl_Success) {
						fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
						b_results_correct = false; // not verified, do not report success below
						break;
					}
				}
				// reusable function, uses clnqueueMapBuffer()

				if(b_test_correct && b_test_correct_pphf && b_test_correct_pphfno)
					printf("done. global segmented scan of %lu items succeeded\n", (unsigned long)n);
				else {
					b_results_correct = false;
					break;
				}
				// make sure it is scanned correctly
			}
		}
		if(b_results_correct)
			printf("all tests finished correctly\n");
		else
			fprintf(stderr, "error: there were some errors\n");

		return true; // the benchmark did run; incorrect results are only reported above
	}
};

/**
 *	@brief OpenCL tiled reduce and scan primitive which carries its own configuration
 *
 *	This is a thin layer on top of CCLTiled_ReduceScan_Impl: it stores
 *	a CCLReductionConfig and supplies it to the implementation when compiling.
 */
class CCLTiled_ReduceScan : public CCLTiled_ReduceScan_Impl {
protected:
	CCLReductionConfig m_config; /**< @brief reduction configuration, passed to the implementation in Compile() */

public:
	/**
	 *	@brief constructor; forwards both arguments to the implementation
	 *
	 *	@param[in] n_workgroup_size is workgroup size (128 by default)
	 *	@param[in] n_tile_size is tile size (1024 by default)
	 */
	CCLTiled_ReduceScan(size_t n_workgroup_size = 128, size_t n_tile_size = 1024)
		:CCLTiled_ReduceScan_Impl(n_workgroup_size, n_tile_size)
	{}

	/**
	 *	@copydoc CCLReductionConfig::Set_ReduceOps()
	 */
	bool Set_ReduceOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		const char *p_s_finalize_op = "x", const char *p_s_identity = "0")
	{
		const bool b_ops_set = m_config.Set_ReduceOps(p_s_elem_op,
			p_s_reduce_op, p_s_finalize_op, p_s_identity);
		// simply stored in the owned configuration

		return b_ops_set;
	}

	/**
	 *	@copydoc CCLReductionConfig::Set_DataType()
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		const bool b_type_set = m_config.Set_DataType(p_s_data_type);
		// simply stored in the owned configuration

		return b_type_set;
	}

	/**
	 *	@brief gets the configuration held by this object
	 *	@return Returns const reference to the stored reduction configuration.
	 */
	const CCLReductionConfig &r_Configuration() const
	{
		const CCLReductionConfig &r_stored_config = m_config;
		return r_stored_config;
	}

	/**
	 *	@brief compiles the kernels, using the configuration stored in this object
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_want_scan is scan kernel build flag
	 *	@param[in] b_want_reduce is reduction kernel build flag
	 *	@param[in] b_verbose is verbosity flag (set to enable verbose)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (e.g. nvcc reports numbers of registers)
	 *	@param[in] b_use_nv_shuffle is NVIDIA specific shuffle instruction enable flag
	 *		(cleared automatically for devices that do not support it)
	 *	@param[in] b_use_Harris_scan is Harris-style workgroup cooperative scan flag
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Once the kernels are compiled, this function has no effect and
	 *		always returns true.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device,
		bool b_want_scan, bool b_want_reduce, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true,
		bool b_use_Harris_scan = false)
	{
		const bool b_compiled = CCLTiled_ReduceScan_Impl::Compile(h_context,
			h_device, m_config, b_want_scan, b_want_reduce, b_verbose,
			b_compiler_verbose, b_use_nv_shuffle, b_use_Harris_scan);
		// the implementation takes the configuration as an extra argument

		return b_compiled;
	}

	/*bool IntExScan(cl_command_queue h_cmd_queue, cl_mem dp_scan, const cl_mem dp_data,
		size_t n_elem_num, CCLTempBufferStack &r_memory_alloc) // only needed for segmented reduce tests
	{
		_ASSERTE(m_config.n_DataType_Size() == 4 &&
			strstr(m_config.p_s_DaraType(), "int32_t") != 0 &&
			!m_config.b_Has_ElementOp() && m_config.n_Reduce_Operator() == '+' &&
			!m_config.b_Has_FinalizeOp());
		// make sure this scans integers

		if(n_elem_num > m_n_tile_size) {
			size_t n_tile_num = (n_elem_num + m_n_tile_size - 1) / m_n_tile_size;
			CCLTempBufferReservation dp_temp(n_tile_num * m_config.n_DataType_Size(), r_memory_alloc);

			CLresult n_result = Enqueue_TileReduce(h_cmd_queue, dp_temp, n_tile_num, dp_data, n_elem_num);
			if(n_result != cl_Success)
				return false;
			// reduce tile partials

			if(!IntExScan(h_cmd_queue, dp_temp, dp_temp, n_elem_num, r_memory_alloc))
				return false;
			// scan them

			n_result = Enqueue_TileExScan_Downsweep(h_cmd_queue, dp_scan, dp_temp, dp_data, n_elem_num);
			if(n_result != cl_Success)
				return false;
			// downsweep
		} else {
			CLresult n_result = Enqueue_TileExScan(h_cmd_queue, dp_scan, dp_data, n_elem_num);
			if(n_result != cl_Success)
				return false;
		}
		return true;
	}*/
};

/**
 *	OpenCL tiled segmented reduce and scan primitive; holds its own configuration
 */
class CCLTiled_SegmentedReduceScan : public CCLTiled_SegmentedReduceScan_Impl {
protected:
	CCLReductionConfig m_config;

public:
	CCLTiled_SegmentedReduceScan(size_t n_workgroup_size = 128,
		size_t n_tile_size = 1024, bool b_strided_head_flags = false)
		:CCLTiled_SegmentedReduceScan_Impl(n_workgroup_size, n_tile_size, b_strided_head_flags)
	{}

	/**
	 *	@copydoc CCLReductionConfig::Set_ReduceOps()
	 */
	bool Set_ReduceOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		const char *p_s_finalize_op = "x", const char *p_s_identity = "0")
	{
		return m_config.Set_ReduceOps(p_s_elem_op, p_s_reduce_op, p_s_finalize_op, p_s_identity);
	}

	/**
	 *	@copydoc CCLReductionConfig::Set_DataType()
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		return m_config.Set_DataType(p_s_data_type);
	}

	const CCLReductionConfig &r_Configuration() const
	{
		return m_config;
	}

	/**
	 *	@brief compiles the kernels with the current settings
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_want_scan is scan kernel build flag
	 *	@param[in] b_want_reduce is reduction kernel build flag
	 *	@param[in] b_verbose is verbosity flag (set to enable verbose)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (e.g. nvcc reports numbers of registers)
	 *	@param[in] b_use_nv_shuffle is NVIDIA specific shuffle instruction enable flag
	 *		(cleared automatically for devices that do not support it)
	 *	@param[in] b_use_Harris_scan is Harris-style workgroup cooperative scan flag
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note Once the kernels are compiled, this function has no effect and
	 *		always returns true.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device,
		bool b_want_scan, bool b_want_reduce, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true)
	{
		return CCLTiled_SegmentedReduceScan_Impl::Compile(h_context, h_device, m_config,
			true, b_want_scan, b_want_reduce, b_want_reduce, b_verbose, b_compiler_verbose,
			b_use_nv_shuffle);
	}

	class CFakeGlobalExScan {
	protected:
		CCLTiled_ReduceScan &m_r_int_scan;

	public:
		CFakeGlobalExScan(CCLTiled_ReduceScan &r_int_scan)
			:m_r_int_scan(r_int_scan)
		{}

		CCLKernelCall Enqueue_ExScan(cl_command_queue h_cmd_queue,
			cl_mem dp_scan, const cl_mem dp_data, size_t n)
		{
			_ASSERTE(n <= m_r_int_scan.n_Tile_Size()); // otherwise not global
			return m_r_int_scan.Enqueue_TileExScan(h_cmd_queue, dp_scan, dp_data, n);
		}
	};

	bool Test_TileSegScan(cl_command_queue h_cmd_queue, cl_context h_context,
		CCLTiled_ReduceScan &r_int_scan)
	{
		_ASSERTE(r_int_scan.r_Configuration().n_DataType_Size() == 4 &&
			strstr(r_int_scan.r_Configuration().p_s_DaraType(), "int") != 0 &&
			!r_int_scan.r_Configuration().b_Has_ElementOp() &&
			r_int_scan.r_Configuration().n_Reduce_Operator() == '+' &&
			!r_int_scan.r_Configuration().b_Has_FinalizeOp());
		// make sure r_int_scan indeed scans integers (32 bit, don't really care
		// for signedness, will be working with small enough numbers)

		try {
			CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
			CCLContextWrapper context(h_context);

			CCLTempBufferStack gpu_alloc(h_context);

			size_t p_pass_size[] = {
				1, 666, 2001, 3000, 4000, 5005,
				m_n_tile_size, 2 * m_n_tile_size, 3 * m_n_tile_size, 4 * m_n_tile_size, 13 * m_n_tile_size,
				m_n_tile_size - 33, m_n_tile_size - 32, m_n_tile_size - 31, m_n_tile_size - 30,
				m_n_tile_size - 29, m_n_tile_size - 28, m_n_tile_size - 27, m_n_tile_size - 26,
				m_n_tile_size - 25, m_n_tile_size - 24, m_n_tile_size - 23, m_n_tile_size - 22,
				m_n_tile_size - 21, m_n_tile_size - 20, m_n_tile_size - 19, m_n_tile_size - 18,
				m_n_tile_size - 17, m_n_tile_size - 16, m_n_tile_size - 15, m_n_tile_size - 14,
				m_n_tile_size - 13, m_n_tile_size - 12, m_n_tile_size - 11, m_n_tile_size - 10,
				m_n_tile_size - 9, m_n_tile_size - 8, m_n_tile_size - 7, m_n_tile_size - 6,
				m_n_tile_size - 5, m_n_tile_size - 4, m_n_tile_size - 3, m_n_tile_size - 2,
				m_n_tile_size - 1, m_n_tile_size + 1, m_n_tile_size + 2, m_n_tile_size + 3,
				m_n_tile_size + 4, m_n_tile_size + 5, m_n_tile_size + 6, m_n_tile_size + 7,
				m_n_tile_size + 8, m_n_tile_size + 9, m_n_tile_size + 10, m_n_tile_size + 11,
				m_n_tile_size + 12, m_n_tile_size + 13, m_n_tile_size + 14, m_n_tile_size + 15,
				m_n_tile_size + 16, m_n_tile_size + 17, m_n_tile_size + 18, m_n_tile_size + 19,
				m_n_tile_size + 20, m_n_tile_size + 21, m_n_tile_size + 22, m_n_tile_size + 23,
				m_n_tile_size + 24, m_n_tile_size + 25, m_n_tile_size + 26, m_n_tile_size + 27,
				m_n_tile_size + 28, m_n_tile_size + 29, m_n_tile_size + 30, m_n_tile_size + 31,
				m_n_tile_size + 32, m_n_tile_size + 33,
				m_n_tile_size * 2 - 33, m_n_tile_size * 2 - 32, m_n_tile_size * 2 - 31, m_n_tile_size * 2 - 30,
				m_n_tile_size * 2 - 29, m_n_tile_size * 2 - 28, m_n_tile_size * 2 - 27, m_n_tile_size * 2 - 26,
				m_n_tile_size * 2 - 25, m_n_tile_size * 2 - 24, m_n_tile_size * 2 - 23, m_n_tile_size * 2 - 22,
				m_n_tile_size * 2 - 21, m_n_tile_size * 2 - 20, m_n_tile_size * 2 - 19, m_n_tile_size * 2 - 18,
				m_n_tile_size * 2 - 17, m_n_tile_size * 2 - 16, m_n_tile_size * 2 - 15, m_n_tile_size * 2 - 14,
				m_n_tile_size * 2 - 13, m_n_tile_size * 2 - 12, m_n_tile_size * 2 - 11, m_n_tile_size * 2 - 10,
				m_n_tile_size * 2 - 9, m_n_tile_size * 2 - 8, m_n_tile_size * 2 - 7, m_n_tile_size * 2 - 6,
				m_n_tile_size * 2 - 5, m_n_tile_size * 2 - 4, m_n_tile_size * 2 - 3, m_n_tile_size * 2 - 2,
				m_n_tile_size * 2 - 1, m_n_tile_size * 2 + 1, m_n_tile_size * 2 + 2, m_n_tile_size * 2 + 3,
				m_n_tile_size * 2 + 4, m_n_tile_size * 2 + 5, m_n_tile_size * 2 + 6, m_n_tile_size * 2 + 7,
				m_n_tile_size * 2 + 8, m_n_tile_size * 2 + 9, m_n_tile_size * 2 + 10, m_n_tile_size * 2 + 11,
				m_n_tile_size * 2 + 12, m_n_tile_size * 2 + 13, m_n_tile_size * 2 + 14, m_n_tile_size * 2 + 15,
				m_n_tile_size * 2 + 16, m_n_tile_size * 2 + 17, m_n_tile_size * 2 + 18, m_n_tile_size * 2 + 19,
				m_n_tile_size * 2 + 20, m_n_tile_size * 2 + 21, m_n_tile_size * 2 + 22, m_n_tile_size * 2 + 23,
				m_n_tile_size * 2 + 24, m_n_tile_size * 2 + 25, m_n_tile_size * 2 + 26, m_n_tile_size * 2 + 27,
				m_n_tile_size * 2 + 28, m_n_tile_size * 2 + 29, m_n_tile_size * 2 + 30, m_n_tile_size * 2 + 31,
				m_n_tile_size * 2 + 32, m_n_tile_size * 2 + 33,
				m_n_tile_size * 3 - 33, m_n_tile_size * 3 - 32, m_n_tile_size * 3 - 31, m_n_tile_size * 3 - 30,
				m_n_tile_size * 3 - 29, m_n_tile_size * 3 - 28, m_n_tile_size * 3 - 27, m_n_tile_size * 3 - 26,
				m_n_tile_size * 3 - 25, m_n_tile_size * 3 - 24, m_n_tile_size * 3 - 23, m_n_tile_size * 3 - 22,
				m_n_tile_size * 3 - 21, m_n_tile_size * 3 - 20, m_n_tile_size * 3 - 19, m_n_tile_size * 3 - 18,
				m_n_tile_size * 3 - 17, m_n_tile_size * 3 - 16, m_n_tile_size * 3 - 15, m_n_tile_size * 3 - 14,
				m_n_tile_size * 3 - 13, m_n_tile_size * 3 - 12, m_n_tile_size * 3 - 11, m_n_tile_size * 3 - 10,
				m_n_tile_size * 3 - 9, m_n_tile_size * 3 - 8, m_n_tile_size * 3 - 7, m_n_tile_size * 3 - 6,
				m_n_tile_size * 3 - 5, m_n_tile_size * 3 - 4, m_n_tile_size * 3 - 3, m_n_tile_size * 3 - 2,
				m_n_tile_size * 3 - 1, m_n_tile_size * 3 + 1, m_n_tile_size * 3 + 2, m_n_tile_size * 3 + 3,
				m_n_tile_size * 3 + 4, m_n_tile_size * 3 + 5, m_n_tile_size * 3 + 6, m_n_tile_size * 3 + 7,
				m_n_tile_size * 3 + 8, m_n_tile_size * 3 + 9, m_n_tile_size * 3 + 10, m_n_tile_size * 3 + 11,
				m_n_tile_size * 3 + 12, m_n_tile_size * 3 + 13, m_n_tile_size * 3 + 14, m_n_tile_size * 3 + 15,
				m_n_tile_size * 3 + 16, m_n_tile_size * 3 + 17, m_n_tile_size * 3 + 18, m_n_tile_size * 3 + 19,
				m_n_tile_size * 3 + 20, m_n_tile_size * 3 + 21, m_n_tile_size * 3 + 22, m_n_tile_size * 3 + 23,
				m_n_tile_size * 3 + 24, m_n_tile_size * 3 + 25, m_n_tile_size * 3 + 26, m_n_tile_size * 3 + 27,
				m_n_tile_size * 3 + 28, m_n_tile_size * 3 + 29, m_n_tile_size * 3 + 30, m_n_tile_size * 3 + 31,
				m_n_tile_size * 3 + 32, m_n_tile_size * 3 + 33};
			for(size_t n_pass = 0; n_pass < sizeof(p_pass_size) / sizeof(p_pass_size[0]); ++ n_pass) {
				const size_t n = p_pass_size[n_pass];
				const size_t n_tile_num = (n + m_n_tile_size - 1) / m_n_tile_size;

				std::vector<uint32_t> data(n);

				for(size_t i = 0; i < n; ++ i)
					data[i] = (uint32_t)(i + 1);
				for(size_t i = 0; i < n; ++ i)
					std::swap(data[i], data[CUniformIntegerDistribution<size_t>(i, n - 1)(CCLibGenerator<false>())]);
				// generate some data

				seg_debug::THeadFlag_DebugInfo seg;
				seg.Reset(n);
				// segmented info

				CCLUniqueMem dp_data(context.h_CreateBuffer(n * sizeof(uint32_t), &data.front(), CL_MEM_COPY_HOST_PTR));
				CCLUniqueMem dp_flags(context.h_CreateBuffer(n * sizeof(uint32_t)));
				// alloc source buffers, copy data there

				CCLUniqueMem dp_scan(context.h_CreateBuffer(n * sizeof(uint32_t)));
				CCLUniqueMem dp_segscan(context.h_CreateBuffer(n * sizeof(uint32_t)));
				CCLUniqueMem dp_reductions(context.h_CreateBuffer((n + 1) * sizeof(uint32_t))); // size r!
				CCLUniqueMem dp_tile_carry(context.h_CreateBuffer(n_tile_num * sizeof(uint32_t)));
				CCLUniqueMem dp_tile_flags(context.h_CreateBuffer(n_tile_num * sizeof(uint32_t)));
				CCLUniqueMem dp_tile_tail_counts_scan(context.h_CreateBuffer(n_tile_num * sizeof(uint32_t)));
				CCLUniqueMem dp_packed_head_flags(context.h_CreateBuffer(n_PackedHeadFlags_Size(n)));
				// alloc result buffers

				seg_debug::CSegmentedOp_SimplePatterns_Benchmark b0;
				seg_debug::CSegmentedOp_TravellingHead_Benchmark b1;
				seg_debug::CSegmentedOp_UniformSize_Benchmark b2;
				seg_debug::CSegmentedOp_BeginEndMiddle_Benchmark b3(m_n_tile_size);
				seg_debug::CSegmentedOp_Random_Benchmark b4;
				// prepare all the benchmarks

				const size_t n_benchmark_num0 = b0.n_Benchmark_Num(n);
				const size_t n_benchmark_num1 = n_benchmark_num0 + b1.n_Benchmark_Num(n);
				const size_t n_benchmark_num2 = n_benchmark_num1 + b2.n_Benchmark_Num(n);
				const size_t n_benchmark_num3 = n_benchmark_num2 + b3.n_Benchmark_Num(n);
				const size_t n_benchmark_num = n_benchmark_num3 + b4.n_Benchmark_Num(n);
				// calculate prefix sum of the benchmark lengths

				for(size_t n_benchmark = 0; n_benchmark < n_benchmark_num; ++ n_benchmark) {
					seg.Reset(n);
					// clear head flags! otherwise will be just scanning all ones, all the time

					std::vector<uint32_t> &head_flags = seg.head_flags;
					std::vector<size_t> &head_indices = seg.head_indices;
					std::vector<size_t> &tail_indices = seg.tail_indices;

					const size_t r =
						(n_benchmark < n_benchmark_num0)? b0.n_Generate_HeadFlags(seg, n, n_benchmark - 0) :
						(n_benchmark < n_benchmark_num1)? b1.n_Generate_HeadFlags(seg, n, n_benchmark - n_benchmark_num0) :
						(n_benchmark < n_benchmark_num2)? b2.n_Generate_HeadFlags(seg, n, n_benchmark - n_benchmark_num1) :
						(n_benchmark < n_benchmark_num3)? b3.n_Generate_HeadFlags(seg, n, n_benchmark - n_benchmark_num2) :
														  b4.n_Generate_HeadFlags(seg, n, n_benchmark - n_benchmark_num3);
					// generate benchmark flags, calculate head flags

					seg.Fill_TailIndices(r);
					// calculate tail indices from head flags

					CLresult n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(dp_flags, 0, &head_flags.front(), n * sizeof(uint32_t));
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// copy head flags

					std::vector<uint32_t> seg_scan_cpu(n), tile_carry_cpu(n_tile_num),
						tile_flags_cpu(n_tile_num), tile_tailcounts_cpu(n_tile_num);
					seg_debug::CReference::Segmented_TileScan(seg_scan_cpu, tile_tailcounts_cpu,
						tile_carry_cpu, tile_flags_cpu, data, head_flags, m_n_tile_size, m_n_workgroup_size);
					// calculate reference

					bool b_good_scan, b_good_carry, b_good_flags;
					for(int n_type = 0; n_type < 3; ++ n_type) {
						if(!n_type) {
							n_result = Enqueue_TileSegScan_Carry(cmd_queue, dp_tile_carry,
								dp_tile_flags, dp_data, dp_flags, n);
						} else if(n_type == 1) {
							n_result = Enqueue_TileSegScan_Carry_Pack(cmd_queue, dp_tile_carry,
								dp_tile_flags, dp_packed_head_flags, dp_data, dp_flags, n);
						} else if(n_type == 2) {
							n_result = Enqueue_TileSegScan_Carry_Packed(cmd_queue, dp_tile_carry,
								dp_tile_flags, dp_data, dp_packed_head_flags, n);
						}
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						if(n_type != 2) {
							n_result = Enqueue_TileSegScan(cmd_queue, dp_scan,
								/*dp_tile_carry, dp_tile_flags,*/ dp_data, dp_flags, n);
						} else {
							n_result = Enqueue_TileSegScan_Packed(cmd_queue, dp_scan,
								/*dp_tile_carry, dp_tile_flags,*/ dp_data, dp_packed_head_flags, n);
						}
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						// call the kernel

						n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_scan, seg_scan_cpu.begin(),
							seg_scan_cpu.end(), dp_scan, 0, (n_type == 0)? "segmented tile scan" :
							(n_type == 1)? "segmented tile scan pack" : "segmented tile scan packed");
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_carry, tile_carry_cpu.begin(),
							tile_carry_cpu.end(), dp_tile_carry, 0, (n_type == 0)? "segmented tile carry" :
							(n_type == 1)? "segmented tile carry pack" : "segmented tile carry packed"); // the last one does not matter
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_flags, tile_flags_cpu.begin(),
							tile_flags_cpu.end(), dp_tile_flags, 0, (n_type == 0)? "segmented tile flags" :
							(n_type == 1)? "segmented tile flags pack" : "segmented tile flags packed");
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}

						if(!b_good_scan || !b_good_carry || !b_good_flags) {
							fprintf(stderr, "error: failed at test type %d\n", n_type);
							break;
						}
					}
					if(!b_good_scan || !b_good_carry || !b_good_flags) {
						fprintf(stderr, "error: failed at test " PRIsize "\n", n_benchmark);
						return false;
					} else {
						printf("debug: test_scan " PRIsize " / " PRIsize " (" PRIsize " (" PRIsize 
							" tiles), " PRIsize ") good scan, flags and carry    \r", n_benchmark,
							n_benchmark_num/*2 + n + (n - 2) + n_bme_head_benchmark_len + n*/, n, n_tile_num, r);
					}

					std::vector<uint32_t> global_scan_ref;
					//std::vector<uint32_t> tile_tailcounts_cpu/*, tile_head_flags_cpu*/; // will use that to check the reduce bootstrap kernel
					{
						//TiledSegmented_Scan_HF(seg_scan_cpu, tile_carry_cpu, tile_head_flags_cpu, data, head_flags);

						std::vector<uint32_t> scanned_carry;
						seg_debug::CReference::Segmented_SpineScan(scanned_carry, tile_carry_cpu, tile_flags_cpu); // cheating, using a full scan
						//scanned_carry.erase(scanned_carry.end() - 1);
						//scanned_carry.insert(scanned_carry.begin(), 0); // not really exclusive scan (since the counters after head flag are not reset to zero) -- just inclusive scan prefixed with a zero (or a skewed read of an inclusive scan)
						// scan carry outs using tile flags as head flags (can do that recursively)

						std::vector<uint32_t> global_scan/*, global_carryouts*/;
						seg_debug::CReference::Segmented_TileScan_Downsweep(global_scan, /*global_carryouts,*/
							//tile_tailcounts_cpu, // returns tail counts!
							data, head_flags, scanned_carry, m_n_tile_size/*, m_n_workgroup_size*/);

						seg_debug::CReference::Segmented_Scan(global_scan_ref, data, head_flags);

						seg_debug::n_Debug_CompareArrays_Verbose(global_scan, global_scan_ref, n, "global CPU segmented scan");
						bool b_good_global_scan = global_scan_ref == global_scan;
						if(!b_good_global_scan) {
							fprintf(stderr, "error: global CPU scan failed at test " PRIsize "\n", n_benchmark);
							return false;
						}
					}
					// test the global scan concept on CPU

					_ASSERTE(n_tile_num <= n_Tile_Size()); // would need another level of scan, not implemented in this benchmark function
					n_result = Enqueue_TileSegScan(cmd_queue, dp_tile_carry, dp_tile_carry, dp_tile_flags, n_tile_num);
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_Finish();
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// seg-scan the tile carry (using tile scan but really want a global scan)

					bool b_good_segscan;
					for(int n_type = 0; n_type < 1; ++ n_type) {
						if(!n_type) {
							n_result = Enqueue_TileSegScan_Downsweep(cmd_queue, dp_segscan,
								dp_data, dp_tile_carry, dp_flags, n);
						} else if(n_type == 1) {
							n_result = Enqueue_TileSegScan_Downsweep_Packed(cmd_queue, dp_segscan,
								dp_data, dp_tile_carry, dp_packed_head_flags, n);
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}
						// call the kernel

						n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_segscan, global_scan_ref.begin(),
							global_scan_ref.end(), dp_segscan, 0, (n_type == 0)? "global tile scan" :
							(n_type == 1)? "global tile scan packed" : "(null)");
						if(n_result != cl_Success) {
							fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
								__FILE__, __LINE__, n_result);
							return false;
						}

						if(!b_good_segscan) {
							fprintf(stderr, "error: failed at downsweep test type %d\n", n_type);
							break;
						}
					}
					if(!b_good_segscan) {
						fprintf(stderr, "error: failed at downsweep test " PRIsize "\n", n_benchmark);
						return false;
					}

					n_result = Enqueue_TileSegReduce_Bootstrap_Pack(cmd_queue, dp_tile_carry,
						dp_tile_flags, dp_packed_head_flags, dp_flags, n);
					/*n_result = Enqueue_TileSegReduce_Bootstrap(cmd_queue, dp_tile_carry,
						dp_tile_flags, dp_flags, n);*/
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_Finish();
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// call the kernel

					bool b_good_tail_counts, b_good_tail_flags;
					n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_tail_counts, tile_tailcounts_cpu.begin(),
						tile_tailcounts_cpu.end(), dp_tile_carry, 0, "segmented reduce tile counts");
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_tail_flags, tile_flags_cpu.begin(),
						tile_flags_cpu.end() /*- 1*/, dp_tile_flags, 0, "segmented reduce tile flags"); // can't ignore the last one, will need it for reduction spine fix
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// check tail counts

					if(!b_good_tail_counts || !b_good_tail_flags) {
						fprintf(stderr, "error: failed at segmented reduce bootstrap test " PRIsize " (%d, %d)\n",
							n_benchmark, (b_good_tail_counts)? 1 : 0, (b_good_tail_flags)? 1 : 0);
						return false;
					}

					std::vector<uint32_t> device_tail_counts_raw(n_tile_num, -1);
					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&device_tail_counts_raw[0], dp_tile_carry,
						0, n_tile_num * sizeof(uint32_t));
					// save this before overwriting dp_tile_carry in one of the kernel calls below

					_ASSERTE(n_tile_num <= r_int_scan.n_Tile_Size()); // would need another level of scan, not implemented in this benchmark function
					n_result = r_int_scan.Enqueue_TileExScan(cmd_queue, dp_tile_tail_counts_scan, dp_tile_carry, n_tile_num);
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_Finish();
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// scan the tail counts (even if n_tile_num = 1, this writes a 0 there)

					const uint32_t p_sentinell[] = {0xdeafbeef, 0xbaadf00d};
					n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(dp_reductions,
						(r - 1) * sizeof(uint32_t), p_sentinell, 2 * sizeof(uint32_t));
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}

					/*n_result = Enqueue_TileSegReduce(cmd_queue, dp_reductions, dp_tile_carry,
						dp_tile_tail_counts_scan, dp_data, dp_flags, n);*/
					//n_result = Enqueue_Pack_HeadFlags(cmd_queue, dp_packed_head_flags, dp_flags, n);
					n_result = Enqueue_TileSegReduce_Packed(cmd_queue, dp_reductions, dp_tile_carry,
						dp_tile_tail_counts_scan, dp_data, dp_packed_head_flags, n);
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_Finish();
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// call the tile reduction kernel

					std::vector<uint32_t> tile_reduction_carry(n_tile_num), tile_reductions(r + 1); // only r of those
					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&tile_reductions.front(), dp_reductions, 0, (r + 1) * sizeof(uint32_t));
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&tile_reduction_carry.front(), dp_tile_carry, 0, n_tile_num * sizeof(uint32_t));
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// copy the results back

					if(tile_reductions[tile_reductions.size() - 2] == p_sentinell[0])
						fprintf(stderr, "warning: seems like Enqueue_TileSegReduce() did not write the last element of the reductions array  \n");
					if(tile_reductions.back() != p_sentinell[1])
						fprintf(stderr, "warning: Enqueue_TileSegReduce() overwrite past bounds in the reductions array  \n");
					tile_reductions.erase(tile_reductions.end() - 1);
					// sentinell check

					std::vector<uint32_t> tile_tail_scan(n_tile_num);
					cmd_queue.n_Enqueue_Memcpy_DtoH(&tile_tail_scan[0], dp_tile_tail_counts_scan,
						0, n_tile_num * sizeof(uint32_t));
					if(tile_tail_scan[0])
						fprintf(stderr, "warning: dp_tile_tail_counts_scan is not exclusive  \n");
					// get tile tail indices

					std::vector<uint32_t> //tile_carry_before(n_tile_num, -1),
						tile_carry_segscan(n_tile_num, -1), device_tile_flags(n_tile_num, -1);
					//n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&tile_carry_before[0], dp_tile_carry,
					//	0, n_tile_num * sizeof(uint32_t)); // debug
					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&device_tile_flags[0], dp_tile_flags,
						0, n_tile_num * sizeof(uint32_t));
					n_result = Enqueue_TileSegScan(cmd_queue, dp_tile_carry, dp_tile_carry, dp_tile_flags, n_tile_num);
					n_result = cmd_queue.n_Finish();
					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&tile_carry_segscan[0], dp_tile_carry,
						0, n_tile_num * sizeof(uint32_t));
					// get tile carry segmented scan

					std::vector<uint32_t> segreduce_cpu(r, -1);
					{
						size_t n_error_num = 0;
						for(size_t i = 0; i < r; ++ i) {
							uint32_t n_partial_reduction = tile_reductions[i];
							uint32_t n_partial_reduction_cpu = seg_scan_cpu[tail_indices[i]];
							if(n_partial_reduction != n_partial_reduction_cpu) {
								if(++ n_error_num < 100) {
									fprintf(stderr, "error: %s[" PRIsize "] = %u (should be %u)\n",
										"partial segmented reduction", i, n_partial_reduction, n_partial_reduction_cpu);
								}
							}
						}
						if(n_error_num) {
							fprintf(stderr, "error: %s had " PRIsize " error(s)\n",
								"partial segmented reduction", n_error_num);
							return false;
						}

						size_t n_sum = std::accumulate(data.begin(), data.end(), uint32_t(0));

						size_t n_tail_num = tail_indices.size();
						for(size_t i = 0; i < n_tile_num - 1; ++ i) { // the -1 is needed for the second version (the first version could start looping with 1)
							/*//if((tile_tail_scan[i] < ((i + 1 < n_tile_num)? tile_tail_scan[i + 1] : n_tail_num)) && // n_tail_num inconvenient in the kernel
							//if((device_tile_flags[i] || i + 1 == n_tile_num) && // this doesn't work, suppose it is skewed one bit to the right
							if((i + 1 == n_tile_num || tile_tail_scan[i] < tile_tail_scan[i + 1]) && // only if there is a tail in this tile
							   !head_flags[i * m_n_tile_size]) // but also in case the first element of the tile is not reset by a head flag
								tile_reductions[tile_tail_scan[i]] += (i)? tile_carry_segscan[i - 1] : 0;*/
							// works but requires skewed read. we can only do skewed write though!

							if((i + 2 == n_tile_num || device_tail_counts_raw[i + 1] > 0/*tile_tail_scan[i + 1] < tile_tail_scan[i + 2]*/) && // only if there is a tail in this tile
							   !head_flags[(i + 1) * m_n_tile_size]) // but also in case the first element of the tile is not reset by a head flag
								tile_reductions[tile_tail_scan[i + 1]] += tile_carry_segscan[i];
							// skewed write, save one iteration (the first one)
						}
						// fixup tile reductions to be global reductions (this should be done in the last level of scan by a modified kernel)

						size_t n_sum2 = std::accumulate(tile_reductions.begin(), tile_reductions.end(), uint32_t(0));

						n_error_num = 0;
						for(size_t i = 0; i < r; ++ i) {
							uint32_t n_global_reduction = tile_reductions[i];
							uint32_t n_global_reduction_cpu = global_scan_ref[tail_indices[i]];
							segreduce_cpu[i] = n_global_reduction_cpu;
							if(n_global_reduction != n_global_reduction_cpu) {
								if(++ n_error_num < 100) {
									fprintf(stderr, "error: %s[" PRIsize "] = %u (should be %u)\n",
										"global segmented reduction", i, n_global_reduction, n_global_reduction_cpu);
								}
							}
						}
						if(n_error_num) {
							fprintf(stderr, "error: %s had " PRIsize " error(s)\n",
								"global segmented reduction", n_error_num);
							return false;
						}
						// check global reduction concept (calculated mostly on CPU)
					}
					tile_carry_cpu.back() = 0; // the last reduction thread clears it; we don't need it though
					tile_reduction_carry.back() = 0; // sometimes doesn't?
					bool b_good_redcarry = tile_carry_cpu == tile_reduction_carry;
					if(!b_good_redcarry) {
						fprintf(stderr, "error: failed at segment reduce test " PRIsize " (%u, %u)\n",
							n_benchmark, tile_carry_cpu.back(), tile_reduction_carry.back());
						return false;
					}
					// check results

					n_result = n_Enqueue_SegmentedScan_NoAutoTune(cmd_queue, dp_segscan, dp_data, dp_flags, n, gpu_alloc);
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, 0);
						return false;
					}
					n_result = cmd_queue.n_Finish();
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					// call the tile reduction kernel

					bool b_good_gscan;
					n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_gscan, global_scan_ref.begin(),
						global_scan_ref.end(), dp_segscan, 0, "global segmented scan");
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					if(!b_good_gscan) {
						fprintf(stderr, "error: failed at global segment scan test " PRIsize "\n",
							n_benchmark/*, segmented_scan.back(), global_scan_ref.back()*/);
						return false;
					}

					size_t n_reds = r;
					n_result = n_Enqueue_SegmentedReduce_NoAutoTune(cmd_queue, *const_cast<cl_mem*>(&dp_reductions),
						n_reds, r, dp_data, dp_flags, n, CFakeGlobalExScan(r_int_scan), gpu_alloc);
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					bool b_good_greduce;
					n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_greduce, segreduce_cpu.begin(),
						segreduce_cpu.end(), dp_reductions, 0, "global segmented reduce");
					if(n_result != cl_Success) {
						fprintf(stderr, "error: OpenCL opetration %s line %d failed with %d\n",
							__FILE__, __LINE__, n_result);
						return false;
					}
					if(!b_good_greduce) {
						fprintf(stderr, "error: failed at global segment reduce test " PRIsize "\n", n_benchmark);
						//return false;
					}
				}
				printf("\npass finished\n");
			}
			printf("\nall benchmarks passed (incl. global segmented scan and reduce)\n");

			return true;
		} catch(std::bad_alloc&) {
			fprintf(stderr, "\nerror: not enough memory\n");
			return false;
		}
	}
};

#endif // !__OPENCL_TILED_SCAN_REDUCTION_INCLUDED
