/*
								+--------------------------------+
								|                                |
								|   ***  OpenCL utilities  ***   |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|          ClUtils.cpp           |
								|                                |
								+--------------------------------+
*/

/**
 *	@file gpgpu/ClUtils.cpp
 *	@date 2010
 *	@author -tHE SWINe-
 *	@brief OpenCL utilities
 */

#include "../NewFix.h"
#include "../CallStack.h"
#include <stdio.h>
#include <algorithm>
#include <map>
#if defined(_WIN32) || defined(_WIN64)
#include <io.h> //_access
#define access(f,m) _access(f,m)
#else // _WIN32 || _WIN64
#include <unistd.h>
#include <errno.h> // program_invocation_name
#endif // _WIN32 || _WIN64
#include "../Integer.h"
#include "../MinMax.h"
#include "../Dir.h" // PRIsizeB
#include "../Hash.h"
#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER > 1200
#include <stdint.h>
#else // !_MSC_VER || __MWERKS__ || _MSC_VER > 1200
typedef ptrdiff_t intptr_t;
#endif // !_MSC_VER || __MWERKS__ || _MSC_VER > 1200
#include <limits.h>
//#include <CL/opencl.h> // don't
#include "ClUtils.h"

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for) && _MSC_VER <= 1200
#define for if(0) {} else for
#endif // _MSC_VER && !__MWERKS__ && !for && _MSC_VER <= 1200
// msvc 'for' scoping hack

/*
 *								=== CCLUtils::TDevice_GFLOPS ===
 */

/**
 *	@brief calculates a relative compute-power score of a device
 *
 *	@param[in] h_device is handle to the device to be scored
 *
 *	@return Returns the product of the device maximum clock frequency and its number
 *		of compute units (a relative score, not real GFLOPS), or -1 if either of
 *		the two device queries fails.
 */
double CCLUtils::TDevice_GFLOPS::operator ()(cl_device_id h_device) const
{
	cl_uint n_clock_frequency;
	if(clGetDeviceInfo(h_device, CL_DEVICE_MAX_CLOCK_FREQUENCY,
	   sizeof(n_clock_frequency), &n_clock_frequency, NULL) != CL_SUCCESS)
		return -1;
	// get the maximum clock frequency

	cl_uint n_compute_unit_num;
	if(clGetDeviceInfo(h_device, CL_DEVICE_MAX_COMPUTE_UNITS,
	   sizeof(n_compute_unit_num), &n_compute_unit_num, NULL) != CL_SUCCESS)
		return -1;
	// get the number of compute units

	return double(n_clock_frequency) * n_compute_unit_num;
}

/*
 *								=== ~CCLUtils::TDevice_GFLOPS ===
 */

/*
 *								=== CCLUtils::TDevice_GlobalMemSize ===
 */

/**
 *	@brief gets the size of global memory of a device
 *
 *	@param[in] h_device is handle to the device to be queried
 *
 *	@return Returns the value of CL_DEVICE_GLOBAL_MEM_SIZE, or 0 if the query fails
 *		(0 is used rather than -1 as that would convert to the maximum value).
 */
cl_ulong CCLUtils::TDevice_GlobalMemSize::operator ()(cl_device_id h_device) const
{
	cl_ulong n_global_mem_size;
	const cl_int n_status = clGetDeviceInfo(h_device, CL_DEVICE_GLOBAL_MEM_SIZE,
		sizeof(n_global_mem_size), &n_global_mem_size, NULL);
	// query the memory size

	if(n_status == CL_SUCCESS)
		return n_global_mem_size;
	return 0; // -1 would be maximum
}

/*
 *								=== ~CCLUtils::TDevice_GlobalMemSize ===
 */

/*
 *								=== CCLUtils::TDevice_ComputeCapability ===
 */

/**
 *	@brief gets NVIDIA compute capability of a device as a single integer
 *
 *	@param[in] h_device is handle to the device to be queried
 *
 *	@return Returns major * 100 + minor compute capability, or -1 if the device
 *		parameters could not be queried or if the device is not reported as NVIDIA.
 */
int CCLUtils::TDevice_ComputeCapability::operator ()(cl_device_id h_device) const
{
	CCLDeviceParams device_params(h_device);
	if(device_params.b_Status() && device_params.b_Is_NVIDIA()) {
		return device_params.n_NV_ComputeCap_Major() * 100 +
			device_params.n_NV_ComputeCap_Minor();
	}
	// only devices with successfully queried parameters that are NVIDIA have it

	return -1;
}

/*
 *								=== ~CCLUtils::TDevice_ComputeCapability ===
 */

/*
 *								=== CCLUtils::TDevice_OpenCL_C_Version ===
 */

/**
 *	@brief gets OpenCL C version supported by a device as a single integer
 *
 *	If CL_DEVICE_OPENCL_C_VERSION is defined at compile time, the version is parsed
 *	from the device info string, which is expected to begin with "OpenCL C major.minor".
 *	Otherwise version 1.0 is assumed without querying the device.
 *
 *	@param[in] h_device is handle to the device to be queried
 *
 *	@return Returns major * 100 + minor version, or -1 if the device parameters
 *		could not be queried, the info string could not be read or it could not be parsed.
 */
int CCLUtils::TDevice_OpenCL_C_Version::operator ()(cl_device_id h_device) const
{
	int n_hi, n_lo;
#ifdef CL_DEVICE_OPENCL_C_VERSION
	CCLDeviceParams params(h_device);
	std::string s_info;
	// note that the condition below is completed by one of the two preprocessor
	// branches; both parse the same format and both must yield exactly two numbers
	if(!params.b_Status() || params.n_GetDeviceInfoString(s_info,
	   CL_DEVICE_OPENCL_C_VERSION) != CL_SUCCESS ||
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
	   sscanf_s(s_info.c_str(), "OpenCL C %d.%d", &n_hi, &n_lo) != 2) // use the safe function to avoid warnings
#else // _MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
	   sscanf(s_info.c_str(), "OpenCL C %d.%d", &n_hi, &n_lo) != 2)
#endif // _MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		return -1;
#else // CL_DEVICE_OPENCL_C_VERSION
	n_hi = 1;
	n_lo = 0;
	// OpenCL 1.0 does not support CL_DEVICE_OPENCL_C_VERSION
#endif // CL_DEVICE_OPENCL_C_VERSION
	return n_hi * 100 + n_lo; // encode as a single integer, e.g. 1.2 becomes 102
}

/*
 *								=== ~CCLUtils::TDevice_OpenCL_C_Version ===
 */

/*
 *								=== CCLUtils ===
 */

/**
 *	@brief gets the list of all the available OpenCL platforms
 *
 *	@param[out] r_platform_list is filled with platform ids (cleared at the beginning,
 *		so it is empty if the platform count query fails or if there are no platforms)
 *
 *	@return Returns cl_Success on success, cl_Platform_Not_Found_KHR if there are
 *		no platforms, cl_Out_Of_Host_Memory if the list could not be allocated,
 *		or the error code returned by clGetPlatformIDs().
 */
CLresult CCLUtils::n_Get_PlatformList(std::vector<cl_platform_id> &r_platform_list)
{
	r_platform_list.clear();
	// !!

	cl_uint n_platform_count = 0;
	CLresult n_status = (CLresult)clGetPlatformIDs(0, NULL, &n_platform_count);
	if(n_status != CL_SUCCESS)
		return n_status; // error
	if(!n_platform_count)
		return cl_Platform_Not_Found_KHR; // no OpenCL capable platform available
	// the first call only gets the number of OpenCL platforms

	try {
		r_platform_list.resize(n_platform_count);
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}
	_ASSERTE(r_platform_list.size() <= /*CL*/UINT_MAX);
	// allocate storage for the ids

	n_status = (CLresult)clGetPlatformIDs(cl_uint(r_platform_list.size()),
		&r_platform_list[0], &n_platform_count);
	if(n_status != CL_SUCCESS)
		return n_status;
	_ASSERTE(r_platform_list.size() == n_platform_count);
	// the second call gets the list of platform id's

	return cl_Success;
}

/**
 *	@brief finds the first "FULL_PROFILE" platform which has devices of the given type
 *
 *	If no such platform is found, the first platform in the list is chosen instead.
 *
 *	@param[out] p_platform is pointer to the platform id to be written
 *	@param[in] n_device_type is type of devices the platform needs to contain
 *
 *	@return Returns cl_Success on success, or the error code
 *		returned by n_Get_PlatformList() (p_platform is not written in that case).
 */
CLresult CCLUtils::n_Get_FullProfile_Platform(cl_platform_id *p_platform,
	cl_device_type n_device_type /*= CL_DEVICE_TYPE_GPU*/)
{
	std::vector<cl_platform_id> platforms;
	const CLresult n_list_status = n_Get_PlatformList(platforms);
	if(n_list_status != CL_SUCCESS)
		return n_list_status;
	// get all the platforms (the list is not empty on success)

	size_t n_chosen_platform = 0; // if no full platform is found, just choose the first one
	for(size_t n_index = 0, n_platform_num = platforms.size(); n_index < n_platform_num; ++ n_index) {
		char p_s_profile[20] = {0};
		clGetPlatformInfo(platforms[n_index], CL_PLATFORM_PROFILE, sizeof(p_s_profile), p_s_profile, NULL);
		p_s_profile[sizeof(p_s_profile) / sizeof(p_s_profile[0]) - 1] = 0; // make sure it's terminated
		// get the profile name (the result is not checked; the buffer stays zeroed if nothing is written)

		if(strcmp(p_s_profile, "FULL_PROFILE") != 0)
			continue;
		// not a full profile platform

		cl_uint n_device_count;
		if(clGetDeviceIDs(platforms[n_index], n_device_type, 0, 0, &n_device_count) != CL_SUCCESS ||
		   !n_device_count)
			continue;
		// make sure that the platform contains any devices that we are interested in

		n_chosen_platform = n_index;
		break;
	}
	// find "FULL_PROFILE" platform

	*p_platform = platforms[n_chosen_platform];

	return cl_Success;
}

/**
 *	@brief creates OpenCL context for an explicit list of devices
 *
 *	@param[out] p_context is pointer where the result of clCreateContext() is written
 *	@param[in] h_platform is platform handle; if null, no context properties are passed
 *	@param[in] n_device_num is number of devices in p_device
 *	@param[in] p_device is list of devices to create the context for
 *
 *	@return Returns the error code reported by clCreateContext().
 */
CLresult CCLUtils::n_OpenCL_Init(cl_context *p_context, cl_platform_id h_platform,
	size_t n_device_num, const cl_device_id *p_device)
{
	cl_context_properties p_platform_props[3];
	p_platform_props[0] = cl_context_properties(CL_CONTEXT_PLATFORM);
	p_platform_props[1] = cl_context_properties(h_platform);
	p_platform_props[2] = 0;
	// property list selecting the platform, zero-terminated

	const cl_context_properties *p_used_props = 0;
	if(h_platform)
		p_used_props = p_platform_props;
	// only specify the properties if the platform is given

	cl_int n_status;
	*p_context = clCreateContext(p_used_props, (cl_uint)n_device_num, p_device, 0, 0, &n_status);
	return (CLresult)n_status;
}

/**
 *	@brief creates OpenCL context for all the devices of a given type
 *
 *	@param[out] p_context is pointer to the context handle (written only on success)
 *	@param[in] n_device_type is type of devices to create the context for
 *	@param[in] b_implementation_profile_selection is platform selection flag; if set,
 *		no properties are passed to clCreateContextFromType(), otherwise the platform
 *		chosen by n_Get_FullProfile_Platform() is requested
 *
 *	@return Returns cl_Success on success, or the error code returned by
 *		n_Get_FullProfile_Platform() or by clCreateContextFromType().
 */
CLresult CCLUtils::n_OpenCL_Init(cl_context *p_context, int n_device_type, bool b_implementation_profile_selection)
{
	cl_int n_status;
	cl_context h_new_context;
	if(b_implementation_profile_selection) {
		h_new_context = clCreateContextFromType(NULL, n_device_type, 0, 0, &n_status);
		// let the implementation choose the platform
	} else {
		cl_platform_id h_platform;
		const CLresult n_platform_status = n_Get_FullProfile_Platform(&h_platform, n_device_type);
		if(n_platform_status != CL_SUCCESS)
			return n_platform_status;
		// get the first available "FULL_PROFILE" OpenCL platform

		cl_context_properties p_props[3] = {cl_context_properties(CL_CONTEXT_PLATFORM),
			cl_context_properties(h_platform), 0};
		h_new_context = clCreateContextFromType(p_props, n_device_type, 0, 0, &n_status);
		// create context on that platform
	}
	if(n_status != CL_SUCCESS)
		return (CLresult)n_status;
	// create context

	*p_context = h_new_context;

	return cl_Success;
}

/**
 *	@brief gets the list of devices associated with a context
 *
 *	@param[in] h_context is handle to the context to be queried
 *	@param[out] r_device_list is filled with device ids (resized to the number of devices)
 *
 *	@return Returns cl_Success on success, cl_Device_Not_Found if the context
 *		reports no devices, cl_Out_Of_Host_Memory if the list could not be
 *		allocated, or the error code returned by clGetContextInfo().
 */
CLresult CCLUtils::n_GetDeviceList(cl_context h_context, std::vector<cl_device_id> &r_device_list)
{
	size_t n_list_size_bytes;
	CLresult n_status = (CLresult)clGetContextInfo(h_context, CL_CONTEXT_DEVICES, 0, 0, &n_list_size_bytes);
	if(n_status != CL_SUCCESS)
		return n_status;
	// get size of the device list

	const size_t n_device_count = n_list_size_bytes / sizeof(cl_device_id); // it's in bytes
	if(n_device_count == 0)
		return cl_Device_Not_Found; // no devices

	try {
		r_device_list.resize(n_device_count);
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}
	// allocate the list

	n_status = (CLresult)clGetContextInfo(h_context, CL_CONTEXT_DEVICES,
		n_list_size_bytes, &r_device_list[0], 0);
	if(n_status != CL_SUCCESS)
		return n_status;
	// get list of devices

	return cl_Success;
}

/**
 *	@brief gets the list of devices of a given type, available on a platform
 *
 *	@param[in] h_platform is handle to the platform to be queried
 *	@param[out] r_device_list is filled with device ids (resized to the number of devices)
 *	@param[in] n_device_type is type of devices to be listed
 *
 *	@return Returns cl_Success on success, cl_Device_Not_Found if the platform
 *		reports zero devices, cl_Out_Of_Host_Memory if the list could not be
 *		allocated, or the error code returned by clGetDeviceIDs().
 */
CLresult CCLUtils::n_GetDeviceList(cl_platform_id h_platform, std::vector<cl_device_id> &r_device_list,
	cl_device_type n_device_type /*= CL_DEVICE_TYPE_GPU*/)
{
	cl_uint n_device_count;
	CLresult n_status = (CLresult)clGetDeviceIDs(h_platform, n_device_type, 0, 0, &n_device_count);
	if(n_status != CL_SUCCESS)
		return n_status;
	if(n_device_count == 0)
		return cl_Device_Not_Found; // no devices
	// get the number of devices

	try {
		r_device_list.resize(n_device_count);
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}
	// allocate the list

	n_status = (CLresult)clGetDeviceIDs(h_platform, n_device_type,
		n_device_count, &r_device_list[0], 0);
	if(n_status != CL_SUCCESS)
		return n_status;
	// get list of devices

	return cl_Success;
}

/**
 *	@brief selects the best device of a context, using the default scoring
 *
 *	This only forwards to n_Get_Best_DeviceId() with a TDevice_DefaultScoring object.
 *
 *	@param[out] p_device_id is pointer passed on to n_Get_Best_DeviceId()
 *	@param[in] h_context is handle to the context to choose the device from
 *
 *	@return Returns the value returned by n_Get_Best_DeviceId().
 */
size_t CCLUtils::n_Get_MaxGFlops_DeviceId(cl_device_id *p_device_id, cl_context h_context)
{
	return n_Get_Best_DeviceId(p_device_id, h_context, TDevice_DefaultScoring());
}

/**
 *	@brief selects the best device of a context, using the default scoring
 *
 *	This is a convenience overload for callers that do not need the device handle.
 *
 *	@param[in] h_context is handle to the context to choose the device from
 *
 *	@return Returns the value returned by the two-argument overload.
 */
size_t CCLUtils::n_Get_MaxGFlops_DeviceId(cl_context h_context)
{
	cl_device_id h_unused_device; // receives the handle, which is then discarded
	return n_Get_MaxGFlops_DeviceId(&h_unused_device, h_context);
}

/*size_t CCLUtils::n_Get_MaxGFlops_DeviceId(cl_device_id *p_device_id, cl_context h_context)
{
	std::vector<cl_device_id> device_list;
	if(n_GetDeviceList(h_context, device_list) != CL_SUCCESS)
		return -1;
	// get all the devices

	size_t n_best_device = 0;
	{
		double f_best_gflops = -1;
		for(size_t i = 0, n = device_list.size(); i < n; ++ i) {
			cl_uint n_speed, n_processors;
			if(clGetDeviceInfo(device_list[i], CL_DEVICE_MAX_CLOCK_FREQUENCY,
			   sizeof(n_speed), &n_speed, NULL) != CL_SUCCESS ||
			   clGetDeviceInfo(device_list[i], CL_DEVICE_MAX_COMPUTE_UNITS,
			   sizeof(n_processors), &n_processors, NULL) != CL_SUCCESS)
				return -1;
			double f_gflops = double(n_speed) * n_processors;
			if(!i || f_best_gflops < f_gflops) {
				f_best_gflops = f_gflops;
				n_best_device = i;
			}
		}
	}
	// go through devices, and estimate computing power (not real gflops)

	*p_device_id = device_list[n_best_device];
	// write handle to the best device

	return n_best_device;
}*/

/**
 *	@brief gets a platform info string
 *
 *	@param[out] r_s_str is the string to be filled with the info (its contents
 *		are unspecified if the function fails after the length query)
 *	@param[in] h_platform is handle to the platform to be queried
 *	@param[in] n_name is name of the string parameter to get (e.g. CL_PLATFORM_PROFILE)
 *
 *	@return Returns cl_Success on success, cl_Out_Of_Host_Memory if the string
 *		could not be allocated, or the error code returned by clGetPlatformInfo().
 */
CLresult CCLUtils::n_GetPlatformInfoString(std::string &r_s_str, cl_platform_id h_platform, int n_name)
{
	size_t n_info_size;
	CLresult n_status = (CLresult)clGetPlatformInfo(h_platform, n_name, 0, NULL, &n_info_size);
	if(n_status != CL_SUCCESS)
		return n_status;
	// get string length

	try {
		r_s_str.resize(n_info_size + 1);
		// allocate string, with space for an extra terminating null

		n_status = (CLresult)clGetPlatformInfo(h_platform, n_name, n_info_size, &r_s_str[0], NULL);
		if(n_status != CL_SUCCESS)
			return n_status;
		// get string

		r_s_str[n_info_size] = 0;
		r_s_str.resize(strlen(r_s_str.c_str()));
		// make sure it is terminated and trim it to the first null
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}

	return cl_Success;
}

/**
 *	@brief a pair of token name and its numeric value, for enum to string lookups
 *	@note This is kept an aggregate (no constructors) so that tables of it
 *		can be initialized using brace-enclosed lists.
 */
struct TNamedEnum { // g++ requires to have this named and at namespace level for specializing std::find
	const char *p_s_token_name; /**< @brief name of the token */
	int n_value; /**< @brief numeric value of the token */

	/**
	 *	@brief compares the value of this token to a result code
	 *	@param[in] n_ref_value is the result code to compare to
	 *	@return Returns true if the values are equal, otherwise returns false.
	 */
	bool operator ==(CLresult n_ref_value) const
	{
		return n_value == n_ref_value;
	}
};

/**
 *	@brief translates OpenCL result code to the name of the corresponding token
 *
 *	@param[in] n_error is the result code to be translated
 *
 *	@return Returns pointer to a statically allocated null-terminated string with
 *		the token name (e.g. "CL_INVALID_VALUE"), or null if the code is not in the table.
 *
 *	@note The lookup result is held in an automatic variable; the only static
 *		data is the constant table, so concurrent calls do not share mutable state.
 */
const char *CCLUtils::p_s_Error_Name(CLresult n_error)
{
	static const TNamedEnum p_error_name_table[] = {
		{"CL_SUCCESS", 0},
		{"CL_DEVICE_NOT_FOUND", -1},
		{"CL_DEVICE_NOT_AVAILABLE", -2},
		{"CL_COMPILER_NOT_AVAILABLE", -3},
		{"CL_MEM_OBJECT_ALLOCATION_FAILURE", -4},
		{"CL_OUT_OF_RESOURCES", -5},
		{"CL_OUT_OF_HOST_MEMORY", -6},
		{"CL_PROFILING_INFO_NOT_AVAILABLE", -7},
		{"CL_MEM_COPY_OVERLAP", -8},
		{"CL_IMAGE_FORMAT_MISMATCH", -9},
		{"CL_IMAGE_FORMAT_NOT_SUPPORTED", -10},
		{"CL_BUILD_PROGRAM_FAILURE", -11},
		{"CL_MAP_FAILURE", -12},
		{"CL_MISALIGNED_SUB_BUFFER_OFFSET", -13},
		{"CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST", -14},
		{"CL_COMPILE_PROGRAM_FAILURE", -15},
		{"CL_LINKER_NOT_AVAILABLE", -16},
		{"CL_LINK_PROGRAM_FAILURE", -17},
		{"CL_DEVICE_PARTITION_FAILED", -18},
		{"CL_KERNEL_ARG_INFO_NOT_AVAILABLE", -19},
		{"CL_INVALID_VALUE", -30},
		{"CL_INVALID_DEVICE_TYPE", -31},
		{"CL_INVALID_PLATFORM", -32},
		{"CL_INVALID_DEVICE", -33},
		{"CL_INVALID_CONTEXT", -34},
		{"CL_INVALID_QUEUE_PROPERTIES", -35},
		{"CL_INVALID_COMMAND_QUEUE", -36},
		{"CL_INVALID_HOST_PTR", -37},
		{"CL_INVALID_MEM_OBJECT", -38},
		{"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", -39},
		{"CL_INVALID_IMAGE_SIZE", -40},
		{"CL_INVALID_SAMPLER", -41},
		{"CL_INVALID_BINARY", -42},
		{"CL_INVALID_BUILD_OPTIONS", -43},
		{"CL_INVALID_PROGRAM", -44},
		{"CL_INVALID_PROGRAM_EXECUTABLE", -45},
		{"CL_INVALID_KERNEL_NAME", -46},
		{"CL_INVALID_KERNEL_DEFINITION", -47},
		{"CL_INVALID_KERNEL", -48},
		{"CL_INVALID_ARG_INDEX", -49},
		{"CL_INVALID_ARG_VALUE", -50},
		{"CL_INVALID_ARG_SIZE", -51},
		{"CL_INVALID_KERNEL_ARGS", -52},
		{"CL_INVALID_WORK_DIMENSION", -53},
		{"CL_INVALID_WORK_GROUP_SIZE", -54},
		{"CL_INVALID_WORK_ITEM_SIZE", -55},
		{"CL_INVALID_GLOBAL_OFFSET", -56},
		{"CL_INVALID_EVENT_WAIT_LIST", -57},
		{"CL_INVALID_EVENT", -58},
		{"CL_INVALID_OPERATION", -59},
		{"CL_INVALID_GL_OBJECT", -60},
		{"CL_INVALID_BUFFER_SIZE", -61},
		{"CL_INVALID_MIP_LEVEL", -62},
		{"CL_INVALID_GLOBAL_WORK_SIZE", -63},
		{"CL_INVALID_PROPERTY", -64},
		{"CL_INVALID_IMAGE_DESCRIPTOR", -65},
		{"CL_INVALID_COMPILER_OPTIONS", -66},
		{"CL_INVALID_LINKER_OPTIONS", -67},
		{"CL_INVALID_DEVICE_PARTITION_COUNT", -68},
		{"CL_INVALID_PIPE_SIZE", -69},
		{"CL_INVALID_DEVICE_QUEUE", -70},
		{"CL_PLATFORM_NOT_FOUND_KHR", -1001},
		{"CL_DEVICE_PARTITION_FAILED_EXT", -1057},
		{"CL_INVALID_PARTITION_COUNT_EXT", -1058},
		{"CL_INVALID_PARTITION_NAME_EXT", -1059}
	};
	const size_t n_error_token_num = sizeof(p_error_name_table) /
		sizeof(p_error_name_table[0]);
	const TNamedEnum *p_table_end = p_error_name_table + n_error_token_num;
	// table of the known result codes

	const TNamedEnum *p_error = std::find(p_error_name_table, p_table_end, n_error);
	// could use std::lower_bound here but probably better be safe than fast in these matters
	// if we get this far the speed does not matter much
	// (the iterator is a local rather than a function-level static, so that
	// concurrent calls do not overwrite each other's lookup result)

	if(p_error != p_table_end)
		return p_error->p_s_token_name;
	return 0; // no name found
}

/**
 *	@brief reports a failed OpenCL command by throwing an exception
 *
 *	@param[in] n_result is the result code of the command (expected to be an error)
 *	@param[in] p_s_command is the text of the command that failed
 *	@param[in] p_s_file is name of the source file where the command is
 *	@param[in] n_line is line in the source file where the command is
 *
 *	@note This function always throws ::opencl_error; the message contains
 *		the description of the failure, or a fixed text if the description
 *		could not be formatted.
 */
void CCLUtils::ErrorHandler(CLresult n_result, const char *p_s_command,
	const char *p_s_file, int n_line) // throw(std::runtime_error)
{
	_ASSERTE(n_result != CL_SUCCESS); // this is expected to be called with errors

	const char *p_s_error_token = p_s_Error_Name(n_result);
	// see if the code has a symbolic name

	std::string s_error;
	bool b_formatted;
	if(p_s_error_token) {
		b_formatted = stl_ut::Format(s_error, "OpenCL error: file \'%s\', line %d: \'%s\' failed with %s (%d)",
			p_s_file, n_line, p_s_command, p_s_error_token, n_result);
	} else {
		b_formatted = stl_ut::Format(s_error, "OpenCL error: file \'%s\', line %d: \'%s\'"
			" failed with %d", p_s_file, n_line, p_s_command, n_result);
	}
	// format the description, with the name of the error if available

	if(!b_formatted)
		throw ::opencl_error("OpenCL command failed and not enough memory for error description");
	throw ::opencl_error(s_error);
}

/**
 *	@brief reports a failed OpenCL command by printing a message to stderr
 *
 *	@param[in] n_result is the result code of the command (expected to be an error)
 *	@param[in] p_s_command is the text of the command that failed
 *	@param[in] p_s_file is name of the source file where the command is
 *	@param[in] n_line is line in the source file where the command is
 *
 *	@note This only prints the message (without a trailing newline) and returns;
 *		it does not throw and does not terminate the program.
 */
void CCLUtils::AssertHandler(CLresult n_result, const char *p_s_command, const char *p_s_file, int n_line)
{
	_ASSERTE(n_result != CL_SUCCESS); // this is expected to be called with errors

	const char *p_s_error_token = p_s_Error_Name(n_result);
	if(p_s_error_token) {
		fprintf(stderr, "file \'%s\', line %d: OpenCL assertion: \'%s\' failed with %s (%d)",
			p_s_file, n_line, p_s_command, p_s_error_token, n_result);
		// the code has a symbolic name, print both the name and the value
	} else {
		fprintf(stderr, "file \'%s\', line %d: OpenCL assertion: \'%s\' failed with %d",
			p_s_file, n_line, p_s_command, n_result);
		// unknown code, print only the numeric value
	}
}

/*
 *								=== ~CCLUtils ===
 */

/*
 *								=== CCLDeviceParams ===
 */

/**
 *	@brief gets an info string of the device these parameters belong to
 *
 *	@param[out] r_s_str is the string to be filled with the info
 *	@param[in] n_name is name of the string parameter to get (e.g. CL_DEVICE_NAME)
 *
 *	@return Returns the result of the static overload, called with m_h_device.
 */
CLresult CCLDeviceParams::n_GetDeviceInfoString(std::string &r_s_str, int n_name)
{
	const cl_device_id h_this_device = m_h_device;
	return n_GetDeviceInfoString(r_s_str, h_this_device, n_name);
}

/**
 *	@brief gets a device info string
 *
 *	@param[out] r_s_str is the string to be filled with the info (its contents
 *		are unspecified if the function fails after the length query)
 *	@param[in] h_device is handle to the device to be queried
 *	@param[in] n_name is name of the string parameter to get (e.g. CL_DEVICE_NAME)
 *
 *	@return Returns cl_Success on success, cl_Out_Of_Host_Memory if the string
 *		could not be allocated, or the error code returned by clGetDeviceInfo().
 */
CLresult CCLDeviceParams::n_GetDeviceInfoString(std::string &r_s_str, cl_device_id h_device, int n_name)
{
	size_t n_info_size;
	CLresult n_status = (CLresult)clGetDeviceInfo(h_device, n_name, 0, NULL, &n_info_size);
	if(n_status != CL_SUCCESS)
		return n_status;
	// get string length

	try {
		r_s_str.resize(n_info_size + 1);
		// allocate string, with space for an extra terminating null

		n_status = (CLresult)clGetDeviceInfo(h_device, n_name, n_info_size, &r_s_str[0], NULL);
		if(n_status != CL_SUCCESS)
			return n_status;
		// get string

		r_s_str[n_info_size] = 0;
		r_s_str.resize(strlen(r_s_str.c_str()));
		// make sure it is terminated and trim it to the first null
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}

	return cl_Success;
}

/**
 *	@brief gets a device info string, with the forbidden characters replaced or removed
 *
 *	The forbidden characters are the ones listed in the function body (whitespace,
 *	path and shell special characters and most of the control characters).
 *
 *	@param[out] r_s_str is the string to be filled with the info
 *	@param[in] h_device is handle to the device to be queried
 *	@param[in] n_name is name of the string parameter to get (e.g. CL_DEVICE_NAME)
 *	@param[in] b_allow_spaces is space flag (if set, the space character is not
 *		forbidden; tabs and newlines are forbidden regardless)
 *	@param[in] n_replacement_char is the character to replace the forbidden
 *		characters with; if null, the forbidden characters are erased instead
 *
 *	@return Returns the result of n_GetDeviceInfoString(); note that the string
 *		is processed even if the query failed.
 */
CLresult CCLDeviceParams::n_GetDeviceInfoString_Safe(std::string &r_s_str, cl_device_id h_device,
	int n_name, bool b_allow_spaces /*= true*/, char n_replacement_char /*= '_'*/)
{
	const CLresult n_query_status = n_GetDeviceInfoString(r_s_str, h_device, n_name);
	// get the raw string

	const char *p_s_all_forbidden = " \t\b\r\n/\\:*?\"\'<>|`%"
		"\x01\x02\x03\x04\x05\x06\x07"/*"\x08\x09\x0a"*/"\x0b\x0c"/*"\x0d"*/"\x0e\x0f" // \b\t\n, \r already above
		"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
	const char *p_s_forbidden = p_s_all_forbidden + ((b_allow_spaces)? 1 : 0);
	// forbidden characters; the space is the first one so it can
	// be skipped (only spaces, not tabs or newlines)

	//_ASSERTE(!strchr(p_s_forbidden, n_replacement_char)); // lets be flexible though
	// this is usually a bad idea

	size_t n_scan_pos = 0;
	for(;;) {
		n_scan_pos = r_s_str.find_first_of(p_s_forbidden, n_scan_pos);
		if(n_scan_pos == std::string::npos)
			break;
		// find the next forbidden character

		if(!n_replacement_char) {
			r_s_str.erase(n_scan_pos, 1);
			if(n_scan_pos == r_s_str.length())
				break;
			// erased; the next character has moved to the same position
		} else {
			r_s_str[n_scan_pos] = n_replacement_char;
			++ n_scan_pos; // search from the next character the next time around
		}
	}
	// replace forbidden characters by a replacement or delete them from the name

	return n_query_status;
}

bool CCLDeviceParams::b_IsExtensionSupported(const char *p_s_extension_name) const
{
	return b_IsExtensionSupported(m_h_device, p_s_extension_name);
}

/**
 *	@brief determines whether a device supports an extension
 *
 *	The name must match a whole whitespace-delimited token of the device
 *	extension string (CL_DEVICE_EXTENSIONS); a name that is only a substring
 *	of another extension name is not considered a match.
 *
 *	@param[in] h_device is handle to the device to be queried
 *	@param[in] p_s_extension_name is null-terminated string with the name of the extension
 *
 *	@return Returns true if the extension is listed by the device, otherwise returns
 *		false (also if the extension string could not be read or if the name is empty).
 */
bool CCLDeviceParams::b_IsExtensionSupported(cl_device_id h_device, const char *p_s_extension_name)
{
	std::string s_exts;
	if(n_GetDeviceInfoString(s_exts, h_device, CL_DEVICE_EXTENSIONS) != CL_SUCCESS)
		return false;
	// get device extension string

	const size_t n_ext_name_length = strlen(p_s_extension_name);
	if(!n_ext_name_length)
		return false;
	// an empty name is not an extension (it would "match" at every position)

	for(size_t n_pos = 0; (n_pos = s_exts.find(p_s_extension_name, n_pos)) != std::string::npos; ++ n_pos) {
		const size_t n_after_pos = n_pos + n_ext_name_length;
		// position just past the match (never greater than the string length)

		const bool b_token_begins = !n_pos ||
			isspace((unsigned char)s_exts[n_pos - 1]) != 0;
		const bool b_token_ends = n_after_pos >= s_exts.length() ||
			isspace((unsigned char)s_exts[n_after_pos]) != 0;
		// the characters are cast to unsigned char, isspace() is not defined for negative values

		if(b_token_begins && b_token_ends)
			return true;
		// there must be space before and after it

		// otherwise this was only a part of a longer name; continue searching from
		// the next character (without the increment, find() would return the same
		// position over and over)
	}
	// attempt to find the extension

	return false;
	// extension not found
}

/**
 *	@brief gets the device name, with the forbidden characters replaced or removed
 *
 *	The forbidden characters are the ones listed in the function body (whitespace,
 *	path and shell special characters and most of the control characters).
 *
 *	@param[out] r_s_name is the string to be filled with the processed copy of the name
 *	@param[in] b_allow_spaces is space flag (if set, the space character is not
 *		forbidden; tabs and newlines are forbidden regardless)
 *	@param[in] n_replacement_char is the character to replace the forbidden
 *		characters with; if null, the forbidden characters are erased instead
 *
 *	@return Returns true on success, false if copying the name failed.
 */
bool CCLDeviceParams::Get_SafeName(std::string &r_s_name,
	bool b_allow_spaces /*= true*/, char n_replacement_char /*= '_'*/) const
{
	if(!stl_ut::Assign(r_s_name, m_s_name))
		return false;
	// start with a copy of the stored name

	const char *p_s_all_forbidden = " \t\b\r\n/\\:*?\"\'<>|`%"
		"\x01\x02\x03\x04\x05\x06\x07"/*"\x08\x09\x0a"*/"\x0b\x0c"/*"\x0d"*/"\x0e\x0f" // \b\t\n, \r already above
		"\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f";
	const char *p_s_forbidden = p_s_all_forbidden + ((b_allow_spaces)? 1 : 0);
	// forbidden characters; the space is the first one so it can
	// be skipped (only spaces, not tabs or newlines)

	//_ASSERTE(!strchr(p_s_forbidden, n_replacement_char)); // lets be flexible though
	// this is usually a bad idea

	size_t n_scan_pos = 0;
	for(;;) {
		n_scan_pos = r_s_name.find_first_of(p_s_forbidden, n_scan_pos);
		if(n_scan_pos == std::string::npos)
			break;
		// find the next forbidden character

		if(!n_replacement_char) {
			r_s_name.erase(n_scan_pos, 1);
			if(n_scan_pos == r_s_name.length())
				break;
			// erased; the next character has moved to the same position
		} else {
			r_s_name[n_scan_pos] = n_replacement_char;
			++ n_scan_pos; // search from the next character the next time around
		}
	}
	// replace forbidden characters by a replacement or delete them from the name

	return true;
}

/**
 *	@brief prints a single-line summary of the device parameters
 *
 *	The line contains the device name, memory size, clock frequency, number
 *	of multiprocessors, maximum threads per block and maximum block dimensions;
 *	for devices reported as NVIDIA, the compute capability is printed as well.
 *
 *	@param[in] p_fw is the output stream to print to
 */
void CCLDeviceParams::Dump(FILE *p_fw)
{
	if(!b_Is_NVIDIA()) {
		fprintf(p_fw, "device(\'%s\', " PRIsizeB
			"B RAM, %.2f MHz, multiproc: " PRIsize ", max-threads-block: %d, max-block-size: %dx%dx%d)\n",
			p_s_Name(),
			PRIsizeBparams(n_Memory_Size()),
			t_Properties().clockRate / 1e3f, // divide by 1000 for printing as MHz
			n_Multiprocessor_Num(),
			t_Properties().maxThreadsPerBlock,
			t_Properties().maxThreadsDim[0],
			t_Properties().maxThreadsDim[1],
			t_Properties().maxThreadsDim[2]);
		// generic device, no compute capability
	} else {
		fprintf(p_fw, "device(\'%s\', " PRIsizeB
			"B RAM, %.2f MHz, SM: " PRIsize ", max-threads-block: %d, max-block-size: %dx%dx%d, cap %d.%d)\n",
			p_s_Name(),
			PRIsizeBparams(n_Memory_Size()),
			t_Properties().clockRate / 1e3f, // divide by 1000 for printing as MHz
			n_Multiprocessor_Num(),
			t_Properties().maxThreadsPerBlock,
			t_Properties().maxThreadsDim[0],
			t_Properties().maxThreadsDim[1],
			t_Properties().maxThreadsDim[2],
			m_p_device_caps[0], m_p_device_caps[1]);
		// NVIDIA device, print also the compute capability
	}
	// show some device parameters
}

/**
 *	@brief constructor; queries parameters of the given device
 *
 *	@param[in] h_device is handle to the device to be queried
 *
 *	@note If the query fails, the stored device handle is set to null to mark the error.
 */
CCLDeviceParams::CCLDeviceParams(cl_device_id h_device)
	:m_h_device(h_device), m_n_memory_size(0)
{
	memset(m_p_device_caps, 0, 2 * sizeof(int));
	memset(&m_t_devprop, 0, sizeof(CLdevprop));
	// clear the capabilities and the properties

	const bool b_query_succeeded = QueryDeviceParams();
	// get device params

	if(!b_query_succeeded)
		m_h_device = 0; // mark error
}

/**
 *	@brief constructor; queries parameters of a device, selected by its index in a context
 *
 *	@param[in] h_context is handle to the context the device belongs to
 *	@param[in] n_device_index is zero-based index of the device in the list
 *		of devices of the context
 *
 *	@note If the device list can not be obtained, if the index is out of range or if
 *		the query fails, the stored device handle is left / set null to mark the error.
 */
CCLDeviceParams::CCLDeviceParams(cl_context h_context, size_t n_device_index)
	:m_h_device(0), // mark error
	m_n_memory_size(0)
{
	memset(m_p_device_caps, 0, 2 * sizeof(int));
	memset(&m_t_devprop, 0, sizeof(CLdevprop));

	{
		std::vector<cl_device_id> device_list;
		if(CCLUtils::n_GetDeviceList(h_context, device_list) != CL_SUCCESS)
			return;
		// get all the devices

		if(n_device_index >= device_list.size())
			return;
		// check index (it is unsigned, only the upper bound needs to be checked)

		m_h_device = device_list[n_device_index];
	}
	// query all devices available to the context, get device handle

	if(!QueryDeviceParams()) {
		m_h_device = 0; // mark error
		return;
	}
	// get device params
}

bool CCLDeviceParams::QueryDeviceParams()
{
	if(n_GetDeviceInfoString(m_s_name, CL_DEVICE_NAME) != CL_SUCCESS)
		return false;
	// get device name

	bool b_nv_device_attribute_query = b_IsExtensionSupported("cl_nv_device_attribute_query");
	// determines wheter we have cl_nv_device_attribute_query

	{
		/*std::string s_version;
		if(n_GetDeviceInfoString(s_version, CL_DEVICE_VERSION) != CL_SUCCESS)
			return;
		m_p_device_caps[0] = atol(s_version.c_str());
		m_p_device_caps[1] = (strchr(s_version.c_str(), '.'))? atol(strchr(s_version.c_str(), '.') + 1) : 0;*/ // todo

		if(b_nv_device_attribute_query) {
			cl_uint n_major, n_minor;
			if(clGetDeviceInfo(m_h_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &n_major, NULL) != CL_SUCCESS ||
			   clGetDeviceInfo(m_h_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &n_minor, NULL) != CL_SUCCESS)
				return false;
			m_p_device_caps[0] = n_major;
			m_p_device_caps[1] = n_minor;
		} else {
			m_p_device_caps[0] = 1;
			m_p_device_caps[1] = 0;
		}
	}
	// get device OpenCL revision

	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			if(clGetDeviceInfo(m_h_device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
				return false;
			m_b_kernel_exec_timeout = (n_value != 0);
		} else
			m_b_kernel_exec_timeout = false;
	}
	{
		cl_uint n_value;
		if(clGetDeviceInfo(m_h_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
			return false;
		m_n_multiprocessor_num = n_value;
	}
	// get some interesting device attributes

	{
		cl_ulong n_value; // this is actually a 64bit integer
		if(clGetDeviceInfo(m_h_device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
			return false;
		_ASSERTE(n_value <= UINT64_MAX); // this should be enough for a while
		m_n_memory_size = uint64_t(n_value);
	}
	// get device memory size

	return n_Query_DeviceProperties(m_t_devprop, m_h_device) == CL_SUCCESS;
}

/**
 *	@brief fills the device properties structure with values queried from a device
 *
 *	The fields that are only available through the cl_nv_device_attribute_query
 *	extension (registers per block, warp size) are set to -1 on devices without
 *	the extension. For those devices, the SIMD width is estimated by compiling
 *	a dummy kernel and querying its preferred work-group size multiple; if any
 *	step of that fails, the SIMD width stays -1 and no error is reported.
 *
 *	@param[out] r_t_devprop is the structure to be filled (cleared at the beginning)
 *	@param[in] h_device is handle to the device to be queried
 *
 *	@return Returns cl_Success on success, or the error code of the first
 *		mandatory query that failed (the structure is partially filled in that case).
 */
CLresult CCLDeviceParams::n_Query_DeviceProperties(CLdevprop &r_t_devprop, cl_device_id h_device)
{
	memset(&r_t_devprop, 0, sizeof(CLdevprop));
	// clear output

	bool b_nv_device_attribute_query = b_IsExtensionSupported(h_device, "cl_nv_device_attribute_query");
	// determines whether we have cl_nv_device_attribute_query

	{
		cl_uint n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.clockRate = n_value * 1000; // the queried value is scaled by 1000 for storage
	}
	// clock rate

	{
		cl_uint n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_ADDRESS_BITS, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		n_value /= 2;
		int n_size = int(min(uint64_t(INT_MAX), (uint64_t(1) << (n_value - 1)) | ((uint64_t(1) << (n_value - 1)) - 1)));
		// the maximum grid size is 2^(address bits / 2) - 1, clamped to INT_MAX; the mask
		// is built from two halves so that the shift count never reaches the width of the type
		r_t_devprop.maxGridSize[0] = n_size;
		r_t_devprop.maxGridSize[1] = n_size;

		if(b_nv_device_attribute_query) {
			cl_uint n_major, n_minor;
			CLresult n_result;
			if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &n_major, NULL)) != CL_SUCCESS ||
			   (n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &n_minor, NULL)) != CL_SUCCESS)
				return n_result;
			if(n_major < 2) {
				_ASSERTE(n_size == 65535); // CUDA_C_Programming_Guide.pdf, table 12 (page 185) ...
				r_t_devprop.maxGridSize[2] = n_size;
			} else
				r_t_devprop.maxGridSize[2] = 65535; // CUDA_C_Programming_Guide.pdf, table 12 (page 185). note that this *will* change with future models
		} else {
			r_t_devprop.maxGridSize[2] = n_size;
			// assume it is the same across all the dimensions (according to OpenCL spec, it should be;
			// at http://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueNDRangeKernel.html
			// it says: global_work_size values must be in the range 1 .. 2^32 - 1)
		}
		// handle the z-dimension
	}
	// grid size

	{
		size_t p_value[3]; // 3 is safe value, no device shall have CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS less than 3
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(p_value), p_value, NULL)) != CL_SUCCESS)
			return n_result;
		_ASSERTE(p_value[0] <= INT_MAX && p_value[1] <= INT_MAX && p_value[2] <= INT_MAX);
		r_t_devprop.maxThreadsDim[0] = int(p_value[0]);
		r_t_devprop.maxThreadsDim[1] = int(p_value[1]);
		r_t_devprop.maxThreadsDim[2] = int(p_value[2]); // block size
	}
	// maximum work-item sizes

	{
		size_t n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		_ASSERTE(n_value <= INT_MAX);
		r_t_devprop.maxThreadsPerBlock = int(n_value);
	}
	// maximum work-group size

	{
		cl_ulong n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		if(n_value <= INT_MAX)
			r_t_devprop.memPitch = int(n_value);
		else
			r_t_devprop.memPitch = -1; // too large (AMD)
		_ASSERTE(n_value <= INT64_MAX);
		r_t_devprop.memPitch64 = n_value;
	}
	// maximum allocation size (the 32-bit field is -1 if the value does not fit)

	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			CLresult n_result;
			if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
				return n_result;
			r_t_devprop.regsPerBlock = n_value;
		} else
			r_t_devprop.regsPerBlock = -1; // unknown
	}
	// registers per block

	{
		cl_ulong n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		_ASSERTE(n_value <= INT_MAX);
		r_t_devprop.sharedMemPerBlock = int(n_value);
	}
	// local memory size

	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			CLresult n_result;
			if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_WARP_SIZE_NV, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
				return n_result;
			r_t_devprop.SIMDWidth = n_value;
		} else {
			r_t_devprop.SIMDWidth = -1;
			// unknown

			do {
				cl_int n_result;
				cl_context h_context = clCreateContext(0, 1, &h_device, 0, 0, &n_result);
				if(n_result != CL_SUCCESS)
					break;
				// create a temporary context

				cl_command_queue h_cmd_queue = clCreateCommandQueue(h_context,
					h_device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &n_result);
				if(n_result != CL_SUCCESS) {
					clReleaseContext(h_context);
					break;
				}
				// create command queue

				cl_program h_program;
				if((n_result = CCLProgramCompiler::n_CompileProgram(h_context, &h_program,
				   "__kernel void k_dummy(__global int *x) { x[get_global_id(0)] = 0; }",
				   1, &h_device)) != CL_SUCCESS) {
					clReleaseCommandQueue(h_cmd_queue);
					clReleaseContext(h_context);
					break;
				}
				// compile the program

				cl_kernel h_dummy_kernel = clCreateKernel(h_program, "k_dummy", &n_result);
				if(n_result != CL_SUCCESS) {
					clReleaseCommandQueue(h_cmd_queue);
					clReleaseContext(h_context);
					clReleaseProgram(h_program);
					break;
				}
				// get the kernel

				size_t n_wavefront_size = 0, n_size;
				n_result = clGetKernelWorkGroupInfo(h_dummy_kernel, h_device,
					CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
					sizeof(size_t), &n_wavefront_size, &n_size);
				// get work-group size multiple (the result of this very call needs
				// to be checked, not the stale result of clCreateKernel())

				if(n_result == CL_SUCCESS)
					r_t_devprop.SIMDWidth = int(n_wavefront_size);
				// only use the value if the query succeeded, otherwise leave it unknown

				clReleaseKernel(h_dummy_kernel);
				clReleaseProgram(h_program);
				clReleaseCommandQueue(h_cmd_queue);
				clReleaseContext(h_context);
				// cleanup
			} while(false);
			// get SIMD width on non-NVIDIA platforms
		}
	}
	// SIMD width

	{
		r_t_devprop.textureAlign = -1; // unknown; todo
	}
	// texture alignment

	{
		cl_ulong n_value;
		CLresult n_result;
		if((n_result = (CLresult)clGetDeviceInfo(h_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		_ASSERTE(n_value <= INT_MAX);
		r_t_devprop.totalConstantMemory = int(n_value);
	}
	// constant memory size

	// get device properties

	return cl_Success;
}

/*
 *								=== ~CCLDeviceParams ===
 */

/*
 *								=== CCLProgramStorage::TProgramBinary::TBinary ===
 */

/**
 *	@brief default constructor; initializes an empty binary (zero size, null data pointer)
 */
inline CCLProgramStorage::TProgramBinary::TBinary::TBinary()
	:n_size(0),
	p_data(0)
{
	// nothing else to do, the buffer is allocated later
}

/*
 *								=== ~CCLProgramStorage::TProgramBinary::TBinary ===
 */

/*
 *								=== CCLProgramStorage::TProgramBinary ===
 */

/**
 *	@brief swaps contents of two program binaries
 *	@param[in,out] r_t_other is the other program binary to swap with
 */
inline void CCLProgramStorage::TProgramBinary::Swap(TProgramBinary &r_t_other)
{
	std::swap(t_data_buffer, r_t_other.t_data_buffer);
	// swap the data buffers

	binary_size_list.swap(r_t_other.binary_size_list);
	// swap the lists of binary sizes
}

/*
 *								=== ~CCLProgramStorage::TProgramBinary ===
 */

/*
 *								=== CCLProgramStorage::_TSHA1 ===
 */

/**
 *	@brief equality comparison of two hashes
 *	@param[in] r_t_other is the hash to compare to
 *	@return Returns true if the first five 32-bit words of the hash data
 *		are bytewise equal, otherwise returns false.
 */
inline bool CCLProgramStorage::_TSHA1::operator ==(const _TSHA1 &r_t_other) const
{
	const size_t n_hash_size_bytes = 5 * sizeof(uint32_t);
	return memcmp(p_data, r_t_other.p_data, n_hash_size_bytes) == 0;
}

/**
 *	@brief less-than comparison of two hashes
 *	@param[in] r_t_other is the hash to compare to
 *	@return Returns true if the first five 32-bit words of this hash data compare
 *		less than those of the other one in memcmp() (bytewise) order, otherwise returns false.
 */
inline bool CCLProgramStorage::_TSHA1::operator <(const _TSHA1 &r_t_other) const
{
	const size_t n_hash_size_bytes = 5 * sizeof(uint32_t);
	const int n_comparison = memcmp(p_data, r_t_other.p_data, n_hash_size_bytes);
	return n_comparison < 0;
}

/**
 *	@brief inequality comparison of two hashes
 *	@param[in] r_t_other is the hash to compare to
 *	@return Returns true if the hashes are not equal (the negation
 *		of operator ==()), otherwise returns false.
 */
inline bool CCLProgramStorage::_TSHA1::operator !=(const _TSHA1 &r_t_other) const
{
	if(*this == r_t_other)
		return false;
	return true;
}

/*
 *								=== ~CCLProgramStorage::_TSHA1 ===
 */

/*
 *								=== CCLProgramStorage::TProgramBinary ===
 */

/**
 *	@brief downloads the binaries of a program, for all of its devices
 *
 *	The binaries are stored back to back in a single buffer (t_data_buffer),
 *	their sizes are stored in binary_size_list (one per device).
 *
 *	@param[in] h_program is handle to the program to download the binaries of
 *	@param[in] n_device_num is the expected number of devices of the program
 *		(must be nonzero and must match what the program reports)
 *
 *	@return Returns true on success, false on failure (device number mismatch,
 *		not enough memory or a failed OpenCL query).
 *
 *	@note This must be called on an empty object. On failure, binary_size_list
 *		and t_data_buffer may have been modified already.
 */
bool CCLProgramStorage::TProgramBinary::Download(cl_program h_program, size_t n_device_num)
{
	_ASSERTE(!t_data_buffer.p_data && binary_size_list.empty()); // make sure this is empty

	cl_uint n_program_device_num;
	if(clGetProgramInfo(h_program, CL_PROGRAM_NUM_DEVICES, sizeof(n_program_device_num),
	   &n_program_device_num, NULL) != CL_SUCCESS || n_program_device_num != n_device_num)
		return false;
	// get number of devices, make sure it matches description (the size passed
	// must be the size of the cl_uint that is written, not of size_t)

	if(!n_device_num || n_device_num > UINT32_MAX)
		return false;
	// we can't query binaries without devices

	std::vector<const unsigned char*> binary_ptr_list;
	try {
		binary_ptr_list.resize(n_device_num);
#ifdef _DEBUG
		_ASSERTE(n_device_num < SIZE_MAX);
		binary_size_list.resize(n_device_num + 1);
		if(n_device_num)
			binary_size_list[n_device_num - 1] = 0xbaadf00d; // paranoid check
		binary_size_list[n_device_num] = 0xbaadf00d; // "magic" word
		// in debug, put a guard value past the end and a marker in the last
		// element, to see that exactly n_device_num sizes get written
#else // _DEBUG
		binary_size_list.resize(n_device_num);
#endif // _DEBUG
	} catch(std::bad_alloc&) {
		return false;
	}
	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARY_SIZES,
	   n_device_num * sizeof(size_t), &binary_size_list.front(), NULL) != CL_SUCCESS)
		return false;
#ifdef _DEBUG
	_ASSERTE(binary_size_list[n_device_num] == 0xbaadf00d); // make sure it's returning the right amount of data
	_ASSERTE(!n_device_num || binary_size_list[n_device_num - 1] != 0xbaadf00d); // paranoid check
	binary_size_list.erase(binary_size_list.end() - 1); // remove it from the list, length must be equal to n_device_num
#endif // _DEBUG
	// get binary sizes

	uint64_t n_size_total = 0;
	for(size_t i = 0; i < n_device_num; ++ i) {
		if(n_size_total > UINT64_MAX - binary_size_list[i])
			return false; // the sum would overflow
		n_size_total += binary_size_list[i];
	}
	// sum binary sizes up

	t_data_buffer.n_size = n_size_total;
	if(n_size_total > SIZE_MAX || !(t_data_buffer.p_data = new(std::nothrow) uint8_t[size_t(n_size_total)]))
		return false;
	// alloc data buffer

	{
		const uint8_t *p_data_ptr = t_data_buffer.p_data;
		for(size_t i = 0; i < n_device_num; ++ i) {
			binary_ptr_list[i] = (const unsigned char*)p_data_ptr;
			p_data_ptr += binary_size_list[i];
		}
		_ASSERTE(p_data_ptr == t_data_buffer.p_data + t_data_buffer.n_size);
	}
	// get pointers to individual binaries (stored back to back in the buffer)

	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARIES,
	   binary_ptr_list.size() * sizeof(uint8_t*)/*size_t(n_size_total)*/, &binary_ptr_list.front(), NULL) != CL_SUCCESS)
		return false;
	// download binaries (the parameter is the array of pointers, its size is passed rather than the size of the data)

	return true;
}

/**
 *	@brief creates and builds an OpenCL program from the stored binaries
 *
 *	@param[in] h_context is the context to create the program in
 *	@param[out] p_program is pointer to the program handle (must not be null;
 *		written only on success)
 *	@param[out] r_n_result is the OpenCL result code (written if program creation
 *		or build fails, and on success)
 *	@param[in] n_device_num is the number of devices (must match the number of stored binaries)
 *	@param[in] p_device_list is the list of n_device_num devices
 *
 *	@return Returns build_Success on success, build_InvalidParams if the number
 *		of devices does not match, build_LowMemory if there is not enough memory,
 *		build_CreateProgramFailure or build_BuildProgramFailure on OpenCL errors.
 */
int CCLProgramStorage::TProgramBinary::n_Upload(cl_context h_context, cl_program *p_program,
	CLresult &r_n_result, size_t n_device_num, const cl_device_id *p_device_list) const
{
	_ASSERTE(p_program); // output pointer is mandatory
	_ASSERTE(!binary_size_list.empty()); // there must be something to upload

	if(n_device_num != binary_size_list.size())
		return build_InvalidParams;
	// one binary per device is needed

	std::vector<const unsigned char*> binary_table;
	try {
		binary_table.resize(n_device_num);
	} catch(std::bad_alloc&) {
		return build_LowMemory;
	}
	// allocate the table of pointers to the binaries

	size_t n_offset = 0;
	for(size_t n_dev = 0; n_dev < n_device_num; ++ n_dev) {
		binary_table[n_dev] = (const unsigned char*)(t_data_buffer.p_data + n_offset);
		n_offset += binary_size_list[n_dev];
	}
	_ASSERTE(n_offset == t_data_buffer.n_size);
	// the binaries are stored back to back, calculate where each of them starts

	_ASSERTE(n_device_num <= /*CL*/UINT_MAX);
	cl_int n_cl_status;
	cl_program h_new_program = clCreateProgramWithBinary(h_context, cl_uint(n_device_num),
		p_device_list, &binary_size_list.front(), &binary_table.front(), NULL, &n_cl_status);
	if(n_cl_status != CL_SUCCESS) {
		r_n_result = (CLresult)n_cl_status;
		return build_CreateProgramFailure;
	}
	// create the program from the binaries

	n_cl_status = clBuildProgram(h_new_program, 0, NULL, NULL, NULL, NULL); // note no build options here
	if(n_cl_status != CL_SUCCESS) {
		r_n_result = (CLresult)n_cl_status;
		clReleaseProgram(h_new_program);
		return build_BuildProgramFailure;
	}
	// build it, release the program handle on failure

#ifdef CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
	/*if(n_device_num) { // compiling from binary doesn't show output :'(
		std::string s_build_log, s_device_name;
		for(size_t i = 0; i < n_device_num; ++ i) {
			cl_build_status n_build_status;

			if(CCLProgramCompiler::Get_BuildLog(s_build_log, n_build_status, h_prog, p_device_list[i]) != CL_SUCCESS)
				return n_result;
			// get buld log and build status

			if(CCLDeviceParams::n_GetDeviceInfoString(s_device_name, p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS)
				s_device_name.erase();
			const char *p_s_device_name = (s_device_name.empty())? "(null)" : s_device_name.c_str();
			// get device name

			if(n_build_status == CL_BUILD_NONE) {
				fprintf(stderr, "warning: program wasn't built for device %d (%s)\n", i, p_s_device_name);
				continue;
			} else if(n_build_status == CL_BUILD_ERROR) {
				fprintf(stderr, "warning: there were errors while building program for device %d (%s)\n",
					i, p_s_device_name);
			} else if(n_build_status != CL_BUILD_SUCCESS)
				fprintf(stderr, "warning: unknown program build status for device %d (%s)\n", i, p_s_device_name);
			// show build status

			if(n_build_status != CL_BUILD_SUCCESS || !s_build_log.empty()) {
				fprintf(stderr, "=== OpenCL build log for device %d (%s) ===\n%s\n",
					i, p_s_device_name, s_build_log.c_str());
			}
			// show build log
		}
	}*/
	// show warnings (disabled)
#endif // CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS

	*p_program = h_new_program;
	r_n_result = cl_Success; // !!
	// hand the results over to the caller

	return build_Success;
}

/**
 *	@brief releases the data buffer and clears the list of binary sizes
 *	@note This does not modify t_data_buffer.n_size.
 */
void CCLProgramStorage::TProgramBinary::Free()
{
	binary_size_list.clear();
	// forget the sizes

	delete[] t_data_buffer.p_data; // deleting a null pointer is a no-op
	t_data_buffer.p_data = 0;
	// release the buffer, mark it as not allocated
}

/*
 *								=== ~CCLProgramStorage::TProgramBinary ===
 */

/*
 *								=== CCLProgramStorage::TProgramInstance ===
 */

/**
 *	@brief stores the build options hash and downloads the program binaries
 *
 *	@param[in] h_program is the program to get the binaries from
 *	@param[in] n_device_num is the number of program devices
 *	@param[in] t_build_options_hash is hash of the build options the program was built with
 *
 *	@return Returns the result of TProgramBinary::Download().
 *
 *	@note The hash is stored even if the download fails.
 */
bool CCLProgramStorage::TProgramInstance::Download(cl_program h_program,
	size_t n_device_num, _TSHA1 t_build_options_hash)
{
	t_build_opts_hash = t_build_options_hash;
	// remember what the program was built with

	const bool b_downloaded = TProgramBinary::Download(h_program, n_device_num);
	return b_downloaded;
}

/**
 *	@brief reads a program instance from a file
 *
 *	Reads the build options hash, followed by n_device_num 64-bit binary sizes,
 *	followed by the binaries themselves (stored back to back).
 *
 *	@param[in] p_fr is the file to read from (opened for binary reading)
 *	@param[in] n_device_num is the number of binaries to read
 *
 *	@return Returns true on success, false on failure (i/o error, not enough
 *		memory, or sizes not representable on this platform).
 *
 *	@note This object must be empty when calling this function.
 *	@note This function does not release t_data_buffer.p_data if it fails
 *		after the buffer was allocated; Free() needs to be called in such case.
 */
bool CCLProgramStorage::TProgramInstance::Read(FILE *p_fr, size_t n_device_num)
{
	_ASSERTE(!t_data_buffer.p_data && binary_size_list.empty()); // make sure this is empty

	if(fread(&t_build_opts_hash, sizeof(_TSHA1), 1, p_fr) != 1)
		return false;
	// read build options hash

	std::vector<uint64_t> binary_size_list64;
	try {
		binary_size_list64.resize(n_device_num);
		binary_size_list.resize(n_device_num);
	} catch(const std::exception&) { // can be also length_error, not only bad_alloc
		return false;
	}
	// alloc arrays

	if(n_device_num && fread(&binary_size_list64.front(),
	   sizeof(uint64_t), n_device_num, p_fr) != n_device_num)
		return false;
	// read binary lengths (front() must not be called on an empty vector)

	uint64_t n_size_total = 0;
	for(size_t i = 0; i < n_device_num; ++ i) {
		if(binary_size_list64[i] > SIZE_MAX || n_size_total > UINT64_MAX - binary_size_list64[i])
			return false;
		binary_size_list[i] = size_t(binary_size_list64[i]);
		n_size_total += binary_size_list64[i];
	}
	// convert the lengths to size_t and sum them up, fail on overflow

	t_data_buffer.n_size = n_size_total;
	if(n_size_total > SIZE_MAX || !(t_data_buffer.p_data = new(std::nothrow) uint8_t[size_t(n_size_total)]))
		return false;
	{
		uint64_t n_size_remaining = n_size_total;
		uint8_t *p_data_ptr = t_data_buffer.p_data;
		while(n_size_remaining > 0) {
			const size_t n_read = size_t(min(uint64_t(INT_MAX), n_size_remaining)); // read in chunks of at most INT_MAX bytes
			if(fread(p_data_ptr, 1, n_read, p_fr) != n_read)
				return false;
			n_size_remaining -= n_read;
			p_data_ptr += n_read;
		}
		_ASSERTE(p_data_ptr == t_data_buffer.p_data + n_size_total);
	}
	// alloc and read data buffer

	return true;
}

/**
 *	@brief writes this program instance to a file
 *
 *	Writes the build options hash, followed by the sizes of the binaries
 *	(as 64-bit integers), followed by the contents of the data buffer.
 *
 *	@param[in] p_fw is the file to write to (opened for binary writing)
 *	@param[in] n_device_num is the number of devices (must match the number of binaries)
 *
 *	@return Returns true on success, false on failure.
 */
bool CCLProgramStorage::TProgramInstance::Write(FILE *p_fw, size_t n_device_num)
{
	if(n_device_num != binary_size_list.size())
		return false;
	// the number of binaries must be as expected

	if(fwrite(&t_build_opts_hash, sizeof(_TSHA1), 1, p_fw) != 1)
		return false;
	// the hash of the build options goes first

	for(size_t n_dev = 0; n_dev < n_device_num; ++ n_dev) {
		_ASSERTE(SIZE_MAX <= UINT64_MAX);
		const uint64_t n_binary_size64 = binary_size_list[n_dev];
		if(fwrite(&n_binary_size64, sizeof(uint64_t), 1, p_fw) != 1)
			return false;
	}
	// the sizes are stored as 64-bit numbers, regardless of the size of size_t

	uint8_t *p_cursor = t_data_buffer.p_data;
	for(uint64_t n_bytes_left = t_data_buffer.n_size; n_bytes_left != 0;) {
		int n_chunk = int(min(uint64_t(INT_MAX), n_bytes_left)); // at most INT_MAX bytes at a time
		if(fwrite(p_cursor, 1, n_chunk, p_fw) != n_chunk)
			return false;
		p_cursor += n_chunk;
		n_bytes_left -= n_chunk;
	}
	_ASSERTE(p_cursor == t_data_buffer.p_data + t_data_buffer.n_size);
	// finally the binaries themselves

	return true;
}

/**
 *	@brief swaps the contents of two program instances
 *	@param[in,out] r_t_other is the instance to swap with
 */
inline void CCLProgramStorage::TProgramInstance::Swap(TProgramInstance &r_t_other)
{
	TProgramBinary::Swap(r_t_other);
	// swap the inherited part

	std::swap(t_build_opts_hash, r_t_other.t_build_opts_hash);
	// swap the build options hashes
}

/*
 *								=== ~CCLProgramStorage::TProgramInstance ===
 */

/*
 *								=== CCLProgramStorage::TProgramInstanceEx ===
 */

/**
 *	@brief downloads program binaries and records which devices they were built for
 *
 *	The devices of the program are looked up in the list of context devices
 *	and their positions in this list are stored in device_index_list.
 *
 *	@param[in] h_program is the program to get the binaries from
 *	@param[in] n_context_device_num is the number of elements of p_context_device_list
 *	@param[in] p_context_device_list is the list of devices the stored indices refer to
 *	@param[in] t_build_options_hash is hash of the build options the program was built with
 *	@param[in] _t_source_hash is hash of the program source code
 *
 *	@return Returns true on success, false on failure (OpenCL error, not enough memory,
 *		program without devices or a program device not found in the context device list).
 */
bool CCLProgramStorage::TProgramInstanceEx::Download(cl_program h_program,
	size_t n_context_device_num, const cl_device_id *p_context_device_list,
	_TSHA1 t_build_options_hash, _TSHA1 _t_source_hash)
{
	t_source_hash = _t_source_hash;
	// remember this

	device_index_list.clear(); // !!

	cl_uint n_device_num;
	if(clGetProgramInfo(h_program, CL_PROGRAM_NUM_DEVICES,
	   sizeof(cl_uint), &n_device_num, NULL) != CL_SUCCESS ||
	   !n_device_num || n_device_num > n_context_device_num)
		return false;
	// get the number of program devices (the size passed must be the size of the cl_uint
	// being written; a program without devices has no binaries to download and would
	// require taking address of an element of an empty vector below)

	std::vector<cl_device_id> program_device_list;
	if(!stl_ut::Resize_To_N(program_device_list, n_device_num) ||
	   !stl_ut::Resize_To_N(device_index_list, n_device_num) ||
	   clGetProgramInfo(h_program, CL_PROGRAM_DEVICES, sizeof(cl_device_id) *
	   n_device_num, &program_device_list.front(), 0))
		return false;
	// get program devices

	const cl_device_id *p_context_device_end = p_context_device_list + n_context_device_num;
	for(size_t i = 0; i < n_device_num; ++ i) {
		const size_t n_index = std::find(p_context_device_list,
			p_context_device_end, program_device_list[i]) - p_context_device_list;
		if(n_index >= n_context_device_num) { // std::find() returns the end of the list if not found
			fprintf(stderr, "error: device id not on the context devices list\n");
			// if this happens, will just have to store strings everywhere
			return false;
		}
		device_index_list[i] = uint32_t(n_index);
	}
	// resolve device indices

	return TProgramInstance::Download(h_program, n_device_num, t_build_options_hash);
}

/**
 *	@brief reads an extended program instance from a file
 *
 *	Reads the source code hash, the number of devices (32-bit), the device
 *	indices (32-bit each) and then the rest using TProgramInstance::Read().
 *
 *	@param[in] p_fr is the file to read from (opened for binary reading)
 *
 *	@return Returns true on success, false on failure.
 *
 *	@note The device indices are not validated here.
 */
inline bool CCLProgramStorage::TProgramInstanceEx::Read(FILE *p_fr)
{
	device_index_list.clear(); // !!

	if(fread(&t_source_hash, sizeof(_TSHA1), 1, p_fr) != 1)
		return false;
	// read source code hash

	uint32_t n_device_num;
	if(fread(&n_device_num, sizeof(uint32_t), 1, p_fr) != 1 ||
	   !stl_ut::Resize_To_N(device_index_list, n_device_num))
		return false;
	if(n_device_num && fread(&device_index_list.front(),
	   sizeof(uint32_t), n_device_num, p_fr) != n_device_num)
		return false;
	// read device list (front() must not be called on an empty vector)

	if(!TProgramInstance::Read(p_fr, n_device_num))
		return false;
	// read the rest

	return true;
}

/**
 *	@brief writes this extended program instance to a file
 *
 *	Writes the source code hash, the number of devices (32-bit), the device
 *	indices (32-bit each) and then the rest using TProgramInstance::Write().
 *
 *	@param[in] p_fw is the file to write to (opened for binary writing)
 *
 *	@return Returns true on success, false on failure (i/o error, the number
 *		of device indices not matching the number of binaries or exceeding 32 bits).
 */
inline bool CCLProgramStorage::TProgramInstanceEx::Write(FILE *p_fw)
{
	if(fwrite(&t_source_hash, sizeof(_TSHA1), 1, p_fw) != 1)
		return false;
	// write source code hash

	size_t n_device_num = device_index_list.size();
	if(n_device_num != binary_size_list.size() || n_device_num > UINT32_MAX)
		return false; // that's a problem
	uint32_t n_device_num32 = uint32_t(n_device_num);
	//
	if(fwrite(&n_device_num32, sizeof(uint32_t), 1, p_fw) != 1)
		return false;
	if(n_device_num && fwrite(&device_index_list.front(),
	   sizeof(uint32_t), n_device_num, p_fw) != n_device_num)
		return false;
	// write device list (front() must not be called on an empty vector)

	return TProgramInstance::Write(p_fw, n_device_num);
}

/**
 *	@brief swaps the contents of two extended program instances
 *	@param[in,out] r_t_other is the instance to swap with
 */
inline void CCLProgramStorage::TProgramInstanceEx::Swap(TProgramInstanceEx &r_t_other)
{
	device_index_list.swap(r_t_other.device_index_list);
	std::swap(t_source_hash, r_t_other.t_source_hash); // !!
	// swap the members added by this class

	TProgramInstance &r_t_other_base = r_t_other;
	TProgramInstance::Swap(r_t_other_base);
	// swap the inherited part
}

/**
 *	@brief checks whether this instance was built for the given devices
 *
 *	@param[in] r_device_indices is list of device indices to compare to
 *
 *	@return Returns true if device_index_list contains the same indices
 *		in the same order, otherwise returns false.
 *
 *	@note If all the indices are present but in a different order, an error
 *		is printed to stderr and false is returned (reordering of the binaries
 *		is not implemented).
 */
bool CCLProgramStorage::TProgramInstanceEx::b_CheckDeviceIds(const std::vector<size_t> &r_device_indices)
{
	const size_t n_index_num = device_index_list.size();
	if(n_index_num != r_device_indices.size())
		return false; // built for a different number of devices

	//if(device_index_list == r_device_indices) // not the same type, msvc60 won't compare
	//	return true; // good, exactly the same
	const bool b_identical = std::mismatch(device_index_list.begin(),
		device_index_list.end(), r_device_indices.begin()).first == device_index_list.end();
	if(b_identical)
		return true;
	// the same devices in the same order, that's what we want

	for(size_t n_pos = 0; n_pos < n_index_num; ++ n_pos) {
		if(device_index_list[n_pos] == r_device_indices[n_pos])
			continue; // this one is in place
		std::vector<uint32_t>::const_iterator p_found_it = std::find(device_index_list.begin(),
			device_index_list.end(), r_device_indices[n_pos]);
		if(p_found_it == device_index_list.end())
			return false; // this device is not there at all
		// found at a different position; would need to swap
		// the order of the binaries and sizes and offsets
	}
	// see if the devices are at least there, only in a different order

	fprintf(stderr, "error: there seems to be a binary "
		"available but the order of the devices is wrong\n"); // see this? implement swapping the order.
	return false; // i guess the device order should not change much
}

/**
 *	@brief checks whether this instance was built for the given devices, using device id strings
 *
 *	@param[in] r_device_id_list is list of device id strings,
 *		indexed by the values stored in device_index_list
 *	@param[in] r_built_for_device_list is list of device id strings to compare to
 *
 *	@return Returns true if the device ids referenced by device_index_list equal
 *		the ids in r_built_for_device_list (in the same order), otherwise returns false.
 *
 *	@note A stored index that lies outside of r_device_id_list is treated as a mismatch.
 */
bool CCLProgramStorage::TProgramInstanceEx::b_CheckDeviceIds(const std::vector<std::string> &r_device_id_list,
	const std::vector<std::string> &r_built_for_device_list) const
{
	if(device_index_list.size() != r_built_for_device_list.size())
		return false; // this is built for a different number of devices
	for(size_t i = 0, n = device_index_list.size(); i < n; ++ i) {
		if(device_index_list[i] >= r_device_id_list.size())
			return false; // the stored index does not refer to any known device
		const std::string &r_s_my_dev = r_device_id_list[device_index_list[i]];
		if(r_built_for_device_list[i] != r_s_my_dev)
			return false; // device list mismatch
	}
	return true;
}

/*
 *								=== ~CCLProgramStorage::TProgramInstanceEx ===
 */

/*
 *								=== CCLProgramStorage ===
 */

/**
 *	@brief default constructor; creates an empty storage with a valid header magic
 */
CCLProgramStorage::CCLProgramStorage()
	:m_b_dirty(false)
{
	memset(&m_t_header, 0, sizeof(TFileHeader));
	// start with a blank header

	static const char p_s_magic[4] = {'L', 'C', 'L', 's'};
	for(int n_char = 0; n_char < 4; ++ n_char)
		m_t_header.p_magic[n_char] = p_s_magic[n_char];
	// set magic word ("Lame CL storage") - so b_Status() doesn't fail
}

/**
 *	@brief constructor; loads the storage from a file
 *	@param[in] p_s_filename is name of the file to load (passed to Load())
 *	@note If Load() fails, the header is cleared to mark the error.
 */
CCLProgramStorage::CCLProgramStorage(const char *p_s_filename)
	:m_b_dirty(false)
{
	const bool b_loaded = Load(p_s_filename);
	if(b_loaded)
		return;
	// loaded fine, nothing else to do

	memset(&m_t_header, 0, sizeof(TFileHeader)); // to mark error
}

/**
 *	@brief destructor; calls DeleteInstance() on each of the stored program instances
 */
CCLProgramStorage::~CCLProgramStorage()
{
	for(size_t n_inst = 0, n_inst_num = m_instance_list.size(); n_inst < n_inst_num; ++ n_inst)
		DeleteInstance(m_instance_list[n_inst]);
}

/**
 *	@brief gets the status of the storage
 *	@return Returns true if the header contains the correct magic word ("LCLs"),
 *		otherwise returns false (the header is cleared if loading fails).
 */
bool CCLProgramStorage::b_Status() const
{
	return m_t_header.p_magic[0] == 'L' && m_t_header.p_magic[1] == 'C' &&
		m_t_header.p_magic[2] == 'L' && m_t_header.p_magic[3] == 's'; // all four characters must match
}

/**
 *	@brief gets the value of the dirty flag
 *	@return Returns true if the dirty flag is set, otherwise returns false.
 */
bool CCLProgramStorage::b_Dirty() const
{
	/*_TSHA1 t_ctrl_hash;
	if(!Calc_ControlHash(t_ctrl_hash))
		fprintf(stderr, "hash check failed ...\n"); // debug
	if(m_b_dirty != (t_ctrl_hash != m_t_header.t_control_hash)) {
		fprintf(stderr, "dirty discrepancy (dirty %d, hash-check %d) ...\n",
			m_b_dirty, t_ctrl_hash != m_t_header.t_control_hash); // debug
	}*/ // debug (disabled consistency check of the flag against the control hash)

	const bool b_modified = m_b_dirty;
	return b_modified;
}

/*bool CCLProgramStorage::b_Check_DeviceIds(size_t n_device_num, const cl_device_id *p_device_list) const
{
	if(m_device_id_list.size() != n_device_num)
		return false;
	// counts must match

	std::string s_tmp0, s_tmp1;
	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
		if(CCLDeviceParams::n_GetDeviceInfoString(s_tmp0,
		   p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS ||
		   CCLDeviceParams::n_GetDeviceInfoString(s_tmp1,
		   p_device_list[i], CL_DRIVER_VERSION) != CL_SUCCESS ||
		   !stl_ut::AppendCStr(s_tmp0, "|") ||
		   !stl_ut::Append(s_tmp0, s_tmp1))
			return false; // this is, in fact, error, not npos
		if(s_tmp0 != m_device_id_list[i])
			return false;
	}
	// check device identification

	return true;
}*/

/**
 *	@brief checks the source code for the presence of \#include directives
 *
 *	Looks for the word "include", preceded by '#' and optionally by whitespace
 *	between the two (on the same line). Directives preceded by "//" on the same
 *	line are ignored; multiline comments are not recognized.
 *
 *	@param[in] p_s_source_code is null-terminated source code (can be null)
 *
 *	@return Returns true if no \#include directive was found (or if the source
 *		code is null), otherwise prints a warning to stderr and returns false.
 */
static bool b_External_File_Check(const char *p_s_source_code)
{
	if(!p_s_source_code)
		return true; // passed, no code to speak of, let alone includes
	const char *p_s_include, *p_s_begin = p_s_source_code, *p_s_end = p_s_source_code + strlen(p_s_source_code);
	while((p_s_include = strstr(p_s_source_code, "include")) != 0) {
		const char *p_s_hashtag = p_s_include;
		while(p_s_hashtag > p_s_source_code &&
		   isspace(uint8_t(*(p_s_hashtag - 1))) && *(p_s_hashtag - 1) != '\n')
			-- p_s_hashtag;
		// skip the whitespace between '#' and "include" (need to look at the preceding
		// character, not at the current character minus one), do not cross to the previous line
		_ASSERTE(p_s_hashtag >= p_s_source_code);
		if(p_s_hashtag == p_s_source_code || *(p_s_hashtag - 1) != '#') {
			p_s_source_code = p_s_include + 7/*strlen("include")*/;
			continue;
		}
		p_s_include = p_s_hashtag - 1;
		// look for the start

		size_t n_line = std::count(p_s_begin, p_s_include, '\n');
		const char *p_s_line_begin = p_s_include;
		while(p_s_line_begin > p_s_begin && *(p_s_line_begin - 1) != '\n')
			-- p_s_line_begin; // find where the line starts
		// note that this must be relative to the beginning of the source code rather than
		// to where the search currently is, otherwise a comment at the beginning of the line could be missed
		const char *p_s_line_end = std::find(p_s_line_begin, p_s_end, '\n');
		// find start and end of the include line

		const char *p_s_comment;
		if((p_s_comment = strstr(p_s_line_begin, "//")) != 0 && p_s_comment < p_s_include) {
			_ASSERTE(p_s_comment >= p_s_line_begin && p_s_comment < p_s_line_end - 1);
			p_s_source_code = p_s_line_end; // continue looking from here
			continue;
		}
		// see if the include is perhaps commented out (multiline comments not supported yet)

		std::string s_line;
		try {
			s_line.insert(s_line.begin(), p_s_line_begin, p_s_line_end);
			stl_ut::TrimSpace(s_line); // might contain '\r'
		} catch(std::bad_alloc&) {
			s_line.erase(); // ignore errors here
		}
		fprintf(stderr, "warning: source code contains #include directives: "
			"bypassing cache\nline " PRIsize ": \'%s\'\n", n_line, s_line.c_str());
		// report to the user (the line number is zero-based)

		return false;
		// found an include, failed the test
	}
	return true;
}

size_t CCLProgramStorage::n_Find_ProgramBinaries(const char *p_s_source_code, size_t n_device_num,
	const cl_device_id *p_device_list, const char *p_s_build_options) const
{
	if(!b_External_File_Check(p_s_source_code))
		return npos;
	// in case there are #includes, need to compile every time (the included files may change)
	// todo - hash all the includes as well

	/*if(m_device_id_list.size() != n_device_num ||
	   m_t_header.t_source_hash != t_Hash_String(p_s_source_code) || // now the checks are with instances
	   !b_Check_DeviceIds(n_device_num, p_device_list))
		return npos;*/
	// make sure there are the same devices (that's kind of tricky though, this is limited
	// to number of devices as there's no device id that could be written to the file (except
	// maybe hash of name and drivers)) and the same source code // @t_odo - implement better device identification

	_TSHA1 t_source_hash = t_Hash_String(p_s_source_code);
	_TSHA1 t_build_opts_hash = t_Hash_String(p_s_build_options);
	// hash build options

	std::vector<size_t> device_indices;
	bool b_new_devs;
	if(!Map_DeviceIds(device_indices, b_new_devs, n_device_num, p_device_list) || b_new_devs)
		return npos;
	// get device ids which are comparable to the stored ones

	_ASSERTE(device_indices.size() == n_device_num);
#ifdef _DEBUG
	for(size_t i = 0, n = device_indices.size(); i < n; ++ i) {
		std::vector<std::string> context_device_i;
		_ASSERTE(!Convert_DeviceIds(context_device_i, 1, &p_device_list[i]) || // either we run out of memory
			context_device_i.front() == m_device_id_list[device_indices[i]]); // or make sure this is the same device
	}
#endif // _DEBUG

	for(size_t i = 0, n = m_instance_list.size(); i < n; ++ i) {
		if(m_instance_list[i].t_build_opts_hash == t_build_opts_hash &&
		   m_instance_list[i].t_source_hash == t_source_hash && // now the check is with instances
		   m_instance_list[i].b_CheckDeviceIds(device_indices)) {
			if(i) {
				/*fprintf(stderr, "promoting binaries %d\n", i);*/ // debug
				m_instance_list[i].Swap(m_instance_list[i - 1]);
				-- i;
				// promote this instance, effectively invalidating any indices obtained by previous calls to this functions

				m_b_dirty = true;
				// we just changed order of binaries
			}
			/*fprintf(stderr, "requested binaries found: %d (\'%s\')\n", i, p_s_build_options);*/ // debug
			return i;
		}
	}
	// try to find the program binary

	/*fprintf(stderr, "requested binaries not found (\'%s\')\n", p_s_build_options);*/ // debug

	return npos;
	// not found
}

/**
 *	@brief creates an OpenCL program from the stored binaries
 *
 *	@param[in] h_context is the context to create the program in
 *	@param[out] p_program is pointer to the program handle
 *	@param[out] r_n_result is the OpenCL result code
 *	@param[in] n_binaries_index is index of the program instance
 *		(as returned by n_Find_ProgramBinaries())
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *
 *	@return Returns build_InvalidParams if the index is not valid or if the number
 *		of devices does not match, otherwise returns the result of TProgramBinary::n_Upload().
 *
 *	@note The identity of the devices is only checked in debug builds.
 */
int CCLProgramStorage::n_Get_ProgramBinaries(cl_context h_context, cl_program *p_program, CLresult &r_n_result,
	size_t n_binaries_index, size_t n_device_num, const cl_device_id *p_device_list) const
{
	if(n_binaries_index == npos)
		return build_InvalidParams;
	if(n_binaries_index >= m_instance_list.size()) //|| m_device_id_list.size() != n_device_num
		return build_InvalidParams;
	// the index must refer to an existing instance

	const TProgramInstanceEx &r_t_instance = m_instance_list[n_binaries_index];
	if(n_device_num != r_t_instance.device_index_list.size())
		return build_InvalidParams;
	// the instance must be built for the same number of devices

#ifdef _DEBUG
	{
		bool b_unknown_devices;
		std::vector<size_t> storage_device_indices;
		if(!Map_DeviceIds(storage_device_indices, b_unknown_devices, n_device_num, p_device_list))
			return build_LowMemory;
		if(b_unknown_devices)
			return build_InvalidParams;
		if(std::mismatch(r_t_instance.device_index_list.begin(), r_t_instance.device_index_list.end(),
		   storage_device_indices.begin()).first != r_t_instance.device_index_list.end())
			return build_InvalidParams;
	}
#endif // _DEBUG
	//_ASSERTE(b_Check_DeviceIds(n_device_num, p_device_list)); // not easily checked anymore
	// device identification isn't checked in release, it's checked in n_Find_ProgramBinaries(),
	// this function is supposed to get the same parameters

	return r_t_instance.n_Upload(h_context, p_program, r_n_result, n_device_num, p_device_list);
}

/**
 *	@brief converts OpenCL device handles to strings which can be stored and compared
 *
 *	The strings are in the format "device name|driver version|platform name.device index",
 *	where device index is the position of the device in the list returned
 *	by CCLUtils::n_GetDeviceList() for the platform of the device.
 *
 *	@param[out] r_storeable_device_ids is filled with the device id strings
 *		(one per device; cleared upon failure inside the conversion loop)
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *
 *	@return Returns true on success, false on failure (OpenCL error, not enough
 *		memory or device not found on the list of devices of its platform).
 */
bool CCLProgramStorage::Convert_DeviceIds(std::vector<std::string> &r_storeable_device_ids,
	size_t n_device_num, const cl_device_id *p_device_list)
{
	if(!stl_ut::Resize_To_N(r_storeable_device_ids, n_device_num))
		return false;
	try {
		std::map<cl_platform_id, std::vector<cl_device_id> > device_map; // lists of platform devices, each queried only once
		for(size_t i = 0; i < n_device_num; ++ i) {
			cl_platform_id h_platform;
			std::string s_dev_platform, s_dev_platform_idx;
			std::string s_dev_driver;
			size_t n_index;
			// the below condition relies on short-circuit evaluation: get platform, get platform name,
			// get the list of platform devices (unless cached), find the device in this list,
			// get device name, get driver version and concatenate all that
			if(clGetDeviceInfo(p_device_list[i], CL_DEVICE_PLATFORM,
			   sizeof(cl_platform_id), &h_platform, 0) != CL_SUCCESS ||
			   CCLUtils::n_GetPlatformInfoString(s_dev_platform,
			   h_platform, CL_PLATFORM_NAME) != CL_SUCCESS ||
			   (!device_map.count(h_platform) && CCLUtils::n_GetDeviceList(h_platform,
			   device_map[h_platform], CL_DEVICE_TYPE_ALL) != CL_SUCCESS) ||
			   (n_index = std::find(device_map[h_platform].begin(), device_map[h_platform].end(),
			   p_device_list[i]) - device_map[h_platform].begin()) == device_map[h_platform].size() ||
			   CCLDeviceParams::n_GetDeviceInfoString(r_storeable_device_ids[i],
			   p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS ||
			   CCLDeviceParams::n_GetDeviceInfoString(s_dev_driver,
			   p_device_list[i], CL_DRIVER_VERSION) != CL_SUCCESS ||
			   !stl_ut::AppendCStr(r_storeable_device_ids[i], "|") ||
			   !stl_ut::Append(r_storeable_device_ids[i], s_dev_driver) ||
			   !stl_ut::AppendCStr(r_storeable_device_ids[i], "|") ||
			   !stl_ut::Append(r_storeable_device_ids[i], s_dev_platform) ||
			   !stl_ut::Format(s_dev_platform_idx, "." PRIsize, n_index) ||
			   !stl_ut::Append(r_storeable_device_ids[i], s_dev_platform_idx)) {
				r_storeable_device_ids.clear(); // do not leave partial results behind
				return false;
			}
		}
	} catch(std::bad_alloc&) { // device_map operations may throw
		r_storeable_device_ids.clear();
		return false;
	}
	// get list of device id's in the format "device name|driver version|platform name.device index"
	// (supposedly unique enough id's)

	return true;
}

/**
 *	@brief maps the given devices to indices to m_device_id_list
 *
 *	This creates a mapping of the given devices to the ids in m_device_id_list
 *	while taking care of not mapping two devices with the same id to the same index.
 *
 *	@param[out] r_mapped_device_indices is filled with indices to m_device_id_list
 *		(one per device; size_t(-1) for devices which are not in the storage)
 *	@param[out] r_b_have_new_devices is set if some of the devices are not in the storage
 *		(or if their entry in the storage was already taken by a preceding device)
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *
 *	@return Returns true on success, false on failure (conversion of the device ids
 *		failed or not enough memory).
 */
bool CCLProgramStorage::Map_DeviceIds(std::vector<size_t> &r_mapped_device_indices,
	bool &r_b_have_new_devices, size_t n_device_num, const cl_device_id *p_device_list) const
{
	r_b_have_new_devices = false;

	std::vector<std::string> device_ids;
	if(!Convert_DeviceIds(device_ids, n_device_num, p_device_list))
		return false;
	// get comparable ids

	std::vector<bool> device_covered; // flags of the stored devices which were already mapped
	const size_t n_orig_device_num = m_device_id_list.size();
	if(!stl_ut::Resize_To_N(device_covered, n_orig_device_num, false) ||
	   !stl_ut::Resize_To_N(r_mapped_device_indices, device_ids.size()))
		return false;
	for(size_t i = 0, n = device_ids.size(); i < n; ++ i) {
		std::vector<std::string>::const_iterator p_dev_it =
			std::find(m_device_id_list.begin(), m_device_id_list.end(), device_ids[i]); // finds the first stored device with this id
		size_t n_idx = p_dev_it - m_device_id_list.begin();
		if(n_idx == n_orig_device_num || device_covered[n_idx]) { // not found or already taken
			r_mapped_device_indices[i] = size_t(-1); // this device is not present in the storage
			r_b_have_new_devices = true;
		} else if(n_idx < n_orig_device_num) {
			_ASSERTE(!device_covered[n_idx]);
			//device_covered[n_idx].flip();
			device_covered[n_idx] = true; // msvc60 doesn't know flip()
			r_mapped_device_indices[i] = n_idx; // this device is found in the storage
		}
	}
	// map the device ids onto the device ids contained in the storage

	return true;
}

/**
 *	@brief merges the given devices to m_device_id_list
 *
 *	This needs to produce a list as big as m_device_id_list with matching device ids,
 *	possibly containing null ids and possibly extending m_device_id_list list with new devices.
 *
 *	@param[out] r_merge_device_list is filled with device handles, one for each
 *		element of m_device_id_list (null handle for the stored devices which are
 *		not among the given devices)
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *
 *	@return Returns true on success, false on failure (conversion of the device ids
 *		failed or not enough memory).
 *
 *	@note If this fails after some ids were added, m_device_id_list is not restored.
 */
bool CCLProgramStorage::Merge_DeviceIds(std::vector<cl_device_id> &r_merge_device_list,
	size_t n_device_num, const cl_device_id *p_device_list)
{
	std::vector<std::string> device_ids;
	if(!Convert_DeviceIds(device_ids, n_device_num, p_device_list))
		return false;
	// get comparable ids

	std::vector<bool> device_covered; // in the first pass, flags of the originally stored devices which were matched
	const size_t n_orig_device_num = m_device_id_list.size();
	if(!stl_ut::Resize_To_N(device_covered, n_orig_device_num, false))
		return false;
	for(size_t i = 0, n = device_ids.size(); i < n; ++ i) {
		std::vector<std::string>::iterator p_dev_it =
			std::find(m_device_id_list.begin(), m_device_id_list.end(), device_ids[i]);
		size_t n_idx = p_dev_it - m_device_id_list.begin();
		if(/*p_dev_it == m_device_id_list.end() ||*/ n_idx >= n_orig_device_num || device_covered[n_idx]) { // all newly added ones are automatically covered
			if(!stl_ut::Resize_Add_1More(m_device_id_list, device_ids[i])) // p_dev_it is not used after this
				return false;
		} else if(n_idx < n_orig_device_num) {
			_ASSERTE(!device_covered[n_idx]);
			//device_covered[n_idx].flip();
			device_covered[n_idx] = true; // msvc60 doesn't know flip()
		}
	}
	// extend m_device_id_list with the new ids (add at the end so as to
	// not invalidate the device indices in TProgramInstanceEx
	// note that this extension counts on duplicate devices (e.g. a computer with two identical GPUs)

	if(!stl_ut::Resize_To_N(r_merge_device_list, m_device_id_list.size()) ||
	   !stl_ut::Reserve_N(device_covered, m_device_id_list.size()))
		return false;
	device_covered.assign(m_device_id_list.size(), false); // clear coverage
	// allocate the list (in the second pass, device_covered is indexed
	// by positions in device_ids rather than in m_device_id_list)

	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
		std::vector<std::string>::iterator p_dev_it = device_ids.begin();
		for(;;) {
			_ASSERTE(p_dev_it <= device_ids.end());
			p_dev_it = std::find(p_dev_it, device_ids.end(), m_device_id_list[i]);
			if(p_dev_it == device_ids.end())
				break; // not found
			if(!device_covered[p_dev_it - device_ids.begin()]) {
				//device_covered[p_dev_it - device_ids.begin()].flip();
				device_covered[p_dev_it - device_ids.begin()] = true; // msvc60 doesn't know flip()
				break; // found, not covered
			}
			++ p_dev_it; // found but covered, have to look for the next one
		}
		if(p_dev_it == device_ids.end())
			r_merge_device_list[i] = (cl_device_id)0; // this was present previously but is not present in the context we are currently running so this is a null id
		else
			r_merge_device_list[i] = p_device_list[p_dev_it - device_ids.begin()]; // this is in the list and we can associate it to a device in the current context
	}
	// associate the device

	return true;
}

/**
 *	@brief calculates hash of the program build environment
 *
 *	The hash is calculated from the source code, followed by the device id
 *	strings (as generated by Convert_DeviceIds()), followed by the build options.
 *
 *	@param[out] r_t_hash is filled with the hash
 *	@param[in] p_s_source_code is null-terminated source code
 *		(can be null, in which case it is hashed as an empty string)
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *	@param[in] p_s_build_options is null-terminated string with the build options
 *		(can be null, in which case it is hashed as an empty string)
 *
 *	@return Returns true on success, false on failure (conversion of the device ids failed).
 */
bool CCLProgramStorage::Get_ProgramBuildEnvHash(TSHA1 &r_t_hash, const char *p_s_source_code, size_t n_device_num,
	const cl_device_id *p_device_list, const char *p_s_build_options)
{
	std::vector<std::string> dev_ids;
	if(!CCLProgramStorage::Convert_DeviceIds(dev_ids, n_device_num, p_device_list))
		return false;
	CStreamHash<TSHA1> hash;
	if(p_s_source_code) // strlen() must not be called with a null pointer
		hash.Process_Data(p_s_source_code, strlen(p_s_source_code) * sizeof(char));
	for(size_t i = 0, n = dev_ids.size(); i < n; ++ i)
		hash.Process_Data(dev_ids[i].data(), dev_ids[i].length() * sizeof(char));
	if(p_s_build_options) // build options are commonly null
		hash.Process_Data(p_s_build_options, strlen(p_s_build_options) * sizeof(char));
	r_t_hash = hash.t_Result();
	return true;
}

/**
 *	@brief generates source code of a kernel which writes the given hash to a buffer
 *
 *	The kernel is called Debug_Get_ProgramBuildEnv and takes a single
 *	argument, a pointer to a global buffer of (at least) five unsigned integers.
 *	The generated code ends with a "#line 1" directive.
 *
 *	@param[out] r_s_kernel is filled with the kernel source code
 *	@param[in] t_build_env_hash is the hash to be embedded in the kernel
 *
 *	@return Returns the result of stl_ut::Format().
 */
bool CCLProgramStorage::Get_ProgramBuildEnv_DebugKernel(std::string &r_s_kernel, TSHA1 t_build_env_hash)
{
	static const char *p_s_kernel_format =
		"__kernel void Debug_Get_ProgramBuildEnv(__global unsigned int *p_hash)\n"
		"{\n"
		"    if(!get_global_id(0)) {\n"
		"        p_hash[0] = 0x%08x;\n"
		"        p_hash[1] = 0x%08x;\n"
		"        p_hash[2] = 0x%08x;\n"
		"        p_hash[3] = 0x%08x;\n"
		"        p_hash[4] = 0x%08x;\n"
		"    }"
		"}\n"
		"#line 1\n";
	// template of the kernel, takes the five words of the hash

	return stl_ut::Format(r_s_kernel, p_s_kernel_format,
		t_build_env_hash[0], t_build_env_hash[1], t_build_env_hash[2],
		t_build_env_hash[3], t_build_env_hash[4]);
}

/**
 *	@brief verifies that a program contains the expected build environment hash
 *
 *	Runs the Debug_Get_ProgramBuildEnv kernel of the program on the first
 *	device of the list and compares the five words it writes to the given hash.
 *
 *	@param[out] r_b_verify_result is the result of the comparison (set to false
 *		on failure; only valid if this function returns true)
 *	@param[in] h_context is the context the program was created in
 *	@param[in] n_device_num is the number of devices (must be nonzero)
 *	@param[in] p_device_list is the list of n_device_num devices (only the first one is used)
 *	@param[in] h_program is the program to verify
 *	@param[in] t_build_env_hash is the expected hash
 *
 *	@return Returns true if the verification could be carried out, false
 *		on failure (no devices or an OpenCL error).
 */
bool CCLProgramStorage::Verify_ProgramBuildEnv(bool &r_b_verify_result, cl_context h_context,
	size_t n_device_num, const cl_device_id *p_device_list,
	cl_program h_program, TSHA1 t_build_env_hash)
{
	r_b_verify_result = false;
	uint32_t p_hash[5];

	{
		if(!n_device_num)
			return false; // nothing to run at
		cl_int n_result;
		CCLUniqueCommandqueue cmd_queue(clCreateCommandQueue(h_context, p_device_list[0],
			CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &n_result));
		if(n_result != CL_SUCCESS)
			return false;
		CCLUniqueKernel kernel(clCreateKernel(h_program, "Debug_Get_ProgramBuildEnv", &n_result));
		if(n_result != CL_SUCCESS)
			return false;
		CCLUniqueMem dp_hash(clCreateBuffer(h_context,
			CL_MEM_READ_WRITE, 5 * sizeof(uint32_t), 0, &n_result));
		if(n_result != CL_SUCCESS)
			return false;
		// create a command queue, the kernel and a buffer for the five words of the hash

		size_t n_wg_size;
		if(clGetKernelWorkGroupInfo(kernel, p_device_list[0],
		   CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
		   sizeof(size_t), &n_wg_size, 0) != CL_SUCCESS || !n_wg_size)
			n_wg_size = 1; // just try
		n_result = clCall1D1(cmd_queue, kernel, n_wg_size, n_wg_size, dp_hash);
		if(n_result != CL_SUCCESS)
			return false;
		n_result = cmd_queue.n_Finish(); // the queue is out-of-order, wait before reading the buffer
		if(n_result != CL_SUCCESS)
			return false;
		n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(p_hash, dp_hash, 0, 5 * sizeof(uint32_t));
		// NOTE(review): p_hash is read right below, this presumably is a blocking copy - confirm
		if(n_result != CL_SUCCESS)
			return false;
	}
	// run a simple OpenCL kernel to verify that the correct binaries are being used

	r_b_verify_result = true;
	for(int i = 0; i < 5; ++ i) {
		if(t_build_env_hash[i] != p_hash[i]) {
			r_b_verify_result = false;
			break;
		}
	}
	// compare the hash written by the kernel to the expected one

	return true;
}

/**
 *	@brief puts binaries of a compiled program to the storage
 *
 *	@param[in] h_program is the program to store (must be successfully built for all the devices)
 *	@param[in] p_s_source_code is the program source code
 *	@param[in] n_device_num is the number of devices
 *	@param[in] p_device_list is the list of n_device_num devices
 *	@param[in] p_s_build_options is the string with the build options
 *
 *	@return Returns true on success (also if the binaries were already in the storage),
 *		false on failure (source code contains \#include directives, the program is not
 *		built for all the devices, not enough memory or an OpenCL error).
 *
 *	@note The new instance is inserted at the beginning of the instance list,
 *		invalidating indices returned by n_Find_ProgramBinaries().
 *	@note The dirty flag is set and m_device_id_list may be extended even if this fails later.
 */
bool CCLProgramStorage::Put_ProgramBinaries(cl_program h_program, const char *p_s_source_code,
	size_t n_device_num, const cl_device_id *p_device_list, const char *p_s_build_options)
{
	if(!b_External_File_Check(p_s_source_code))
		return false;
	// don't store a program with external includes (those might change)

	// puts compiled program binaries to the storage
	// number of instances is limited (LRU-type policy), but not here
	// note this might erase all other instances in case source code changed

	for(size_t i = 0; i < n_device_num; ++ i) {
		cl_build_status n_status;
		if(clGetProgramBuildInfo(h_program, p_device_list[i], CL_PROGRAM_BUILD_STATUS,
		   sizeof(cl_build_status), &n_status, NULL) != CL_SUCCESS || n_status != CL_BUILD_SUCCESS)
			return false;
	}
	// the program must be completely and successfuly built

	if(n_Find_ProgramBinaries(p_s_source_code, n_device_num, p_device_list, p_s_build_options) != npos)
		return true;
	// it's already there

	m_b_dirty = true;
	// we're going to modify the binaries

	_TSHA1 t_source_hash = t_Hash_String(p_s_source_code);
	// calculate hash of source code

	std::vector<cl_device_id> context_device_list;
	if(!Merge_DeviceIds(context_device_list, n_device_num, p_device_list))
		return false;
	// get device ids in the order matching what is stored in this binary
	// (this may extend m_device_id_list; devices not in p_device_list are null)

	_ASSERTE(context_device_list.size() == m_device_id_list.size());
#ifdef _DEBUG
	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
		std::vector<std::string> context_device_i;
		if(context_device_list[i] != (cl_device_id)0) {
			_ASSERTE(!Convert_DeviceIds(context_device_i, 1, &context_device_list[i]) || // either we run out of memory
				context_device_i.front() == m_device_id_list[i]); // or make sure this is the same device
		}
	}
#endif // _DEBUG

	/*if(m_device_id_list.size() != n_device_num || m_t_header.t_source_hash != t_source_hash ||
	   !b_Check_DeviceIds(n_device_num, p_device_list)) { // this is costy, make it at the end
		std::for_each(m_instance_list.begin(), m_instance_list.end(), DeleteInstance);
		m_instance_list.clear();
		m_device_id_list.clear();

		m_t_header.t_source_hash = t_source_hash;

		try {
			m_device_id_list.resize(n_device_num);
		} catch(std::bad_alloc&) {
			return false;
		}
		for(size_t i = 0; i < n_device_num; ++ i) {
			std::string s_dev_driver;
			if(CCLDeviceParams::n_GetDeviceInfoString(m_device_id_list[i],
			   p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS ||
			   CCLDeviceParams::n_GetDeviceInfoString(s_dev_driver,
			   p_device_list[i], CL_DRIVER_VERSION) != CL_SUCCESS ||
			   !stl_ut::AppendCStr(m_device_id_list[i], "|") ||
			   !stl_ut::Append(m_device_id_list[i], s_dev_driver)) {
				m_device_id_list.clear();
				return false;
			}
		}
		// get list of device id's (supposedly unique enough id's)
	}
	// in case it is completely different program, it invalidates all the instances*/
	// this does not happen anymore

	try {
		m_instance_list.insert(m_instance_list.begin(), 1, TProgramInstanceEx()); // add new instances at the beginning of the list
	} catch(std::bad_alloc&) {
		return false;
	}
	_TSHA1 t_build_opts_hash = t_Hash_String(p_s_build_options);
	if(!m_instance_list.front().Download(h_program, context_device_list.size(),
	   (context_device_list.empty())? 0 : &context_device_list.front(),
	   t_build_opts_hash, t_source_hash)) {
		m_instance_list.front().Free(); // Download() may leave the data buffer allocated
		m_instance_list.erase(m_instance_list.begin()); // try to clean up so this would still be useful
		return false;
	}
	// add and fill a new program instance

	return true;
}

bool CCLProgramStorage::Save(const char *p_s_filename, size_t n_max_program_instance_num)
{
	{
		size_t n_instance_num = min(n_max_program_instance_num, m_instance_list.size());

		std::vector<size_t> device_id_reindex_list;
		std::vector<size_t> device_id_backindex_list;
		if(!stl_ut::Resize_To_N(device_id_reindex_list, m_device_id_list.size(), size_t(0)) ||
		   !stl_ut::Resize_To_N(device_id_backindex_list, m_device_id_list.size(), size_t(-1)))
			return false;
		for(size_t i = 0; i < n_instance_num; ++ i) {
			const TProgramInstanceEx &r_t_inst = m_instance_list[i];
			for(size_t j = 0, m = r_t_inst.device_index_list.size(); j < m; ++ j) {
				_ASSERTE(r_t_inst.device_index_list[j] < device_id_reindex_list.size()); // this would indicate a corrupt storage
				device_id_reindex_list[r_t_inst.device_index_list[j]] = 1;
			}
		}
		// mark which device ids are used by the first n_max_program_instance_num programs

		size_t n_device_id_usage = 0;
		for(size_t i = 0, n = device_id_reindex_list.size(); i < n; ++ i) {
			if(device_id_reindex_list[i]) {
				device_id_reindex_list[i] = n_device_id_usage; // there
				device_id_backindex_list[n_device_id_usage] = i; // and also reverse
				++ n_device_id_usage;
			} else
				device_id_reindex_list[i] = size_t(-1);
		}
		// calculate indices of the device ids after remapping

		if(n_device_id_usage < m_device_id_list.size()) {
			std::vector<std::string> pruned_device_id_list;
			if(!stl_ut::Resize_To_N(pruned_device_id_list, n_device_id_usage))
				return false;
			for(size_t i = 0, n = device_id_reindex_list.size(); i < n; ++ i) {
				if(device_id_reindex_list[i] != size_t(-1)) {
					if(!stl_ut::Assign(pruned_device_id_list[device_id_reindex_list[i]], m_device_id_list[i]))
						return false;
					_ASSERTE(device_id_backindex_list[device_id_reindex_list[i]] == i); // make sure it is reversible
				}
			}
			// build a list of pruned device ids

			std::swap(pruned_device_id_list, m_device_id_list);
			for(size_t i = 0, n = n_instance_num; i < n; ++ i) {
				TProgramInstanceEx &r_t_inst = m_instance_list[i];
				for(size_t j = 0, m = r_t_inst.device_index_list.size(); j < m; ++ j) {
					size_t n_old_dev_id = r_t_inst.device_index_list[j];
					_ASSERTE(device_id_reindex_list[n_old_dev_id] != size_t(-1)); // otherwise we failed to mark this as used
					size_t n_new_dev_id = device_id_reindex_list[n_old_dev_id];
					_ASSERTE(device_id_backindex_list[n_new_dev_id] == n_old_dev_id); // make sure we can return it to the original state
					r_t_inst.device_index_list[j] = n_new_dev_id;
				}
			}
			// swap to the pruned table and indices

			bool b_result = Write(p_s_filename, n_instance_num);

			for(size_t i = 0, n = n_instance_num; i < n; ++ i) {
				TProgramInstanceEx &r_t_inst = m_instance_list[i];
				for(size_t j = 0, m = r_t_inst.device_index_list.size(); j < m; ++ j) {
					size_t n_new_dev_id = r_t_inst.device_index_list[j];
					_ASSERTE(device_id_backindex_list[n_new_dev_id] != size_t(-1)); // otherwise the inverse mapping is not calculated correctly
					size_t n_old_dev_id = device_id_backindex_list[n_new_dev_id];
					r_t_inst.device_index_list[j] = n_old_dev_id;
				}
			}
			std::swap(pruned_device_id_list, m_device_id_list);
			// swap back!

			return b_result;
		}
	}
	// see if we can prune the device id list before saving

	return Write(p_s_filename, n_max_program_instance_num);
}

/**
 *	@brief writes the header, the device id table and the first program instances to a file
 *
 *	The file consists of the file header (with the magic word and the control hash),
 *	the 32-bit device count, the 32-bit instance count, the null-terminated device ids
 *	and the instances. On success, the dirty flag is cleared unless some instances
 *	were left out because of the n_max_program_instance_num limit.
 *
 *	@param[in] p_s_filename is the output file name (the file is overwritten)
 *	@param[in] n_max_program_instance_num is the maximum number of instances to write
 *
 *	@return Returns true on success, false on failure (the file may be incomplete then).
 */
bool CCLProgramStorage::Write(const char *p_s_filename, size_t n_max_program_instance_num)
{
	size_t n_instance_num_wide = min(n_max_program_instance_num, m_instance_list.size());
	if(m_device_id_list.size() > UINT32_MAX || n_instance_num_wide > UINT32_MAX)
		return false;
	uint32_t n_device_num = uint32_t(m_device_id_list.size());
	uint32_t n_instance_num = uint32_t(n_instance_num_wide);
	// both counts are stored as 32-bit integers; check this before opening
	// the file so that an existing file is not truncated for nothing

	if(!Calc_ControlHash(m_t_header.t_control_hash, n_max_program_instance_num))
		return false;
	// update hash

	FILE *p_fw;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
	if(fopen_s(&p_fw, p_s_filename, "wb"))
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
	if(!(p_fw = fopen(p_s_filename, "wb")))
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		return false;
	CFILE_PtrGuard guard(p_fw); // closes the file when leaving the scope
	// open file

	m_t_header.p_magic[0] = 'L';
	m_t_header.p_magic[1] = 'C';
	m_t_header.p_magic[2] = 'L';
	m_t_header.p_magic[3] = 's';
	// set magic word ("Lame CL storage")

	if(fwrite(&m_t_header, sizeof(TFileHeader), 1, p_fw) != 1 ||
	   fwrite(&n_device_num, sizeof(uint32_t), 1, p_fw) != 1 ||
	   fwrite(&n_instance_num, sizeof(uint32_t), 1, p_fw) != 1)
		return false;
	// write header

	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
		_ASSERTE(m_device_id_list[i].length() < SIZE_MAX);
		_ASSERTE(sizeof(char) == 1); // this better be consistent
		size_t n_length = m_device_id_list[i].length();
		if(fwrite(m_device_id_list[i].c_str(), sizeof(char), n_length + 1, p_fw) != n_length + 1)
			return false;
	}
	// write device names (null-terminated)

	for(size_t i = 0; i < n_instance_num; ++ i) {
		if(!m_instance_list[i].Write(p_fw/*, m_device_id_list.size()*/))
			return false;
	}
	// write instances

	if(fflush(p_fw) != 0 || ferror(p_fw))
		return false;
	// flush explicitly: the data may still sit in the stdio buffer and an error
	// (e.g. a full disk) would otherwise only show up when the guard closes
	// the file, where it can not be reported anymore

	m_b_dirty = (n_max_program_instance_num < m_instance_list.size());
	// the file now contains actual data (in case we didn't trim it)

	/*fprintf(stderr, "saving binaries ...\n");*/ // debug

	return true;
}

/**
 *	@brief converts a hash to a string, for printing
 *
 *	@param[in] t_hash is the hash to be converted (its five data words are used)
 *
 *	@return Returns the result of TSHA1::p_s_ToString(), called on a static buffer;
 *		presumably a pointer to the null-terminated text in that buffer.
 *
 *	@note The buffer is static and is overwritten by the next call; at most one
 *		result can be in use at a time (e.g. a single call per printf()).
 */
const char *CCLProgramStorage::p_s_SHA1(CCLProgramStorage::_TSHA1 t_hash)
{
	static char p_s_hash_text[41];
	// shared output buffer

	TSHA1 t_full_hash;
	int n_word = 0;
	while(n_word < 5) {
		t_full_hash[n_word] = t_hash.p_data[n_word];
		++ n_word;
	}
	// copy the hash words to a TSHA1, which knows how to print itself

	return t_full_hash.p_s_ToString(p_s_hash_text, sizeof(p_s_hash_text));
}

/**
 *	@brief prints the contents of the storage to stdout (debugging aid)
 *
 *	Prints the device id pool and then the stored binaries, grouped by the hash
 *	of their source code. For each binary, its rank (position in the instance list),
 *	the devices it refers to and the hash of its build options are printed.
 *
 *	@note If there is not enough memory to build the grouping, an error message
 *		is printed to stderr and the cache contents are not printed.
 */
void CCLProgramStorage::Dump() const
{
	printf("program storage(" PRIsize " instances stored for " PRIsize " devices)\n",
		m_instance_list.size(), m_device_id_list.size());
	printf("    device pool:\n");
	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i)
		printf("        device " PRIsize ": \'%s\'\n", i, m_device_id_list[i].c_str());

	std::map<_TSHA1, std::vector<const TProgramInstanceEx*> > src_map;
	try {
		for(size_t i = 0, n = m_instance_list.size(); i < n; ++ i)
			src_map[m_instance_list[i].t_source_hash].push_back(&m_instance_list[i]);
	} catch(std::bad_alloc&) {
		fprintf(stderr, "error: caught std::bad_alloc CCLProgramStorage::Dump()\n");
		return;
	}
	// group the instances by their source code hash

	printf("    cache contents:\n");
	for(std::map<_TSHA1, std::vector<const TProgramInstanceEx*> >::const_iterator
	   p_map_it = src_map.begin(), p_end_it = src_map.end(); p_map_it != p_end_it; ++ p_map_it) {
		_TSHA1 t_src = (*p_map_it).first;
		const std::vector<const TProgramInstanceEx*> &r_insts = (*p_map_it).second;
		printf("        source code %s (" PRIsize " binaries):\n", p_s_SHA1(t_src), r_insts.size());
		for(size_t i = 0, n = r_insts.size(); i < n; ++ i) {
			const TProgramInstanceEx &r_t_inst = *r_insts[i];
			// i indexes the group, not m_instance_list; the instance
			// must be taken from the group

			printf("            bin(rank %2" _PRIsize ", dev(",
				size_t(&r_t_inst - &m_instance_list.front()));
			for(size_t j = 0, m = r_t_inst.device_index_list.size(); j < m; ++ j) {
				printf((j)? ", \'%s\'" : "\'%s\'",
					m_device_id_list[r_t_inst.device_index_list[j]].c_str());
				// comma-separated, no comma before the first one
			}
			printf("), build opts %s)\n", p_s_SHA1(r_t_inst.t_build_opts_hash));
		}
	}
}

bool CCLProgramStorage::Load(const char *p_s_filename)
{
	/*fprintf(stderr, "loading binaries ...\n");*/ // debug

	std::for_each(m_instance_list.begin(), m_instance_list.end(), DeleteInstance);
	m_instance_list.clear();
	m_device_id_list.clear();
	// clear first

	{
		FILE *p_fr;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
		if(fopen_s(&p_fr, p_s_filename, "rb"))
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		if(!(p_fr = fopen(p_s_filename, "rb")))
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
			return false;
		CFILE_PtrGuard guard(p_fr); // closes the file when leaving the scope
		// open file

		uint32_t n_instance_num, n_device_num;
		if(fread(&m_t_header, sizeof(TFileHeader), 1, p_fr) != 1 ||
		   fread(&n_device_num, sizeof(uint32_t), 1, p_fr) != 1 ||
		   fread(&n_instance_num, sizeof(uint32_t), 1, p_fr) != 1 ||
		   n_device_num > SIZE_MAX || n_instance_num > SIZE_MAX ||
		   m_t_header.p_magic[0] != 'L' || m_t_header.p_magic[1] != 'C' ||
		   m_t_header.p_magic[2] != 'L' || m_t_header.p_magic[3] != 's')
			return false;
		// read header

		try {
			_ASSERTE(n_device_num <= SIZE_MAX);
			m_device_id_list.resize(size_t(n_device_num));
			_ASSERTE(n_instance_num <= SIZE_MAX);
			m_instance_list.resize(size_t(n_instance_num));
		} catch(std::exception&) { // can be also length_error, not only bad_alloc
			return false;
		}
		// alloc devices and instances

		{
			for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
				std::string s_id;
				for(;;) {
					char c;
					if(fread(&c, sizeof(char), 1, p_fr) != 1) {
						m_device_id_list.clear();
						return false;
					}
					// read a single character

					if(!c)
						break;
					// detect terminating null

					try {
						s_id += c;
					} catch(std::bad_alloc&) {
						m_device_id_list.clear();
						return false;
					}
					// add character to the string
				}

				m_device_id_list[i].swap(s_id);
			}
		}
		// read device id's

		for(size_t i = 0, n = m_instance_list.size(); i < n; ++ i) {
			if(!m_instance_list[i].Read(p_fr/*, size_t(n_device_num)*/)) {
				std::for_each(m_instance_list.begin(), m_instance_list.end(), DeleteInstance);
				m_instance_list.clear();
				m_device_id_list.clear();
				// !!

				return false;
			}
		}
		// read instances
	}

	_TSHA1 t_ctrl_hash;
	if(!Calc_ControlHash(t_ctrl_hash) || t_ctrl_hash != m_t_header.t_control_hash) {
		/*fprintf(stderr, "hash check failed ...\n");*/ // debug

		std::for_each(m_instance_list.begin(), m_instance_list.end(), DeleteInstance);
		m_instance_list.clear();
		m_device_id_list.clear();
		// !!

		return false;
	}
	// check hash

	m_b_dirty = false;
	// actual data are now loaded

	//Dump(); // debug

	return true;
}

/**
 *	@brief calculates the hash of a null-terminated string
 *
 *	@param[in] p_s_str is the string to hash (the terminating null is not hashed);
 *		a null pointer is hashed as an empty string
 *
 *	@return Returns the hash of the string (without the byte counter that TSHA1 carries).
 */
CCLProgramStorage::_TSHA1 CCLProgramStorage::t_Hash_String(const char *p_s_str)
{
	if(!p_s_str)
		p_s_str = "";
	// callers may pass null (e.g. no build options); strlen() of null would crash,
	// treat it as an empty string, the same way CCLProgramCompiler::n_CompileProgram() does

	CStreamHash<TSHA1> hash;
	hash.Process_Data(p_s_str, strlen(p_s_str) * sizeof(char));
	// calculate

	TSHA1 t_hash = hash.t_Result();
	_TSHA1 t_result;
	memcpy(&t_result, &t_hash[0], sizeof(_TSHA1)); // TSHA1 contains byte counter which is not needed
	// convert result

	return t_result;
}

/**
 *	@brief calculates the control hash of the storage contents
 *
 *	The hash covers the device count, the instance count, the device ids (including
 *	their terminating nulls) and, for each hashed instance, its source code hash,
 *	device indices, build options hash, binary sizes (as 64-bit integers) and binary data.
 *
 *	@param[out] r_t_result is filled with the hash (only written on success)
 *	@param[in] n_instance_limit is the maximum number of instances to hash
 *		(the first min(n_instance_limit, number of instances) are hashed)
 *
 *	@return Returns true on success, false if an instance has a different number
 *		of device indices than binary sizes.
 */
bool CCLProgramStorage::Calc_ControlHash(_TSHA1 &r_t_result, size_t n_instance_limit) const
{
	CStreamHash<TSHA1> hash;
	//hash.Process_Data(&m_t_header.t_source_hash, sizeof(_TSHA1)); // moved with TProgramInstanceEx
	_ASSERTE(m_device_id_list.size() <= UINT32_MAX);
	uint32_t n_device_num = uint32_t(m_device_id_list.size());
	hash.Process_Data(&n_device_num, sizeof(uint32_t));
	size_t n_instance_num_wide = min(n_instance_limit, m_instance_list.size()); // a bug, found at 2016-06-23
	_ASSERTE(n_instance_num_wide <= UINT32_MAX);
	uint32_t n_instance_num = uint32_t(n_instance_num_wide);
	hash.Process_Data(&n_instance_num, sizeof(uint32_t));
	// hash header ...

	for(size_t i = 0, n = m_device_id_list.size(); i < n; ++ i) {
		_ASSERTE(m_device_id_list[i].length() < SIZE_MAX);
		hash.Process_Data(m_device_id_list[i].c_str(), (m_device_id_list[i].length() + 1) * sizeof(char));
	}
	// hash device id's (including the terminating null - to mimic layout in the file)

	for(size_t i = 0; i < n_instance_num_wide; ++ i) {
		const TProgramInstanceEx &r_t_inst = m_instance_list[i];
		// get program instance ...

		{
			hash.Process_Data(&r_t_inst.t_source_hash, sizeof(_TSHA1));
			// hash source code

			if(r_t_inst.device_index_list.size() != r_t_inst.binary_size_list.size())
				return false;
			if(!r_t_inst.device_index_list.empty()) {
				hash.Process_Data(&r_t_inst.device_index_list.front(),
					sizeof(uint32_t) * r_t_inst.device_index_list.size());
			}
			// hash device indices (front() must not be called on an empty list;
			// there would be zero bytes to hash in that case anyway)
		}
		// added by TProgramInstanceEx

		hash.Process_Data(&r_t_inst.t_build_opts_hash, sizeof(_TSHA1));
		/*if(r_t_inst.binary_size_list.size() != m_device_id_list.size()) // not true anymore
			return false;*/
		for(size_t j = 0, m = r_t_inst.binary_size_list.size(); j < m; ++ j) {
			_ASSERTE(SIZE_MAX <= UINT64_MAX);
			uint64_t n_size = r_t_inst.binary_size_list[j];
			hash.Process_Data(&n_size, sizeof(uint64_t));
		}
		// hash header

		_ASSERTE(r_t_inst.t_data_buffer.n_size <= SIZE_MAX);
		hash.Process_Data(r_t_inst.t_data_buffer.p_data, size_t(r_t_inst.t_data_buffer.n_size)); // note this might overflow
		// hash data
	}
	// hash instances

	TSHA1 t_hash = hash.t_Result();
	memcpy(&r_t_result, &t_hash[0], sizeof(_TSHA1)); // TSHA1 contains byte counter which is not needed
	// write result

	return true;
}

/**
 *	@brief calls Free() on a program instance; used as a function object
 *		with std::for_each() before the instance list is cleared
 *	@param[in,out] r_t_inst is the program instance to call Free() on
 */
inline void CCLProgramStorage::DeleteInstance(TProgramInstanceEx &r_t_inst)
{
	r_t_inst.Free();
}

/*
 *								=== ~CCLProgramStorage ===
 */

/*
 *								=== CCLProgramCompiler ===
 */

#define CL_COMPILER_UTIL_BOGUS_DATA_DETECTION

/**
 *	@brief prints a human-readable description of a compiler status word
 *
 *	Up to four lines are printed, one for each stage that is flagged in the status
 *	word: reading from the cache, build environment verification, compilation
 *	from source and writing to the cache. If several read failure flags are set,
 *	only the first one (in the order tested below) is reported.
 *
 *	@param[in] n_status_word is the status word (a combination of the cache_* and prog_* flags)
 *	@param[in] p_fw is the output stream
 */
void CCLProgramCompiler::Dump_StatusWord(int n_status_word, FILE *p_fw)
{
	if(n_status_word & cache_ReadAttempted) {
		const char *p_s_read_message;
		if(n_status_word & cache_ReadSucceeded)
			p_s_read_message = "program loaded from cache\n";
		else if(n_status_word & cache_ReadFailed_FileNotFound)
			p_s_read_message = "program not cached\n";
		else if(n_status_word & cache_ReadFailed_IO)
			p_s_read_message = "i/o error while loading program from cache\n";
		else if(n_status_word & cache_ReadFailed_OutOfMemory)
			p_s_read_message = "not enough memory while loading program from cache\n";
		else if(n_status_word & cache_ReadFailed_BinaryNotFound)
			p_s_read_message = "program cache miss\n";
		else if(n_status_word & cache_ReadFailed_BinaryChecksum)
			p_s_read_message = "program cache corrupt\n";
		else if(n_status_word & cache_ReadFailed_CreateProgram)
			p_s_read_message = "program cached for different device\n";
		else
			p_s_read_message = "unknown error while loading program from cache\n";
		fputs(p_s_read_message, p_fw);
	}
	// reading from the cache

	if(n_status_word & cache_VerifyInstrumentationAdded) {
		const char *p_s_verify_message;
		if(!(n_status_word & cache_VerifyAttempted))
			p_s_verify_message = "build environment verify instrumentation added\n";
		else if(n_status_word & cache_CouldNotVerify)
			p_s_verify_message = "build config environment instrumentation failed to execute\n";
		else if(n_status_word & cache_VerifyPassed)
			p_s_verify_message = "cached binary build environment verification passed\n";
		else
			p_s_verify_message = "cached binary build environment verification failed\n";
		fputs(p_s_verify_message, p_fw);
	}
	// build environment verification

	if(n_status_word & prog_CompiledFromSource) {
		const char *p_s_build_message;
		if(!(n_status_word & prog_CreateSucceeded))
			p_s_build_message = "clCreateProgramWithSource() failed while compiling program from source\n";
		else if(!(n_status_word & prog_BuildSucceeded))
			p_s_build_message = "clBuildProgram() failed while compiling program from source\n";
		else
			p_s_build_message = "program was successfully compiled from source\n";
		fputs(p_s_build_message, p_fw);
	}
	// compilation from source

	if(n_status_word & cache_WriteAttempted) {
		const char *p_s_write_message;
		if(!(n_status_word & cache_WriteSucceeded))
			p_s_write_message = "failed to cache program binaries\n";
		else if(n_status_word & cache_WriteTrimmed)
			p_s_write_message = "program binaries were successfully written to cache, binary versions dropped\n";
		else
			p_s_write_message = "program binaries were successfully written to cache\n";
		fputs(p_s_write_message, p_fw);
	}
	// writing to the cache
}

/**
 *	@brief gets an OpenCL program, either from the binaries cache or by compiling the source code
 *
 *	If a cache file is specified, the function first attempts to load matching program
 *	binaries from it. If that does not succeed, the program is created and built from
 *	the source code and (again if a cache file is specified) the binaries are stored
 *	in the cache file. A failure to write the cache is not reported as an error,
 *	it is only reflected in the status word.
 *
 *	@param[in] h_context is the OpenCL context
 *	@param[out] p_program is filled with the program handle
 *	@param[in] p_s_source is the program source code
 *	@param[in] n_device_num is the number of devices in p_device_list
 *		(if zero, all the devices of the context are used)
 *	@param[in] p_device_list is the list of devices to build for (ignored if n_device_num is zero)
 *	@param[in] p_s_build_options is the build options string (can be null, meaning no options)
 *	@param[in] p_s_cache_file is the cache file name (can be null to disable caching);
 *		a "%temp_folder%" prefix is replaced with the temp directory, a "%temp_default%"
 *		prefix makes a name in the temp directory from the executable name, the rest
 *		of the string and the ".clbin" extension
 *	@param[out] p_status_word is filled with a combination of the cache_* and prog_* flags,
 *		describing what happened (can be null; see Dump_StatusWord())
 *	@param[in] n_max_program_instance_num is the maximum number of program instances
 *		to keep in the cache file
 *
 *	@return Returns cl_Success (CL_SUCCESS) on success, or an OpenCL error code on failure.
 */
CLresult CCLProgramCompiler::n_CompileProgram(cl_context h_context, cl_program *p_program,
	const char *p_s_source, size_t n_device_num, const cl_device_id *p_device_list,
	const char *p_s_build_options, const char *p_s_cache_file, int *p_status_word,
	size_t n_max_program_instance_num)
{
	int n_dummy_status_word;
	if(!p_status_word)
		p_status_word = &n_dummy_status_word;
	// makes sure p_status_word is not null

	if(!p_s_build_options)
		p_s_build_options = "";
	// makes sure p_s_build_options is not null

	*p_status_word = 0;

	std::vector<cl_device_id> all_dev_list;
	{
		CLresult n_result;
		if((n_result = CCLUtils::n_GetDeviceList(h_context, all_dev_list)) != CL_SUCCESS)
			return n_result;
	}
	if(!n_device_num) {
		n_device_num = all_dev_list.size();
		p_device_list = (n_device_num)? &all_dev_list.front() : 0;
	} else
		_ASSERTE(p_device_list);
	// in case the devices are not specified, get the devices for this context;
	// otherwise will be unable to save program binaries

	const char *p_s_source_to_compile = p_s_source;
	const char *p_s_source_to_query = p_s_source;
	const char *p_s_original_source = p_s_source;
#ifdef CL_PROGRAM_COMPILER_VERIFY_BINARY
	TSHA1 t_build_env_hash;
	std::string s_marked_source;
	std::string s_augmented_source; // must stay in scope for being saved
	{
		if(!stl_ut::AssignCStr(s_marked_source, "/* UberLame build environment verify marker 1.0 */") ||
		   !stl_ut::AppendCStr(s_marked_source, p_s_original_source) ||
		   !CCLProgramStorage::Get_ProgramBuildEnvHash(t_build_env_hash, p_s_original_source,
		   n_device_num, p_device_list, p_s_build_options) ||
		   !CCLProgramStorage::Get_ProgramBuildEnv_DebugKernel(s_augmented_source, t_build_env_hash) ||
		   !stl_ut::AppendCStr(s_augmented_source, p_s_original_source))
			return cl_Out_Of_Host_Memory; // or something
		p_s_source_to_compile = s_augmented_source.c_str();
		p_s_source_to_query = s_marked_source.c_str();
		// hash the build environment and store the hash in a kernel

		// there are two versions of kernel, one that serves as the cache key
		// and another one that is actually compiled

		// the cache key does not contain the actual hash as it would make all the source codes
		// look different, that's why there is a query version of the source (p_s_source_to_query)
		// and compiler version of the source (p_s_source_to_compile)

		// the same kernel needs to be passed to both CCLProgramStorage::Put_ProgramBinaries()
		// and CCLProgramStorage::n_Find_ProgramBinaries(), otherwise a) the program does not
		// contain the verify kernel, gets recompiled and is then ignored by CCLProgramStorage::Put_ProgramBinaries()
		// or b) the program does contain the verify kernel and then is not found by
		// CCLProgramStorage::n_Find_ProgramBinaries()
	}
	*p_status_word |= cache_VerifyInstrumentationAdded;
#endif // CL_PROGRAM_COMPILER_VERIFY_BINARY
	p_s_source = 0; // do not use this variable

	std::string s_real_cache_file;
	if(p_s_cache_file && *p_s_cache_file == '%') {
		if(strstr(p_s_cache_file, "%temp_folder%") == p_s_cache_file) {
			p_s_cache_file += strlen("%temp_folder%");
			// skip

			_ASSERTE(strlen(p_s_cache_file) > 1 && (*p_s_cache_file == '/' ||
				*p_s_cache_file == CPath::path_Separator));
			// make sure that a slash and the actual name follows

			if(!CPath::Get_TempDirectory(s_real_cache_file) || // in Dir.h so far, don't have to include Dir.cpp to older builds
			   !stl_ut::AppendCStr(s_real_cache_file, p_s_cache_file))
				return cl_Out_Of_Host_Memory;
			p_s_cache_file = s_real_cache_file.c_str();
			// get temp, append the rest of the name
		} else if(strstr(p_s_cache_file, "%temp_default%") == p_s_cache_file) {
			p_s_cache_file += strlen("%temp_default%");
			// skip

			const char *p_s_filename;
#if defined(_WIN32) || defined(_WIN64)
			char p_s_filename_buffer[1024];
			if(!GetModuleFileNameA(GetModuleHandle(NULL), p_s_filename_buffer,
			   sizeof(p_s_filename_buffer) / sizeof(p_s_filename_buffer[0]) - 1))
				p_s_filename = 0; // error
			else
				p_s_filename = p_s_filename_buffer;
			// get name of the .exe
#else // _WIN32 || _WIN64
			p_s_filename = program_invocation_name;
#endif // _WIN32 || _WIN64
			if(p_s_filename && strchr(p_s_filename, CPath::path_Separator))
				p_s_filename = strrchr(p_s_filename, CPath::path_Separator) + 1;
			else if(!p_s_filename)
				p_s_filename = "null";
			// get filename

			char p_s_sep[2] = {CPath::path_Separator, 0}; // CPath::Get_TempDirectory() doesn't append slash!
			if(!CPath::Get_TempDirectory(s_real_cache_file) || // in Dir.h so far, don't have to include Dir.cpp to older builds
			   !stl_ut::AppendCStr(s_real_cache_file, p_s_sep) ||
			   !stl_ut::AppendCStr(s_real_cache_file, p_s_filename) ||
			   !stl_ut::AppendCStr(s_real_cache_file, "_") ||
			   !stl_ut::AppendCStr(s_real_cache_file, p_s_cache_file) ||
			   !stl_ut::AppendCStr(s_real_cache_file, ".clbin"))
				return cl_Out_Of_Host_Memory;
			p_s_cache_file = s_real_cache_file.c_str();
			// concat
		}

		//printf("debug: OpenCL compiler temp filename is \'%s\'\n", p_s_cache_file); // debug
	}
	// handle cache file in temp (so that the app doesn't need to have write privileges)

#ifdef CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	TSHA1 t_hash = t_Hash_ProgramSource_BuildOptions(p_s_source_to_query, p_s_build_options);
	// calculate program and build options hash
#else // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	CCLProgramStorage program_storage;
	// prepare program storage
#endif // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT

	if(p_s_cache_file) {
		*p_status_word |= cache_ReadAttempted;

#ifdef CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
		int n_result;
		if((n_result = n_ReadProgramBinaries(p_s_cache_file, t_hash, h_context,
		   p_program, n_device_num, p_device_list)) == cache_ReadSucceeded) {
			*p_status_word |= cache_ReadSucceeded;
			return CL_SUCCESS;
		}
		// use the old function

		*p_status_word |= n_result;
#else // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
		if(program_storage.Load(p_s_cache_file)) {
			size_t n_binaries_index;
			if((n_binaries_index = program_storage.n_Find_ProgramBinaries(p_s_source_to_query,
			   n_device_num, p_device_list, p_s_build_options)) != CCLProgramStorage::npos) {
				int n_result; CLresult n_cl_result;
				if((n_result = program_storage.n_Get_ProgramBinaries(h_context, p_program, n_cl_result, n_binaries_index,
				   n_device_num, p_device_list)) == CCLProgramStorage::build_Success) {
					do {
#ifdef CL_PROGRAM_COMPILER_VERIFY_BINARY
						*p_status_word |= cache_VerifyAttempted;

						bool b_build_env_verify;
						if(/*!CCLProgramStorage::Get_ProgramBuildEnvHash(t_build_env_hash, p_s_original_source,
						   n_device_num, p_device_list, p_s_build_options) ||*/ // already have this
						   !CCLProgramStorage::Verify_ProgramBuildEnv(b_build_env_verify,
						   h_context, n_device_num, p_device_list, *p_program, t_build_env_hash)) {
							*p_status_word |= cache_CouldNotVerify;
							// NOTE(review): the program created from the cached binaries is still
							// in *p_program here and gets overwritten after the rebuild below; it
							// looks like it should be released first - confirm that
							// Verify_ProgramBuildEnv() does not release it on failure
							break; // could not verify, maybe was not built with that. just rebuild
						}

						if(!b_build_env_verify) {
							fprintf(stderr, "fatal error: program binary retreived by program cache \n"
								"does not match its supposed build environment\n"); // internal error, needs to be fixed
							// NOTE(review): *p_program is left holding the cached program
							// on this error path - confirm who is supposed to release it
							return cl_Invalid_Program_Executable; // or something
						}

						*p_status_word |= cache_VerifyPassed;
#endif // CL_PROGRAM_COMPILER_VERIFY_BINARY

						*p_status_word |= cache_ReadSucceeded;

						if(program_storage.b_Dirty()) {
							*p_status_word |= cache_WriteAttempted;

#ifdef CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
							std::string s_temp_name;
							char p_s_time[5];
							sprintf(p_s_time, "%04x", (unsigned int)(clock() & 0xffff)); // "random" identifier (clock_t is not int, %x needs unsigned int)
							try {
								s_temp_name = p_s_cache_file;
								s_temp_name += ".tmp";
								s_temp_name += p_s_time;
							} catch(std::bad_alloc&) {
								return CL_SUCCESS; // load succeeded, just the save failed. ignore it.
							}
							// create temp filename

							if(program_storage.Save(s_temp_name.c_str(), n_max_program_instance_num)) {
								remove(p_s_cache_file);
								rename(s_temp_name.c_str(), p_s_cache_file);

								*p_status_word |= cache_WriteSucceeded;
							}
							// save with new filename, if succeeds, rewrite the old file; this is important because of the lru policy
#else // CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
							if(program_storage.Save(p_s_cache_file, n_max_program_instance_num)) {
								*p_status_word |= cache_WriteSucceeded;
								if(program_storage.b_Dirty())
									*p_status_word |= cache_WriteTrimmed; // it's dirty - file doesn't contain all the versions - n_max_program_instance_num limit
							} else
								remove(p_s_cache_file); // don't leave garbage
						// save with old filename, remove file on failure (no fallback)
#endif // CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
						}

						return cl_Success;
					} while(0);
				}
				switch(n_result) {
				case CCLProgramStorage::build_InvalidParams:
					*p_status_word |= cache_ReadFailed_OutOfMemory; // not actually the right error code (but it shouldn't happen in here, it's more of an internal program error)
					break;
				case CCLProgramStorage::build_LowMemory:
					*p_status_word |= cache_ReadFailed_OutOfMemory;
					break;
				case CCLProgramStorage::build_CreateProgramFailure:
					*p_status_word |= cache_ReadFailed_CreateProgram;
					break;
				case CCLProgramStorage::build_BuildProgramFailure:
					*p_status_word |= cache_ReadFailed_CreateProgram; // not actually the right error code
					break;
				case CCLProgramStorage::build_Success:
					_ASSERTE((*p_status_word) & cache_CouldNotVerify);
					break; // do nothing, we just got cache_CouldNotVerify
				}
			} else
				*p_status_word |= cache_ReadFailed_BinaryNotFound;
		} else {
			if(access(p_s_cache_file, 00) != 0)
				*p_status_word |= cache_ReadFailed_FileNotFound;
			else
				*p_status_word |= cache_ReadFailed_IO;
		}
		// use the new program storage
#endif // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	}
	// attempt to load program binaries from file

	cl_program h_prog;
	{
		*p_status_word |= prog_CompiledFromSource;

		cl_int n_result;
		h_prog = clCreateProgramWithSource(h_context, 1, &p_s_source_to_compile, NULL, &n_result);
		if(n_result != CL_SUCCESS)
			return (CLresult)n_result;

		*p_status_word |= prog_CreateSucceeded;

		_ASSERTE(n_device_num <= /*CL*/UINT_MAX);
		if((n_result = clBuildProgram(h_prog, cl_uint(n_device_num), p_device_list, p_s_build_options, NULL, NULL)) != CL_SUCCESS) {
#ifdef CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS
			if(n_device_num) {
				std::string s_build_log, s_device_name;
				for(size_t i = 0; i < n_device_num; ++ i) {
					cl_build_status n_build_status;

					CLresult n_cl_result;
					if((n_cl_result = n_Get_BuildLog(s_build_log, n_build_status, h_prog, p_device_list[i])) != CL_SUCCESS) {
						clReleaseProgram(h_prog); // the handle is not passed to the caller, don't leak it
						return n_cl_result;
					}
					// get build log and build status

					if(CCLDeviceParams::n_GetDeviceInfoString(s_device_name, p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS)
						s_device_name.erase();
					const char *p_s_device_name = (s_device_name.empty())? "(null)" : s_device_name.c_str();
					// get device name

					if(n_build_status == CL_BUILD_NONE) {
						fprintf(stderr, "error: program wasn't built for device " PRIsize " (%s)\n", i, p_s_device_name);
						continue;
					} else if(n_build_status == CL_BUILD_ERROR) {
						fprintf(stderr, "error: there were errors while building program for device " PRIsize " (%s)\n",
							i, p_s_device_name);
					} else if(n_build_status != CL_BUILD_SUCCESS)
						fprintf(stderr, "error: unknown program build status for device " PRIsize " (%s)\n", i, p_s_device_name);
					// show build status

					stl_ut::TrimSpace(s_build_log);
					if(n_build_status != CL_BUILD_SUCCESS || !s_build_log.empty()) {
						fprintf(stderr, "=== OpenCL build log for device " PRIsize " (%s) ===\n%s\n",
							i, p_s_device_name, s_build_log.c_str());
					}
					// show build log
				}
			}
			// show build error(s)
#endif // CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS

			clReleaseProgram(h_prog);
			return (CLresult)n_result;
		}
		// build program

#ifdef CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
		if(n_device_num) {
			std::string s_build_log, s_device_name;
			for(size_t i = 0; i < n_device_num; ++ i) {
				cl_build_status n_build_status;

				CLresult n_cl_result;
				if((n_cl_result = n_Get_BuildLog(s_build_log, n_build_status, h_prog, p_device_list[i])) != CL_SUCCESS) {
					clReleaseProgram(h_prog); // the handle is not passed to the caller, don't leak it
					return n_cl_result;
				}
				// get build log and build status

				if(CCLDeviceParams::n_GetDeviceInfoString(s_device_name, p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS)
					s_device_name.erase();
				const char *p_s_device_name = (s_device_name.empty())? "(null)" : s_device_name.c_str();
				// get device name

				// note that i is size_t, it must be printed using PRIsize (as above), not using %d

				if(n_build_status == CL_BUILD_NONE) {
					fprintf(stderr, "warning: program wasn't built for device " PRIsize " (%s)\n", i, p_s_device_name);
					continue;
				} else if(n_build_status == CL_BUILD_ERROR) {
					fprintf(stderr, "warning: there were errors while building program for device " PRIsize " (%s)\n",
						i, p_s_device_name);
				} else if(n_build_status != CL_BUILD_SUCCESS)
					fprintf(stderr, "warning: unknown program build status for device " PRIsize " (%s)\n", i, p_s_device_name);
				// show build status

				stl_ut::TrimSpace(s_build_log);
				if(n_build_status != CL_BUILD_SUCCESS || !s_build_log.empty()) {
					fprintf(stderr, "=== OpenCL build log for device " PRIsize " (%s) ===\n%s\n",
						i, p_s_device_name, s_build_log.c_str());
				}
				// show build log
			}
		}
		// show warnings
#endif // CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS

		*p_status_word |= prog_BuildSucceeded;
	}
	// create program

	*p_program = h_prog;
	// output

	if(p_s_cache_file) {
		*p_status_word |= cache_WriteAttempted;

#ifdef CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
		if(WriteProgramBinaries(h_prog, t_hash, p_s_cache_file))
			*p_status_word |= cache_WriteSucceeded;
		else {
			remove(p_s_cache_file);
			// in case it didn't save correctly, do not leave it around
		}
#else // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
		if(program_storage.Put_ProgramBinaries(h_prog, p_s_source_to_query, all_dev_list.size(),
		   (all_dev_list.empty())? 0 : &all_dev_list.front(), p_s_build_options)) {
			_ASSERTE(program_storage.b_Dirty());
			// should be "dirty" now

#ifdef CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
			std::string s_temp_name;
			char p_s_time[5];
			sprintf(p_s_time, "%04x", (unsigned int)(clock() & 0xffff)); // "random" identifier (clock_t is not int, %x needs unsigned int)
			try {
				s_temp_name = p_s_cache_file;
				s_temp_name += ".tmp";
				s_temp_name += p_s_time;
			} catch(std::bad_alloc&) {
				return CL_SUCCESS; // build succeeded, just the save failed. ignore it.
			}
			// create temp filename

			if(program_storage.Save(s_temp_name.c_str(), n_max_program_instance_num)) {
				remove(p_s_cache_file);
				rename(s_temp_name.c_str(), p_s_cache_file);

				*p_status_word |= cache_WriteSucceeded;
			}
			// save with new filename, if succeeds, rewrite the old file; this is important because of the lru policy
#else // CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
			if(program_storage.Save(p_s_cache_file, n_max_program_instance_num)) {
				*p_status_word |= cache_WriteSucceeded;
				if(program_storage.b_Dirty())
					*p_status_word |= cache_WriteTrimmed; // it's dirty - file doesn't contain all the versions - n_max_program_instance_num limit
			} else
				remove(p_s_cache_file); // don't leave garbage
			// save with old filename, remove file on failure (no fallback)
#endif // CL_PROGRAM_COMPILER_PRECAUTIOUS_BINARIES_SAVE
		}
#endif // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT
	}
	// write file with binaries

	return cl_Success;
}

/**
 *	@brief waits for a program build to finish on a device and retrieves its build log
 *
 *	@param[out] r_s_build_log is filled with the build log text (cleared first;
 *		stays empty if the build status is CL_BUILD_NONE)
 *	@param[out] r_n_build_status is filled with the final build status
 *	@param[in] h_program is the program being queried
 *	@param[in] h_device is the device the program was built for
 *
 *	@return Returns cl_Success on success, the OpenCL error code of the failing
 *		query, or cl_Out_Of_Host_Memory if the log buffer could not be allocated.
 */
CLresult CCLProgramCompiler::n_Get_BuildLog(std::string &r_s_build_log, cl_build_status &r_n_build_status,
	cl_program h_program, cl_device_id h_device)
{
	r_n_build_status = CL_BUILD_NONE;
	r_s_build_log.erase();
	// reset both outputs

	for(;;) {
		const CLresult n_status_query = (CLresult)clGetProgramBuildInfo(h_program, h_device,
			CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &r_n_build_status, NULL);
		if(n_status_query != CL_SUCCESS)
			return n_status_query;
		// ask for the current status

		if(r_n_build_status == CL_BUILD_IN_PROGRESS) {
#if defined(_WIN32) || defined(_WIN64)
			Sleep(100);
#else // _WIN32 || _WIN64
			sleep(1);
#endif // _WIN32 || _WIN64
			continue;
		}
		// still building; wait a moment and poll again

		break;
	}
	// poll until the build is no longer in progress

	if(r_n_build_status == CL_BUILD_NONE)
		return cl_Success;
	// no build was performed, there is no log to fetch

	size_t n_log_bytes;
	const CLresult n_size_query = (CLresult)clGetProgramBuildInfo(h_program, h_device,
		CL_PROGRAM_BUILD_LOG, 0, NULL, &n_log_bytes);
	if(n_size_query != CL_SUCCESS)
		return n_size_query;
	// find out how long the log is

	if(!stl_ut::Resize_To_N(r_s_build_log, n_log_bytes + 1)) {
		fprintf(stderr, "error: not enough memory for build log\n");
		return cl_Out_Of_Host_Memory;
	}
	// make room for the log (one extra character so the text is always terminated)

	const CLresult n_log_query = (CLresult)clGetProgramBuildInfo(h_program, h_device,
		CL_PROGRAM_BUILD_LOG, n_log_bytes, &r_s_build_log[0], NULL);
	if(n_log_query != CL_SUCCESS)
		return n_log_query;
	// read the log text

	const size_t n_text_length = strlen(r_s_build_log.c_str());
	r_s_build_log.resize(n_text_length);
	// cut the string at the first null character

	return cl_Success;
}

/**
 *	@brief reads program source code from a file and compiles it using n_CompileProgram()
 *
 *	@param[in] h_context is OpenCL context
 *	@param[out] p_program is pointer to the resulting program handle
 *	@param[in] p_s_source_file is name of the file containing the source code
 *	@param[in] n_device_num is number of devices in p_device_list
 *	@param[in] p_device_list is list of devices to build the program for
 *	@param[in] p_s_build_options is string with build options
 *	@param[in] p_s_cache_file is name of the cache file (passed on to n_CompileProgram())
 *	@param[out] p_status_word is pointer to the status word (can be null; it is
 *		cleared before the file is read so that i/o failures leave it zero)
 *	@param[in] n_max_program_instance_num is passed on to n_CompileProgram()
 *
 *	@return Returns the result of n_CompileProgram(), cl_Build_Program_Failure if the
 *		file could not be opened or read, or cl_Out_Of_Host_Memory if the source
 *		buffer could not be allocated.
 */
CLresult CCLProgramCompiler::n_CompileProgramFile(cl_context h_context, cl_program *p_program,
	const char *p_s_source_file, size_t n_device_num, const cl_device_id *p_device_list,
	const char *p_s_build_options, const char *p_s_cache_file, int *p_status_word,
	size_t n_max_program_instance_num)
{
	int n_dummy_status_word;
	if(!p_status_word)
		p_status_word = &n_dummy_status_word;
	// makes sure p_status_word is not null

	*p_status_word = 0;
	// in case i/o errors occur below

	std::string s_program;
	{
		FILE *p_fr;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
		if(fopen_s(&p_fr, p_s_source_file, "rb"))
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		if(!(p_fr = fopen(p_s_source_file, "rb")))
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
			return cl_Build_Program_Failure; // not really great, but there's no better error code
		CFILE_PtrGuard guard(p_fr); // closes the file when leaving the scope

		if(fseek(p_fr, 0, SEEK_END))
			return cl_Build_Program_Failure;
		const long n_file_pos = ftell(p_fr);
		if(n_file_pos < 0)
			return cl_Build_Program_Failure; // ftell() failed; do not mistake -1 for a huge size
		const size_t n_file_size = size_t(n_file_pos);
		// get file size, checking for i/o errors

		try {
			s_program.resize(n_file_size);
		} catch(std::exception&) { // can be also length_error, not only bad_alloc
			return cl_Out_Of_Host_Memory;
		}
		// allocate the buffer

		if(fseek(p_fr, 0, SEEK_SET))
			return cl_Build_Program_Failure;
		if(n_file_size && fread(&s_program[0], 1, n_file_size, p_fr) != n_file_size)
			return cl_Build_Program_Failure; // not really great, but there's no better error code
		// read the data (an empty file is not read, the buffer has no elements to point to)
	}
	// read program from a file

	return n_CompileProgram(h_context, p_program, s_program.c_str(), n_device_num,
		p_device_list, p_s_build_options, p_s_cache_file, p_status_word, n_max_program_instance_num);
	// use the other function
}

#ifdef CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT

/**
 *	@brief calculates SHA1 of program source code, followed by the build options
 *
 *	@param[in] p_s_source is null-terminated string with the program source code
 *	@param[in] p_s_build_options is null-terminated string with build options
 *		(can be null or empty, in which case only the source code is hashed)
 *
 *	@return Returns the resulting hash.
 */
TSHA1 CCLProgramCompiler::t_Hash_ProgramSource_BuildOptions(const char *p_s_source, const char *p_s_build_options)
{
	CStreamHash<TSHA1> hasher;

	const size_t n_source_bytes = strlen(p_s_source) * sizeof(char);
	hasher.Process_Data(p_s_source, n_source_bytes);
	// the source code always goes in

	const bool b_have_options = p_s_build_options != 0 && *p_s_build_options != 0;
	if(b_have_options) {
		const size_t n_options_bytes = strlen(p_s_build_options) * sizeof(char);
		hasher.Process_Data(p_s_build_options, n_options_bytes);
	}
	// the options only if there are any

	return hasher.t_Result();
}

/**
 *	@brief loads cached program binaries from a file and builds a program from them
 *
 *	The file is read as: a SHA1 hash (5 x uint32) identifying the program, number
 *	of binaries (uint32), size of each binary (uint32 each), the binaries data
 *	and finally SHA1 of the binaries data (5 x uint32), all in native byte order.
 *
 *	@param[in] p_s_filename is name of the file with the binaries
 *	@param[in] t_hash is the expected program hash (compared to the one stored
 *		at the beginning of the file)
 *	@param[in] h_context is OpenCL context
 *	@param[out] p_program is pointer to the program handle (written only on success)
 *	@param[in] n_device_num is number of devices in p_device_list
 *	@param[in] p_device_list is list of devices to create the program for
 *
 *	@return Returns cache_ReadSucceeded on success, or one of the cache_ReadFailed_*
 *		values on failure.
 */
int CCLProgramCompiler::n_ReadProgramBinaries(const char *p_s_filename, TSHA1 t_hash, cl_context h_context,
	cl_program *p_program, size_t n_device_num, const cl_device_id *p_device_list)
{
	FILE *p_fr;
	if(!(p_fr = fopen(p_s_filename, "rb")))
		return cache_ReadFailed_FileNotFound;
	// open the file

	TSHA1 t_ref_hash;
	if(fread(&t_ref_hash[0], sizeof(uint32_t), 5, p_fr) != 5) {
		fclose(p_fr);
		return cache_ReadFailed_IO;
	}
	// read reference hash

	if(memcmp(&t_hash[0], &t_ref_hash[0], 5 * sizeof(uint32_t))) {
		fclose(p_fr);
		return cache_ReadFailed_BinaryNotFound;
	}
	// compare it to current hash

	size_t n_binary_num;
	std::vector<size_t> binary_size_list;
	{
		uint32_t n_binary_num32;
		if(fread(&n_binary_num32, sizeof(uint32_t), 1, p_fr) != 1 || n_binary_num32 > SIZE_MAX) {
			fclose(p_fr);
			return cache_ReadFailed_IO;
		}
		n_binary_num = size_t(n_binary_num32);

		if(!n_binary_num || n_binary_num < n_device_num) {
			fclose(p_fr);
			return cache_ReadFailed_BinaryNotFound;
		}
		// the lists below are handed to OpenCL as arrays of n_device_num elements,
		// make sure there is a binary for each of the devices (and at least one at all)

		try {
			binary_size_list.resize(n_binary_num);
		} catch(std::exception&) { // can be also length_error, not only bad_alloc
			fclose(p_fr);
			return cache_ReadFailed_OutOfMemory;
		}
		for(size_t i = 0; i < n_binary_num; ++ i) {
			uint32_t n_binary_size32;
			if(fread(&n_binary_size32, sizeof(uint32_t), 1, p_fr) != 1 || n_binary_size32 > SIZE_MAX) {
				fclose(p_fr);
				return cache_ReadFailed_IO;
			}
			binary_size_list[i] = size_t(n_binary_size32);
		}
	}
	// read binary sizes

	size_t n_binaries_size = 0;
	for(size_t i = 0; i < n_binary_num; ++ i) {
		if(n_binaries_size > SIZE_MAX - binary_size_list[i]) {
			fclose(p_fr);
			return cache_ReadFailed_IO; // the sizes come from the file; a sum that overflows means bogus data
		}
		n_binaries_size += binary_size_list[i];
	}
	// sum binary sizes up

	TSHA1 t_binaries_hash;
	std::vector<const unsigned char*> binary_ptr_list;
	std::vector<unsigned char> binaries; // owns the data, released on every return path
	{
		try {
			binary_ptr_list.resize(n_binary_num);
			binaries.resize(n_binaries_size);
		} catch(std::exception&) { // can be also length_error, not only bad_alloc
			fclose(p_fr);
			return cache_ReadFailed_OutOfMemory;
		}
		// alloc buffers

		CStreamHash<TSHA1> hash;

		unsigned char *p_binary_ptr = (binaries.empty())? 0 : &binaries[0];
		for(size_t i = 0; i < n_binary_num; ++ i) {
			binary_ptr_list[i] = p_binary_ptr;

			size_t n_binary_size = binary_size_list[i];

			if(!n_binary_size || fread(p_binary_ptr, n_binary_size, 1, p_fr) != 1) {
				fclose(p_fr);
				return cache_ReadFailed_IO; // note that an empty binary counts as a failure as well
			}

			hash.Process_Data(p_binary_ptr, n_binary_size);

			p_binary_ptr += n_binary_size;
		}
		// get pointers, read binaries (so read block size is 32-bit), calculate their hashes

		t_binaries_hash = hash.t_Result();
	}
	// read binaries

	TSHA1 t_binaries_hash_ref;
	if(fread(&t_binaries_hash_ref[0], sizeof(uint32_t), 5, p_fr) != 5) {
		fclose(p_fr);
		return cache_ReadFailed_IO;
	}
	// read checksum

	fclose(p_fr);
	// close file

	if(memcmp(&t_binaries_hash[0], &t_binaries_hash_ref[0], 5 * sizeof(uint32_t)))
		return cache_ReadFailed_BinaryChecksum;
	// make sure checksum is correct (compare the calculated hash to the stored one)

#if defined CL_COMPILER_UTIL_BOGUS_DATA_DETECTION && defined(__DIR_INCLUDED)
	TFileInfo t_file(p_s_filename);
	if(t_file.n_Size64() != 2 * 5 * sizeof(uint32_t) + sizeof(uint32_t) + // size of two hashes, number of binaries
	   n_binary_num * sizeof(uint32_t) + n_binaries_size) // lengths of binaries, binaries data
		return cache_ReadFailed_BinaryChecksum; // ...
	// make sure file doesn't contain bogus data (paranoid)
#endif // CL_COMPILER_UTIL_BOGUS_DATA_DETECTION && __DIR_INCLUDED

	cl_int n_result;
	cl_program h_prog = clCreateProgramWithBinary(h_context, n_device_num, p_device_list,
		&binary_size_list.front(), &binary_ptr_list.front(), NULL, &n_result);
	if(n_result != CL_SUCCESS)
		return cache_ReadFailed_CreateProgram;
	if((n_result = clBuildProgram(h_prog, 0, NULL, NULL, NULL, NULL)) != CL_SUCCESS) {
		clReleaseProgram(h_prog);
		return cache_ReadFailed_CreateProgram;
	}
	// use loaded binaries to create the program ...

	*p_program = h_prog;
	// write output

	return cache_ReadSucceeded;
}

/**
 *	@brief writes binaries of a compiled program to a file
 *
 *	The file is written as: t_hash (5 x uint32), number of binaries (uint32),
 *	size of each binary (uint32 each), the binaries data and finally SHA1 of
 *	the binaries data (5 x uint32), all in native byte order.
 *
 *	All the data is collected before the file is opened, so the file is only
 *	touched once there is something to write.
 *
 *	@param[in] h_program is the program to get the binaries from
 *	@param[in] t_hash is hash to be stored at the beginning of the file
 *	@param[in] p_s_filename is name of the output file
 *
 *	@return Returns true on success, false on failure.
 *
 *	@note On failure, the output file may be left incomplete;
 *		removing it is up to the caller.
 */
bool CCLProgramCompiler::WriteProgramBinaries(cl_program h_program, TSHA1 t_hash, const char *p_s_filename)
{
	cl_uint n_device_num;
	if(clGetProgramInfo(h_program, CL_PROGRAM_NUM_DEVICES,
	   sizeof(n_device_num), &n_device_num, NULL) != CL_SUCCESS)
		return false;
	// get number of devices (the queried value is cl_uint, pass its size)

	if(!n_device_num)
		return false;
	// we can't query binaries without devices

	const size_t n_binary_num = n_device_num;

	std::vector<size_t> binary_size_list;
	try {
#ifdef _DEBUG
		_ASSERTE(n_binary_num < SIZE_MAX);
		binary_size_list.resize(n_binary_num + 1);
		binary_size_list[n_binary_num] = 0xbaadf00d; // "magic" word
#else // _DEBUG
		binary_size_list.resize(n_binary_num);
#endif // _DEBUG
	} catch(std::bad_alloc&) {
		return false;
	}
	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARY_SIZES,
	   n_binary_num * sizeof(size_t), &binary_size_list.front(), NULL) != CL_SUCCESS)
		return false;
#ifdef _DEBUG
	_ASSERTE(binary_size_list[n_binary_num] == 0xbaadf00d); // make sure it's returning the right amount of data
#endif // _DEBUG
	// get binary sizes

	size_t n_binaries_size = 0;
	for(size_t i = 0; i < n_binary_num; ++ i) {
		if(binary_size_list[i] > UINT32_MAX || // sizes are stored as 32-bit numbers
		   n_binaries_size > SIZE_MAX - binary_size_list[i])
			return false;
		n_binaries_size += binary_size_list[i];
	}
	if(!n_binaries_size)
		return false; // there are no binaries to be saved
	// sum binary sizes up, make sure they can be stored

	std::vector<unsigned char> binaries; // owns the data, released on every return path
	std::vector<unsigned char*> binary_ptr_list;
	try {
		binary_ptr_list.resize(n_binary_num);
		binaries.resize(n_binaries_size);
	} catch(std::exception&) { // can be also length_error, not only bad_alloc
		return false;
	}
	// alloc buffers

	{
		unsigned char *p_binary_ptr = &binaries[0];
		for(size_t i = 0; i < n_binary_num; ++ i) {
			binary_ptr_list[i] = p_binary_ptr;
			p_binary_ptr += binary_size_list[i];
		}
	}
	// get pointers (the binaries are stored one after another in a single buffer)

	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARIES,
	   n_binary_num * sizeof(unsigned char*), &binary_ptr_list.front(), NULL) != CL_SUCCESS)
		return false;
	// get binaries (the queried value is the array of pointers, pass its size, not the size of the data)

	TSHA1 t_binaries_hash;
	{
		CStreamHash<TSHA1> hash;
		hash.Process_Data(&binaries[0], n_binaries_size);
		t_binaries_hash = hash.t_Result();
	}
	// calculate hash of the binaries

	FILE *p_fw;
	if(!(p_fw = fopen(p_s_filename, "wb")))
		return false;
	// open file for writing ...

	if(fwrite(&t_hash[0], sizeof(uint32_t), 5, p_fw) != 5) {
		fclose(p_fw);
		return false;
	}
	// write hash

	uint32_t n_binary_num32 = uint32_t(n_binary_num);
	if(fwrite(&n_binary_num32, sizeof(uint32_t), 1, p_fw) != 1) {
		fclose(p_fw);
		return false;
	}
	// write number of binaries

	for(size_t i = 0; i < n_binary_num; ++ i) {
		uint32_t n_binary_size32 = uint32_t(binary_size_list[i]); // checked to fit above
		if(fwrite(&n_binary_size32, sizeof(uint32_t), 1, p_fw) != 1) {
			fclose(p_fw);
			return false;
		}
	}
	// write size of each binary

	if(fwrite(&binaries[0], n_binaries_size, 1, p_fw) != 1) {
		fclose(p_fw);
		return false;
	}
	// write binaries

	if(fwrite(&t_binaries_hash[0], sizeof(uint32_t), 5, p_fw) != 5) {
		fclose(p_fw);
		return false;
	}
	// write hash of the binaries

	if(fclose(p_fw))
		return false;
	// close output file (buffered data is written here, this can fail as well)

	return true;
}

#endif // CL_PROGRAM_COMPILER_USE_OLD_BINARIES_FORMAT

/*
 *								=== ~CCLProgramCompiler ===
 */

/*
 *								=== CCLUniqueProgram ===
 */

// definitions of the static tag constants of CCLUniqueProgram; the constructors
// below take one of these tag types as an (unused) parameter, which selects the overload
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER <= 1200
const CCLUniqueProgram::TBuildFromSource_Tag CCLUniqueProgram::from_source_code;
const CCLUniqueProgram::TBuildFromCompressedSource_Tag CCLUniqueProgram::from_compressed;
const CCLUniqueProgram::TBuildFromFile_Tag CCLUniqueProgram::from_file;
// MSVC 6.0 can't initialize empty structures, so the tags are left without an initializer there
#else // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200
const CCLUniqueProgram::TBuildFromSource_Tag CCLUniqueProgram::from_source_code = {};
const CCLUniqueProgram::TBuildFromCompressedSource_Tag CCLUniqueProgram::from_compressed = {};
const CCLUniqueProgram::TBuildFromFile_Tag CCLUniqueProgram::from_file = {}; // g++ requires initialization
#endif // _MSC_VER && !__MWERKS__ && _MSC_VER <= 1200
// constructor tags

/**
 *	@brief constructor; compiles a program from source code held in memory
 *
 *	No device list is given to n_Init() (zero devices, null pointer).
 *	The result of the compilation is stored in m_n_last_result.
 *
 *	@param[in] h_context is OpenCL context
 *	@param[in] p_s_source_code is null-terminated string with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(cl_context h_context, const char *p_s_source_code,
	TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = false; // the string is the source code itself
	const size_t n_no_devices = 0;
	const cl_device_id *p_no_device_list = 0;
	// no explicit device list

	m_n_last_result = n_Init(b_source_is_filename, h_context, p_s_source_code,
		n_no_devices, p_no_device_list, p_s_compiler_options,
		p_s_cache_file, n_max_cache_size);
	// compile, remember the result
}

/**
 *	@brief constructor; compiles a program from source code held in memory,
 *		using context and devices of an existing instance
 *
 *	The result of the compilation is stored in m_n_last_result.
 *
 *	@param[in] r_instance is the instance supplying the context and the device list
 *	@param[in] p_s_source_code is null-terminated string with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(CCLUniqueInstance &r_instance, const char *p_s_source_code,
	TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = false; // the string is the source code itself

	m_n_last_result = n_Init(b_source_is_filename, r_instance.h_Context(),
		p_s_source_code, r_instance.n_Device_Num(), r_instance.p_Device(),
		p_s_compiler_options, p_s_cache_file, n_max_cache_size);
	// compile for the devices of the instance, remember the result
}

/**
 *	@brief constructor; compiles a program from a source file,
 *		using context and devices of an existing instance
 *
 *	The result of the compilation is stored in m_n_last_result.
 *
 *	@param[in] r_instance is the instance supplying the context and the device list
 *	@param[in] p_s_filename is name of the file with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(CCLUniqueInstance &r_instance, const char *p_s_filename,
	TBuildFromFile_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = true; // the string names a file to be loaded

	m_n_last_result = n_Init(b_source_is_filename, r_instance.h_Context(),
		p_s_filename, r_instance.n_Device_Num(), r_instance.p_Device(),
		p_s_compiler_options, p_s_cache_file, n_max_cache_size);
	// compile for the devices of the instance, remember the result
}

/**
 *	@brief constructor; compiles a program from a source file
 *
 *	No device list is given to n_Init() (zero devices, null pointer), which is
 *	meant as "for all devices". The result of the compilation is stored
 *	in m_n_last_result.
 *
 *	@param[in] h_context is OpenCL context
 *	@param[in] p_s_filename is name of the file with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
	TBuildFromFile_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = true; // the string names a file to be loaded
	const size_t n_no_devices = 0;
	const cl_device_id *p_no_device_list = 0;
	// for all devices

	m_n_last_result = n_Init(b_source_is_filename, h_context, p_s_filename,
		n_no_devices, p_no_device_list, p_s_compiler_options,
		p_s_cache_file, n_max_cache_size);
	// compile, remember the result
}

/**
 *	@brief constructor; compiles a program from a source file for a single device
 *
 *	The result of the compilation is stored in m_n_last_result.
 *
 *	@param[in] h_context is OpenCL context
 *	@param[in] p_s_filename is name of the file with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] h_device is the device to compile the program for
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
	TBuildFromFile_Tag UNUSED(t_tag), cl_device_id h_device,
	const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = true; // the string names a file to be loaded
	const size_t n_single_device = 1;
	const cl_device_id *p_single_device = &h_device;
	// a list of one device

	m_n_last_result = n_Init(b_source_is_filename, h_context, p_s_filename,
		n_single_device, p_single_device, p_s_compiler_options,
		p_s_cache_file, n_max_cache_size);
	// compile, remember the result
}

/**
 *	@brief constructor; compiles a program from a source file for a list of devices
 *
 *	The result of the compilation is stored in m_n_last_result.
 *
 *	@param[in] h_context is OpenCL context
 *	@param[in] p_s_filename is name of the file with the program source code
 *	@param[in] t_tag is tag selecting this overload (unused)
 *	@param[in] n_device_num is number of devices in p_device
 *	@param[in] p_device is list of devices to compile the program for
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file
 *	@param[in] n_max_cache_size is passed to n_Init() as the cache size limit
 */
CCLUniqueProgram::CCLUniqueProgram(cl_context h_context, const char *p_s_filename,
	TBuildFromFile_Tag UNUSED(t_tag), size_t n_device_num,
	const cl_device_id *p_device, const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_compile_flags(0)
{
	const bool b_source_is_filename = true; // the string names a file to be loaded

	m_n_last_result = n_Init(b_source_is_filename, h_context, p_s_filename,
		n_device_num, p_device, p_s_compiler_options,
		p_s_cache_file, n_max_cache_size);
	// compile for the given devices, remember the result
}

/**
 *	@brief destructor; releases the program, if there is one
 */
CCLUniqueProgram::~CCLUniqueProgram()
{
	if(m_h_program != 0) {
		clReleaseProgram(m_h_program);
		// the handle is null if no program was created
	}
}

/*int CCLUniqueProgram::n_Get_Kernel(cl_kernel &r_h_kernel,
	const char *p_s_kernel_name)
{
	cl_int n_result;
	r_h_kernel = clCreateKernel(m_h_program, p_s_kernel_name, &n_result);
	return n_result;
}

cl_kernel CCLUniqueProgram::h_Get_Kernel(const char *p_s_kernel_name)
{
	cl_kernel h_kernel;
	int n_result = n_Get_Kernel(h_kernel, p_s_kernel_name);
	_ASSERTE(n_result == CL_SUCCESS); // this would be programmer's error mostly, otherwise it seldom fails
	return (n_result == CL_SUCCESS)? h_kernel : 0;
}*/

/**
 *	@brief creates a kernel from this program and reports the result code
 *
 *	@param[in] p_s_kernel_name is name of the kernel function
 *	@param[out] r_n_result is filled with the result code of clCreateKernel()
 *
 *	@return Returns the handle returned by clCreateKernel(), regardless of the result code.
 */
cl_kernel CCLUniqueProgram::h_Get_Kernel(const char *p_s_kernel_name, CLresult &r_n_result)
{
	cl_int n_error_code;
	cl_kernel h_new_kernel = clCreateKernel(m_h_program, p_s_kernel_name, &n_error_code);
	// create the kernel

	r_n_result = (CLresult)n_error_code;
	// pass the result code to the caller

	return h_new_kernel;
}

/**
 *	@brief creates a kernel from this program
 *
 *	@param[in] p_s_kernel_name is name of the kernel function
 *
 *	@return Returns kernel handle on success, or 0 on failure.
 *
 *	@note The result code is passed to CL_ASSERT before it is tested.
 */
cl_kernel CCLUniqueProgram::h_Get_Kernel(const char *p_s_kernel_name)
{
	cl_int n_result;
	cl_kernel h_new_kernel = clCreateKernel(m_h_program, p_s_kernel_name, &n_result);
	CL_ASSERT(n_result); // this would be programmer's error mostly, otherwise it seldom fails

	if(n_result != CL_SUCCESS)
		return 0;
	// failed, do not hand out whatever handle came back

	return h_new_kernel;
}

/**
 *	@brief compiles the program, either from a file or from source code in memory
 *
 *	If p_s_cache_file equals "%copykernelname%", the cache file name is derived
 *	from the input: "%temp_default%" followed by the source file name (when
 *	compiling a file) or by MD5 of the source code (when compiling from memory),
 *	with slashes, backslashes and colons replaced by underscores.
 *	NOTE(review): "%temp_default%" is presumably expanded by the program compiler;
 *	that is not visible here - confirm.
 *
 *	@param[in] b_is_filename decides whether p_s_filename is a file name (true)
 *		or the source code itself (false)
 *	@param[in] h_context is OpenCL context
 *	@param[in] p_s_filename is file name or source code (see b_is_filename)
 *	@param[in] n_device_num is number of devices in p_device
 *	@param[in] p_device is list of devices to compile the program for
 *	@param[in] p_s_compiler_options is string with compiler options
 *	@param[in] p_s_cache_file is name of the cache file (a null pointer is passed
 *		on to the compiler as is)
 *	@param[in] n_max_cache_size is passed to the compiler as the limit
 *		of program instances in the cache
 *
 *	@return Returns the result of the compilation, or cl_Out_Of_Host_Memory
 *		if the cache file name could not be allocated.
 */
CLresult CCLUniqueProgram::n_Init(bool b_is_filename, cl_context h_context, const char *p_s_filename,
	size_t n_device_num, const cl_device_id *p_device, const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
{
	std::string s_cache_file;
	if(p_s_cache_file && !strcmp(p_s_cache_file, "%copykernelname%")) { // null must not reach strcmp()
		TMD5 t_hash;
		if(!b_is_filename) {
			const char *p_s_source_code = p_s_filename;
			CStreamHash<TMD5> hash;
			hash.Process_Data(p_s_source_code, strlen(p_s_source_code) * sizeof(char));
			t_hash = hash.t_Result();
		}
		// the hash is only calculated (and used) when compiling from memory

		char p_s_md5buff[33];
		if(!stl_ut::AssignCStr(s_cache_file, "%temp_default%") ||
		   !stl_ut::AppendCStr(s_cache_file, (b_is_filename)? p_s_filename :
		   t_hash.p_s_ToString(p_s_md5buff, sizeof(p_s_md5buff))))
			return cl_Out_Of_Host_Memory;
		std::replace(s_cache_file.begin(), s_cache_file.end(), '/', '_');
		std::replace(s_cache_file.begin(), s_cache_file.end(), '\\', '_');
		std::replace(s_cache_file.begin(), s_cache_file.end(), ':', '_');
		p_s_cache_file = s_cache_file.c_str();
	}
	// mangle input filename to cache filename

	CLresult n_result;
	if(b_is_filename) {
		n_result = CCLProgramCompiler::n_CompileProgramFile(h_context, &m_h_program, p_s_filename,
			n_device_num, p_device, p_s_compiler_options, p_s_cache_file, &m_n_compile_flags,
			n_max_cache_size);
	} else {
		const char *p_s_source_code = p_s_filename;
		n_result = CCLProgramCompiler::n_CompileProgram(h_context, &m_h_program, p_s_source_code,
			n_device_num, p_device, p_s_compiler_options, p_s_cache_file, &m_n_compile_flags,
			n_max_cache_size);
	}
	// compile the program

	/*if(b_verbose) 
		CCLProgramCompiler::Dump_StatusWord(m_n_compile_flags);
	// verbose (see what steps did it take)

	if(n_result != CL_SUCCESS && b_stderr_output)
		fprintf(stderr, "error: failed to load OpenCL program (%d)\n", n_result);*/
	// write out

	return n_result;
}

/**
 *	@brief writes binaries of the program to a file, one section per device
 *
 *	Each binary is written as is, surrounded by comment lines containing
 *	zero-based index of the device and its name.
 *
 *	@param[in] p_s_filename is name of the output file
 *
 *	@return Returns cl_Success on success, the OpenCL error code of a failing query,
 *		cl_Invalid_Program if the program has no devices or the binaries could not
 *		be downloaded, cl_Out_Of_Host_Memory if the device list could not be allocated,
 *		or cl_Out_Of_Resources if the file could not be opened or written.
 */
CLresult CCLUniqueProgram::n_Dump_RawBinary(const char *p_s_filename) const
{
	CLresult n_result;
	cl_uint n_device_num;
	if((n_result = (CLresult)clGetProgramInfo(m_h_program, CL_PROGRAM_NUM_DEVICES,
	   sizeof(n_device_num), &n_device_num, NULL)) != CL_SUCCESS)
		return n_result;
	// get number of devices (the queried value is cl_uint, pass its size)

	if(!n_device_num)
		return cl_Invalid_Program;
	// there is nothing to dump (and the list below would be empty)

	std::vector<cl_device_id> device_list;
	try {
		device_list.resize(n_device_num);
	} catch(std::bad_alloc&) {
		return cl_Out_Of_Host_Memory;
	}
	if((n_result = (CLresult)clGetProgramInfo(m_h_program, CL_PROGRAM_DEVICES,
	   n_device_num * sizeof(cl_device_id), &device_list.front(), NULL)) != CL_SUCCESS)
		return n_result;
	// get the devices

	CCLProgramStorage::TProgramBinary b;
	if(!b.Download(m_h_program, n_device_num))
		return cl_Invalid_Program/*false*/;
	// download

	FILE *p_fw;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
	if(fopen_s(&p_fw, p_s_filename, "wb"))
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
	if(!(p_fw = fopen(p_s_filename, "wb")))
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		return cl_Out_Of_Resources; // well ...
	CFILE_PtrGuard guard(p_fw); // close the file once finished
	// open the file

	_ASSERTE(b.binary_size_list.size() == n_device_num);
	std::string s_name;
	for(size_t i = 0, n_offset = 0; i < n_device_num; ++ i) {
		if((n_result = CCLDeviceParams::n_GetDeviceInfoString(s_name,
		   device_list[i], CL_DEVICE_NAME)) != cl_Success)
			return n_result;
		fprintf(p_fw, "// binary for device " PRIsize " (%s)\n\n", i, s_name.c_str());
		fwrite(b.t_data_buffer.p_data + n_offset, b.binary_size_list[i], 1, p_fw);
		if(b.binary_size_list[i] > 0 && b.t_data_buffer.p_data[n_offset + b.binary_size_list[i] - 1] != '\n')
			fprintf(p_fw, "\n"); // nvidia ptx is actually text and ends with a newline already
		fprintf(p_fw, "// ~binary for device " PRIsize " (%s)\n\n", i, s_name.c_str());
		n_offset += b.binary_size_list[i];
	}
	// write the binaries one after another (n_offset is the position of the current one in the buffer)

	if(fflush(p_fw) || ferror(p_fw))
		return cl_Out_Of_Resources; // well ...
	// flush before checking, otherwise errors of the buffered writes would go unnoticed

	return cl_Success;
}

/*
 *								=== ~CCLUniqueProgram ===
 */

/*
 *								=== CCLUniqueInstance ===
 */

/**
 *	@brief constructor; initializes the instance using the default device scoring
 *
 *	All the work is done by n_Init(), which gets a default-constructed
 *	CCLUtils::TDevice_DefaultScoring object along with the arguments of this
 *	constructor; its result is stored in m_n_last_error.
 *
 *	@param[in] n_device_type is device type, passed to n_Init()
 *	@param[in] b_implementation_profile_selection is passed to n_Init()
 *	@param[in] b_stderr_output is passed to n_Init()
 *	@param[in] n_queue_options is command queue options, passed to n_Init()
 */
CCLUniqueInstance::CCLUniqueInstance(int n_device_type /*= CL_DEVICE_TYPE_GPU*/,
	bool b_implementation_profile_selection /*= false*/, bool b_stderr_output /*= true*/,
	int n_queue_options /*= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE*/)
{
	m_p_device[0] = 0; // clear the first device slot before n_Init() runs
	m_n_last_error = n_Init(CCLUtils::TDevice_DefaultScoring(), n_device_type, n_queue_options,
		b_implementation_profile_selection, b_stderr_output);
	// note that n_queue_options comes before the two flags in the n_Init() argument list
}

/**
 *	@brief destructor; intentionally empty
 */
CCLUniqueInstance::~CCLUniqueInstance()
{
	// note that both the command queue and the context now release themselves,
	// there is nothing to be done here
}

/*
 *								=== ~CCLUniqueInstance ===
 */

#if 0

/**
 *	@brief prints source code of the clCall1D*, clCall2D* and clCall3D* macros to stdout
 *
 *	This is a code generator (disabled by the surrounding \#if 0). For each
 *	dimensionality it prints macros for 1 to 16 kernel arguments, in two flavors:
 *	the first three loops print variadic macros (using __VA_ARGS__), the last three
 *	print macros with the kernel arguments spelled out as a, b, c, ...
 *
 *	@note Both flavors use the same macro names; presumably only one of them
 *		is meant to be pasted into the header.
 */
static void PrintMacros()
{
	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall3D%d\n"
			" *	@brief sets arguments of a 3D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_work_size_y is global work size in the y dimension\n"
			" *	@param[in] n_work_size_z is global work size in the z dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n"
			" *	@param[in] n_block_size_y is thread block size in the y dimension\n"
			" *	@param[in] n_block_size_z is thread block size in the z dimension\n"
			" *	@param[in] ... are kernel function argument (int, float or cl_mem)\n", i, i);
		printf(
			" */\n"
			"#define clCall3D%d(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...", i);
		// the parameter list must name everything the macro body below refers to
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel), __VA_ARGS__", i);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), (n_block_size_x), (n_block_size_y), (n_block_size_z)).n_Result()\n\n");
	}
	// variadic 3D macros

	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall2D%d\n"
			" *	@brief sets arguments of a 2D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_work_size_y is global work size in the y dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n"
			" *	@param[in] n_block_size_y is thread block size in the y dimension\n"
			" *	@param[in] ... are kernel function argument (int, float or cl_mem)\n", i, i);
		printf(
			" */\n"
			"#define clCall2D%d(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...", i);
		// the parameter list must name everything the macro body below refers to
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel), __VA_ARGS__", i);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y)).n_Result()\n\n");
	}
	// variadic 2D macros

	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall1D%d\n"
			" *	@brief sets arguments of a 1D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n"
			" *	@param[in] ... are kernel function argument (int, float or cl_mem)\n", i, i);
		printf(
			" */\n"
			"#define clCall1D%d(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...", i);
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel), __VA_ARGS__", i);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)).n_Result()\n\n");
	}
	// variadic 1D macros

	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall3D%d\n"
			" *	@brief sets arguments of a 3D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_work_size_y is global work size in the y dimension\n"
			" *	@param[in] n_work_size_z is global work size in the z dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n"
			" *	@param[in] n_block_size_y is thread block size in the y dimension\n"
			" *	@param[in] n_block_size_z is thread block size in the z dimension\n", i, i);
		for(int j = 0; j < i; ++ j)
			printf(" *	@param[in] %c is kernel function argument (int, float or cl_mem)\n", 'a' + j);
		printf(
			" */\n"
			"#define clCall3D%d(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z", i);
		for(int j = 0; j < i; ++ j)
			printf(",%c", 'a' + j);
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel)", i);
		for(int j = 0; j < i; ++ j)
			printf(", (%c)", 'a' + j);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \\\n"
			"	(n_block_size_x), (n_block_size_y), (n_block_size_z)).n_Result()\n\n");
	}
	// 3D macros with explicit kernel arguments (named a, b, c, ...)

	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall2D%d\n"
			" *	@brief sets arguments of a 2D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_work_size_y is global work size in the y dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n"
			" *	@param[in] n_block_size_y is thread block size in the y dimension\n", i, i);
		for(int j = 0; j < i; ++ j)
			printf(" *	@param[in] %c is kernel function argument (int, float or cl_mem)\n", 'a' + j);
		printf(
			" */\n"
			"#define clCall2D%d(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y", i);
		for(int j = 0; j < i; ++ j)
			printf(",%c", 'a' + j);
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel)", i);
		for(int j = 0; j < i; ++ j)
			printf(", (%c)", 'a' + j);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y)).n_Result()\n\n");
	}
	// 2D macros with explicit kernel arguments

	for(int i = 1; i < 17; ++ i) {
		printf(
			"/**\n"
			" *	@def clCall1D%d\n"
			" *	@brief sets arguments of a 1D kernel with %d arguments and calls it\n"
			" *\n"
			" *	@param[in] h_cmd_queue is handle to the command queue\n"
			" *	@param[in] h_kernel is handle to the kernel to call\n"
			" *	@param[in] n_work_size_x is global work size in the x dimension\n"
			" *	@param[in] n_block_size_x is thread block size in the x dimension\n", i, i);
		for(int j = 0; j < i; ++ j)
			printf(" *	@param[in] %c is kernel function argument (int, float or cl_mem)\n", 'a' + j);
		printf(
			" */\n"
			"#define clCall1D%d(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x", i);
		for(int j = 0; j < i; ++ j)
			printf(",%c", 'a' + j);
		printf(
			") \\\n"
			"	CCLKernelCall(clSetKernelArgs%d((h_kernel)", i);
		for(int j = 0; j < i; ++ j)
			printf(", (%c)", 'a' + j);
		printf(
			"), \\\n"
			"	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)).n_Result()\n\n");
	}
	// 1D macros with explicit kernel arguments
}

#endif // 0
