/*
								+--------------------------------+
								|                                |
								|   ***  OpenCL utilities  ***   |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|          ClUtils.cpp           |
								|                                |
								+--------------------------------+
*/

#include "../NewFix.h"
#include "../CallStack.h"
#include <stdio.h>
#include "../Integer.h"
#include "../MinMax.h"
#include "../Dir.h" // PRIsizeB
#include "../Hash.h"
#include <CL/opencl.h>
#include "ClUtils.h"

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for)
#define for if(0) {} else for
#endif

/*
 *								=== CCLUtils ===
 */

/**
 *	@brief creates an OpenCL context for all devices of the given type
 *
 *	Unless the platform choice is left to the implementation, the platforms are
 *	enumerated and the first one reporting the "FULL_PROFILE" profile is used
 *	(the first enumerated platform is the fallback).
 *
 *	@param[out] p_context receives the new context (written only on success)
 *	@param[in] n_device_type is the device type passed to clCreateContextFromType()
 *	@param[in] b_implementation_profile_selection if set, no platform is chosen here
 *		and NULL context properties are passed to clCreateContextFromType()
 *
 *	@return Returns CL_SUCCESS on success, CL_DEVICE_NOT_FOUND if no platform is
 *		available, CL_OUT_OF_HOST_MEMORY if the platform list could not be allocated,
 *		or the error code of the failing OpenCL call.
 */
int CCLUtils::n_OpenCL_Init(cl_context *p_context, int n_device_type, bool b_implementation_profile_selection)
{
	cl_context_properties p_context_props[3] = {cl_context_properties(CL_CONTEXT_PLATFORM), 0, 0};
	// zero-terminated property list; the platform handle goes to slot 1

	if(!b_implementation_profile_selection) {
		cl_uint n_platform_count = 0;
		int n_status = clGetPlatformIDs(0, NULL, &n_platform_count);
		if(n_status != CL_SUCCESS)
			return n_status; // error
		if(!n_platform_count)
			return CL_DEVICE_NOT_FOUND; // no OpenCL capable platform available
		// count the platforms

		std::vector<cl_platform_id> platforms;
		try {
			platforms.resize(n_platform_count);
		} catch(std::bad_alloc&) {
			return CL_OUT_OF_HOST_MEMORY;
		}
		n_status = clGetPlatformIDs(platforms.size(), &platforms[0], &n_platform_count);
		if(n_status != CL_SUCCESS)
			return n_status;
		_ASSERTE(platforms.size() == n_platform_count);
		// fetch the platform handles

		int n_chosen_platform = 0; // fallback: the first platform
		for(size_t n_index = 0; n_index < platforms.size(); ++ n_index) {
			char p_s_profile[20] = {0};
			clGetPlatformInfo(platforms[n_index], CL_PLATFORM_PROFILE,
				sizeof(p_s_profile), p_s_profile, NULL);
			p_s_profile[sizeof(p_s_profile) - 1] = 0; // force termination

			if(strcmp(p_s_profile, "FULL_PROFILE") != 0)
				continue;
			n_chosen_platform = int(n_index);
			break;
		}
		// look for the first "FULL_PROFILE" platform

		p_context_props[1] = cl_context_properties(platforms[n_chosen_platform]);
	}
	// pick the platform, unless the implementation is supposed to do it

	cl_int n_create_status;
	cl_context h_new_context;
	if(b_implementation_profile_selection)
		h_new_context = clCreateContextFromType(NULL, n_device_type, 0, 0, &n_create_status);
	else
		h_new_context = clCreateContextFromType(p_context_props, n_device_type, 0, 0, &n_create_status);
	if(n_create_status != CL_SUCCESS)
		return n_create_status;
	// create the context

	*p_context = h_new_context;
	// hand it over

	return CL_SUCCESS;
}

/**
 *	@brief gets handles of all the devices attached to a context
 *
 *	@param[in] h_context is the context to query
 *	@param[out] r_device_list is resized and filled with the device handles
 *
 *	@return Returns CL_SUCCESS on success, CL_DEVICE_NOT_FOUND if the context
 *		reports no devices, CL_OUT_OF_HOST_MEMORY if the list could not be
 *		allocated, or the error code returned by clGetContextInfo().
 */
int CCLUtils::n_GetDeviceList(cl_context h_context, std::vector<cl_device_id> &r_device_list)
{
	size_t n_list_bytes;
	int n_status = clGetContextInfo(h_context, CL_CONTEXT_DEVICES, 0, 0, &n_list_bytes);
	if(n_status != CL_SUCCESS)
		return n_status;
	// size of the device list, in bytes

	const size_t n_device_count = n_list_bytes / sizeof(cl_device_id);
	if(n_device_count == 0)
		return CL_DEVICE_NOT_FOUND; // no devices

	try {
		r_device_list.resize(n_device_count);
	} catch(std::bad_alloc&) {
		return CL_OUT_OF_HOST_MEMORY;
	}
	// make room for the handles

	n_status = clGetContextInfo(h_context, CL_CONTEXT_DEVICES,
		n_list_bytes, &r_device_list[0], 0);
	if(n_status != CL_SUCCESS)
		return n_status;
	// read the handles

	return CL_SUCCESS;
}

/**
 *	@brief finds the device with the highest estimated computing power
 *
 *	The estimate is CL_DEVICE_MAX_CLOCK_FREQUENCY times CL_DEVICE_MAX_COMPUTE_UNITS
 *	(a rough score, not real GFLOPS). The first device wins a tie.
 *
 *	@param[out] p_device_id receives the handle of the best device (written only on success)
 *	@param[in] h_context is the context whose devices are compared
 *
 *	@return Returns the zero-based index of the best device in the context device
 *		list, or -1 if the devices or their attributes could not be queried.
 */
int CCLUtils::n_Get_MaxGFlops_DeviceId(cl_device_id *p_device_id, cl_context h_context)
{
	std::vector<cl_device_id> devices;
	if(n_GetDeviceList(h_context, devices) != CL_SUCCESS)
		return -1;
	// enumerate the candidates

	size_t n_winner = 0;
	double f_winner_score = -1;
	for(size_t n_index = 0; n_index < devices.size(); ++ n_index) {
		cl_uint n_clock_freq, n_compute_units;
		if(clGetDeviceInfo(devices[n_index], CL_DEVICE_MAX_CLOCK_FREQUENCY,
		   sizeof(n_clock_freq), &n_clock_freq, NULL) != CL_SUCCESS)
			return -1;
		if(clGetDeviceInfo(devices[n_index], CL_DEVICE_MAX_COMPUTE_UNITS,
		   sizeof(n_compute_units), &n_compute_units, NULL) != CL_SUCCESS)
			return -1;
		// read both attributes, clock frequency first

		const double f_score = n_clock_freq * n_compute_units;
		if(n_index == 0 || f_score > f_winner_score) {
			f_winner_score = f_score;
			n_winner = n_index;
		}
		// keep the strictly better device
	}
	// score every device

	*p_device_id = devices[n_winner];
	// output the handle of the winner

	return int(n_winner);
}

/**
 *	@brief finds the index of the device with the highest estimated computing power
 *
 *	@param[in] h_context is the context whose devices are compared
 *
 *	@return Returns the value of the two-argument overload (the device handle
 *		it produces is discarded).
 */
int CCLUtils::n_Get_MaxGFlops_DeviceId(cl_context h_context)
{
	cl_device_id h_ignored_device;
	const int n_device_index = n_Get_MaxGFlops_DeviceId(&h_ignored_device, h_context);
	// only the index is of interest here

	return n_device_index;
}

/*
 *								=== ~CCLUtils ===
 */

/*
 *								=== CCLDeviceParams ===
 */

/**
 *	@brief reads a string attribute of the device held by this object
 *
 *	@param[out] r_s_str receives the attribute value
 *	@param[in] n_name is the attribute name, passed on to clGetDeviceInfo()
 *
 *	@return Returns the result of the overload taking an explicit device handle,
 *		called with m_h_device.
 */
int CCLDeviceParams::n_GetDeviceInfoString(std::string &r_s_str, int n_name)
{
	const int n_status = n_GetDeviceInfoString(r_s_str, m_h_device, n_name);
	return n_status;
}

/**
 *	@brief reads a string attribute of a device
 *
 *	@param[out] r_s_str receives the attribute value, cut at the first null character
 *	@param[in] h_device is the device to query
 *	@param[in] n_name is the attribute name, passed on to clGetDeviceInfo()
 *
 *	@return Returns CL_SUCCESS on success, CL_OUT_OF_HOST_MEMORY if the string
 *		could not be allocated, or the error code returned by clGetDeviceInfo().
 */
int CCLDeviceParams::n_GetDeviceInfoString(std::string &r_s_str, cl_device_id h_device, int n_name)
{
	size_t n_value_size;
	const int n_size_status = clGetDeviceInfo(h_device, n_name, 0, NULL, &n_value_size);
	if(n_size_status != CL_SUCCESS)
		return n_size_status;
	// ask for the size of the value

	try {
		r_s_str.resize(n_value_size + 1);
	} catch(std::bad_alloc&) {
		return CL_OUT_OF_HOST_MEMORY;
	}
	// one extra character so the terminator written below always fits

	const int n_read_status = clGetDeviceInfo(h_device, n_name,
		n_value_size, &r_s_str[0], NULL);
	if(n_read_status != CL_SUCCESS)
		return n_read_status;
	// read the value

	r_s_str[n_value_size] = 0;
	r_s_str.resize(strlen(r_s_str.c_str()));
	// terminate, then drop everything from the first null on (shrinking only)

	return CL_SUCCESS;
}

/**
 *	@brief tests whether the device held by this object lists an extension
 *
 *	@param[in] p_s_extension_name is a null-terminated extension name
 *
 *	@return Returns the result of the overload taking an explicit device handle,
 *		called with m_h_device.
 */
bool CCLDeviceParams::b_IsExtensionSupported(const char *p_s_extension_name)
{
	const bool b_supported = b_IsExtensionSupported(m_h_device, p_s_extension_name);
	return b_supported;
}

/**
 *	@brief tests whether a device lists an extension in CL_DEVICE_EXTENSIONS
 *
 *	The name must appear as a whole whitespace-delimited token; a name which is
 *	merely a substring of a longer extension name does not count.
 *
 *	@param[in] h_device is the device to query
 *	@param[in] p_s_extension_name is a null-terminated extension name
 *
 *	@return Returns true if the extension is listed. Returns false if it is not
 *		listed, if the name is null or empty, or if the extension string could
 *		not be read.
 */
bool CCLDeviceParams::b_IsExtensionSupported(cl_device_id h_device, const char *p_s_extension_name)
{
	if(!p_s_extension_name || !*p_s_extension_name)
		return false;
	// an empty name would match at every position, a null name can't be measured

	std::string s_exts;
	if(n_GetDeviceInfoString(s_exts, h_device, CL_DEVICE_EXTENSIONS) != CL_SUCCESS)
		return false;
	// get device extension string

	const size_t n_ext_name_length = strlen(p_s_extension_name);
	size_t n_pos = 0;
	while((n_pos = s_exts.find(p_s_extension_name, n_pos)) != std::string::npos) {
		const size_t n_after = n_pos + n_ext_name_length; // find() guarantees n_after <= length

		const bool b_token_start = !n_pos ||
			isspace(static_cast<unsigned char>(s_exts[n_pos - 1])) != 0;
		const bool b_token_end = n_after >= s_exts.length() ||
			isspace(static_cast<unsigned char>(s_exts[n_after])) != 0;
		if(b_token_start && b_token_end)
			return true;
		// there must be space (or string boundary) before and after it

		++ n_pos;
		// resume the search behind this occurrence; without this the same
		// position would be found over and over
	}
	// attempt to find the extension

	return false;
	// extension not found
}

/**
 *	@brief prints a one-line summary of the device to a stream
 *
 *	The line contains the device name, memory size, clock rate, multiprocessor
 *	count, maximum number of threads per block and the maximum block dimensions.
 *
 *	@param[in] p_fw is the output stream (it is passed to fprintf() unchecked)
 *
 *	@note NOTE(review): the integer fields are printed with "%d"; this assumes the
 *		accessors and CLdevprop members used below are int-sized - confirm against
 *		the declarations in the header.
 */
void CCLDeviceParams::Dump(FILE *p_fw)
{
	fprintf(p_fw, "device(\'%s\', " PRIsizeB
		"B RAM, %.2f MHz, multiproc: %d, max-threads-block: %d, max-block-size: %dx%dx%d)\n",
		p_s_Name(),
		PRIsizeBparams(n_Memory_Size()), // expands to the argument(s) matching the PRIsizeB format
		t_Properties().clockRate / 1e3f, // divided by 1000 to match the "MHz" label
		n_Multiprocessor_Num(),
		t_Properties().maxThreadsPerBlock,
		t_Properties().maxThreadsDim[0],
		t_Properties().maxThreadsDim[1],
		t_Properties().maxThreadsDim[2]);
	// show some device parameters
}

/**
 *	@brief constructor; queries parameters of the given device
 *
 *	If the query fails, the device handle is reset to 0 to mark the error and
 *	the remaining members keep their zero / false defaults.
 *
 *	@param[in] h_device is the handle of the device to describe
 */
CCLDeviceParams::CCLDeviceParams(cl_device_id h_device)
	:m_h_device(h_device), m_n_memory_size(0)
{
	memset(m_p_device_caps, 0, 2 * sizeof(int));
	memset(&m_t_devprop, 0, sizeof(CLdevprop));
	m_b_kernel_exec_timeout = false;
	m_n_multiprocessor_num = 0;
	// defaults; QueryDeviceParams() may fail before it gets to set these two,
	// which would otherwise leave them uninitialized

	if(!QueryDeviceParams()) {
		m_h_device = 0; // mark error
		return;
	}
	// get device params
}

/**
 *	@brief constructor; queries parameters of a device selected by index
 *
 *	The device handle stays 0 (the error mark) if the device list of the context
 *	can't be obtained, if the index is out of range, or if the query fails. In
 *	those cases the remaining members keep their zero / false defaults.
 *
 *	@param[in] h_context is the context the device belongs to
 *	@param[in] n_device_index is the zero-based index into the context device list
 */
CCLDeviceParams::CCLDeviceParams(cl_context h_context, int n_device_index)
	:m_h_device(0), // mark error
	m_n_memory_size(0)
{
	memset(m_p_device_caps, 0, 2 * sizeof(int));
	memset(&m_t_devprop, 0, sizeof(CLdevprop));
	m_b_kernel_exec_timeout = false;
	m_n_multiprocessor_num = 0;
	// defaults; the early returns below (and a failing QueryDeviceParams())
	// would otherwise leave these two members uninitialized

	{
		std::vector<cl_device_id> device_list;
		if(CCLUtils::n_GetDeviceList(h_context, device_list) != CL_SUCCESS)
			return;
		// get all the devices

		if(n_device_index < 0 || unsigned(n_device_index) >= device_list.size())
			return;
		// check index

		m_h_device = device_list[n_device_index];
	}
	// query all devices available to the context, get device handle

	if(!QueryDeviceParams()) {
		m_h_device = 0; // mark error
		return;
	}
	// get device params
}

/**
 *	@brief fills the members of this object with parameters of m_h_device
 *
 *	Reads the device name, the compute capability (through the
 *	cl_nv_device_attribute_query extension; 1.0 is assumed without it), the
 *	kernel execution timeout flag (false without the extension), the number
 *	of compute units, the global memory size and, finally, the device properties.
 *
 *	@return Returns true on success, false if any of the queries fails
 *		(members set before the failure keep their new values).
 *
 *	@note Global memory size above UINT_MAX bytes is stored as UINT_MAX
 *		(saturated), rather than being truncated to its low 32 bits.
 */
bool CCLDeviceParams::QueryDeviceParams()
{
	if(n_GetDeviceInfoString(m_s_name, CL_DEVICE_NAME) != CL_SUCCESS)
		return false;
	// get device name

	bool b_nv_device_attribute_query = b_IsExtensionSupported("cl_nv_device_attribute_query");
	// determines whether we have cl_nv_device_attribute_query

	{
		/*std::string s_version;
		if(n_GetDeviceInfoString(s_version, CL_DEVICE_VERSION) != CL_SUCCESS)
			return;
		m_p_device_caps[0] = atol(s_version.c_str());
		m_p_device_caps[1] = (strchr(s_version.c_str(), '.'))? atol(strchr(s_version.c_str(), '.') + 1) : 0;*/ // todo

		if(b_nv_device_attribute_query) {
			cl_uint n_major, n_minor;
			if(clGetDeviceInfo(m_h_device, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &n_major, NULL) != CL_SUCCESS ||
			   clGetDeviceInfo(m_h_device, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &n_minor, NULL) != CL_SUCCESS)
				return false;
			m_p_device_caps[0] = n_major;
			m_p_device_caps[1] = n_minor;
		} else {
			m_p_device_caps[0] = 1;
			m_p_device_caps[1] = 0;
		}
	}
	// get device compute capability (1.0 if it can't be queried)

	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			if(clGetDeviceInfo(m_h_device, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
				return false;
			m_b_kernel_exec_timeout = (n_value != 0);
		} else
			m_b_kernel_exec_timeout = false;
	}
	{
		cl_uint n_value;
		if(clGetDeviceInfo(m_h_device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
			return false;
		m_n_multiprocessor_num = n_value;
	}
	// get some interesting device attributes

	{
		cl_ulong n_value;
		if(clGetDeviceInfo(m_h_device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(n_value), &n_value, NULL) != CL_SUCCESS)
			return false;
		m_n_memory_size = (n_value > cl_ulong(UINT_MAX))? UINT_MAX : static_cast<unsigned int>(n_value);
		// devices with 4 GB of memory or more are common; saturate instead of
		// keeping just the low 32 bits of the size
	}
	// get device memory size

	return n_Query_DeviceProperties(m_t_devprop, m_h_device) == CL_SUCCESS;
}

/**
 *	@brief queries properties of a device and stores them in a CLdevprop structure
 *
 *	The structure is cleared first. Values which OpenCL reports as 64-bit or
 *	size_t quantities are saturated to INT_MAX when they don't fit. The
 *	properties which are only available through cl_nv_device_attribute_query
 *	(regsPerBlock, SIMDWidth) are set to -1 if the device lacks the extension;
 *	textureAlign is always -1 (unknown).
 *
 *	@param[out] r_t_devprop receives the properties (partially filled on failure)
 *	@param[in] h_device is the device to query
 *
 *	@return Returns CL_SUCCESS on success, CL_OUT_OF_HOST_MEMORY if a temporary
 *		buffer could not be allocated, or the error code of the first failing
 *		clGetDeviceInfo() call.
 */
int CCLDeviceParams::n_Query_DeviceProperties(CLdevprop &r_t_devprop, cl_device_id h_device)
{
	memset(&r_t_devprop, 0, sizeof(CLdevprop));
	// clear output

	const bool b_nv_device_attribute_query = b_IsExtensionSupported(h_device, "cl_nv_device_attribute_query");
	// determines whether we have cl_nv_device_attribute_query

	int n_result;

	{
		cl_uint n_value;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.clockRate = n_value * 1000;
	}
	// clock rate, scaled by 1000

	{
		cl_uint n_address_bits;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_ADDRESS_BITS, sizeof(n_address_bits), &n_address_bits, NULL)) != CL_SUCCESS)
			return n_result;
		const cl_uint n_half_bits = n_address_bits / 2;
		const int n_size = (n_half_bits >= 31)? INT_MAX : int((uint32_t(1) << n_half_bits) - 1);
		// 2^(address bits / 2) - 1, limited to INT_MAX (written so that no
		// shift count can get negative or exceed the width of the type)

		r_t_devprop.maxGridSize[0] = n_size;
		r_t_devprop.maxGridSize[1] = n_size;
		r_t_devprop.maxGridSize[2] = 1; // this is wrong; todo
	}
	// grid size estimate

	{
		size_t n_list_bytes;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_WORK_ITEM_SIZES, 0, NULL, &n_list_bytes)) != CL_SUCCESS)
			return n_result;
		size_t n_dimension_num = (n_list_bytes + sizeof(size_t) - 1) / sizeof(size_t);
		if(n_dimension_num < 3)
			n_dimension_num = 3;
		// the list has CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS entries; a fixed buffer
		// of three would make the query fail on devices with more dimensions

		std::vector<size_t> size_list;
		try {
			size_list.resize(n_dimension_num, 0);
		} catch(std::bad_alloc&) {
			return CL_OUT_OF_HOST_MEMORY;
		}
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_WORK_ITEM_SIZES,
		   n_dimension_num * sizeof(size_t), &size_list[0], NULL)) != CL_SUCCESS)
			return n_result;

		for(int i = 0; i < 3; ++ i)
			r_t_devprop.maxThreadsDim[i] = (size_list[i] > size_t(INT_MAX))? INT_MAX : int(size_list[i]);
		// block size (the first three dimensions only)
	}
	{
		size_t n_value;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.maxThreadsPerBlock = (n_value > size_t(INT_MAX))? INT_MAX : int(n_value);
	}
	{
		cl_ulong n_value;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.memPitch = (n_value > cl_ulong(INT_MAX))? INT_MAX : int(n_value);
		// commonly exceeds 2 GB; saturate rather than truncate
	}
	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_REGISTERS_PER_BLOCK_NV, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
				return n_result;
			r_t_devprop.regsPerBlock = n_value;
		} else
			r_t_devprop.regsPerBlock = -1; // unknown
	}
	{
		cl_ulong n_value;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.sharedMemPerBlock = (n_value > cl_ulong(INT_MAX))? INT_MAX : int(n_value);
	}
	{
		if(b_nv_device_attribute_query) {
			cl_uint n_value;
			if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_WARP_SIZE_NV, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
				return n_result;
			r_t_devprop.SIMDWidth = n_value;
		} else
			r_t_devprop.SIMDWidth = -1; // unknown
	}
	{
		r_t_devprop.textureAlign = -1; // unknown; todo
	}
	{
		cl_ulong n_value;
		if((n_result = clGetDeviceInfo(h_device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(n_value), &n_value, NULL)) != CL_SUCCESS)
			return n_result;
		r_t_devprop.totalConstantMemory = (n_value > cl_ulong(INT_MAX))? INT_MAX : int(n_value);
	}
	// get device properties

	return CL_SUCCESS;
}

/**
 *	@brief decides whether a problem can be processed in a single pass
 *
 *	For each dimension, the block size is the problem size divided by the
 *	maximal grid size, rounded up. The problem fits if no block dimension
 *	exceeds the maximal block dimension and the number of threads in the block
 *	doesn't exceed the maximal number of threads per block.
 *
 *	@param[in] n_width is the problem width
 *	@param[in] n_height is the problem height
 *	@param[in] n_depth is the problem depth
 *
 *	@return Returns true if the problem fits. Returns false if it doesn't, or if
 *		the device properties contain a non-positive maximal grid size
 *		(e.g. because they were never successfully queried).
 *
 *	@note Non-positive problem dimensions are treated as needing no threads.
 */
bool CCLDeviceParams::b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const
{
	const int p_problem_size[3] = {n_width, n_height, n_depth};

	int p_block_dim[3];
	bool b_empty_block = false;
	for(int i = 0; i < 3; ++ i) {
		const int n_max_grid = m_t_devprop.maxGridSize[i];
		if(n_max_grid <= 0)
			return false;
		// can't divide by it

		const int n_size = p_problem_size[i];
		p_block_dim[i] = (n_size > 0)? (n_size - 1) / n_max_grid + 1 : 0;
		// round up; this form can't overflow, unlike (size + max - 1) / max
		// which does as soon as the maximal grid size is INT_MAX

		if(p_block_dim[i] > m_t_devprop.maxThreadsDim[i])
			return false;
		// the dimension must not exceed block size

		if(!p_block_dim[i])
			b_empty_block = true;
	}
	// calculate dimensions, relative to maximal grid size (round up)

	if(m_t_devprop.maxThreadsPerBlock < 0)
		return false;
	if(b_empty_block)
		return true;
	// a block with no threads is below any non-negative limit

	const uint64_t n_max_thread_num = uint64_t(m_t_devprop.maxThreadsPerBlock);
	uint64_t n_thread_num = 1;
	for(int i = 0; i < 3; ++ i) {
		n_thread_num *= uint64_t(p_block_dim[i]);
		if(n_thread_num > n_max_thread_num)
			return false;
		// all the factors are positive so the product can only grow; checking
		// after each step keeps it well below the 64-bit range
	}
	// number of threads in the block must be below limit

	return true;
}

/**
 *	@brief calculates block and grid dimensions for a problem of the given size
 *
 *	The block dimensions are the smallest ones with which the grid doesn't
 *	exceed the maximal grid size; the grid dimensions are the problem size
 *	divided by the block size, rounded up.
 *
 *	@param[out] p_block_size receives three block dimensions
 *	@param[out] p_grid_size receives three grid dimensions
 *	@param[in] n_width is the problem width
 *	@param[in] n_height is the problem height
 *	@param[in] n_depth is the problem depth
 *
 *	@return Returns true on success. Returns false (outputs untouched) if any
 *		problem dimension is not positive, if the device properties contain a
 *		non-positive maximal grid size, or if the problem doesn't fit at once
 *		(see b_ProblemFitsAtOnce()).
 */
bool CCLDeviceParams::CalculateGridParams(int *p_block_size, int *p_grid_size,
	int n_width, int n_height, int n_depth) const
{
	const int p_problem_size[3] = {n_width, n_height, n_depth};
	for(int i = 0; i < 3; ++ i) {
		if(p_problem_size[i] <= 0 || m_t_devprop.maxGridSize[i] <= 0)
			return false;
	}
	// an empty dimension would yield a zero block size (and a division by zero
	// below), zero maximal grid size can't be divided by either

	if(!b_ProblemFitsAtOnce(n_width, n_height, n_depth))
		return false;
	// @todo - handle subdivided problems too

	// @todo - optimize block dimensions to approach m_t_devprop.maxThreadsPerBlock as closely as possible (that is good thing to do, right?)

	for(int i = 0; i < 3; ++ i) {
		const int n_size = p_problem_size[i];
		const int n_block = (n_size - 1) / m_t_devprop.maxGridSize[i] + 1;
		// block dimension (lower bound); rounds up without the overflow of
		// (size + max - 1) / max for large maximal grid sizes

		p_block_size[i] = n_block;
		p_grid_size[i] = (n_size - 1) / n_block + 1;
	}
	// store block sizes, and grid size

	return true;
}

/*
 *								=== ~CCLDeviceParams ===
 */

/*
 *								=== CCLProgramCompiler ===
 */

#define CL_COMPILER_UTIL_BOGUS_DATA_DETECTION

/**
 *	@brief prints a human-readable description of a compilation status word
 *
 *	Up to three lines are printed, one for each stage recorded in the status
 *	word: reading the cache, compiling from source and writing the cache.
 *
 *	@param[in] n_status_word is a combination of the cache_* and prog_* flags
 *	@param[in] p_fw is the output stream
 */
void CCLProgramCompiler::Dump_StatusWord(int n_status_word, FILE *p_fw)
{
	if((n_status_word & cache_ReadAttempted) != 0) {
		const char *p_s_message;
		if(n_status_word & cache_ReadSucceeded)
			p_s_message = "program loaded from cache\n";
		else if(n_status_word & cache_ReadFailed_FileNotFound)
			p_s_message = "program not cached\n";
		else if(n_status_word & cache_ReadFailed_IO)
			p_s_message = "i/o error while loading program from cache\n";
		else if(n_status_word & cache_ReadFailed_OutOfMemory)
			p_s_message = "not enough memory while loading program from cache\n";
		else if(n_status_word & cache_ReadFailed_SourceChecksum)
			p_s_message = "program cache miss\n";
		else if(n_status_word & cache_ReadFailed_BinaryChecksum)
			p_s_message = "program cache corrupt\n";
		else if(n_status_word & cache_ReadFailed_CreateProgram)
			p_s_message = "program cached for different device\n";
		else
			p_s_message = "unknown error while loading program from cache\n";
		fputs(p_s_message, p_fw);
	}
	// cache read stage; the first matching failure flag decides the message

	if((n_status_word & prog_CompiledFromSource) != 0) {
		const char *p_s_message;
		if(!(n_status_word & prog_CreateSucceeded))
			p_s_message = "clCreateProgramWithSource() failed while compiling program from source\n";
		else if(!(n_status_word & prog_BuildSucceeded))
			p_s_message = "clBuildProgram() failed while compiling program from source\n";
		else
			p_s_message = "program was successfully compiled from source\n";
		fputs(p_s_message, p_fw);
	}
	// compilation stage

	if((n_status_word & cache_WriteAttempted) != 0) {
		const char *p_s_message = (n_status_word & cache_WriteSucceeded)?
			"program binaries were successfully written to cache\n" :
			"failed to cache program binaries\n";
		fputs(p_s_message, p_fw);
	}
	// cache write stage
}

/**
 *	@brief compiles an OpenCL program, optionally using a file with cached binaries
 *
 *	If a cache file is given, the program is loaded from it, provided it matches
 *	the hash of the source code and build options. Otherwise the program is
 *	built from the source and, if a cache file is given, the binaries are
 *	written to it (a cache file which fails to write is deleted).
 *
 *	@param[in] h_context is the OpenCL context
 *	@param[out] p_program receives the program (written only on success)
 *	@param[in] p_s_source is null-terminated program source code
 *	@param[in] n_device_num is the number of devices in p_device_list
 *	@param[in] p_device_list is the list of devices to build the program for
 *	@param[in] p_s_build_options are the build options (can be null)
 *	@param[in] p_s_cache_file is the cache file name (null disables caching)
 *	@param[out] p_status_word receives a combination of the cache_* and prog_*
 *		flags describing what was done (can be null)
 *
 *	@return Returns CL_SUCCESS on success (failure to write the cache is not
 *		an error), or the error code of clCreateProgramWithSource() or clBuildProgram().
 *
 *	@note Build logs are printed to stderr if CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS
 *		(failed builds) or CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
 *		(successful builds) is defined.
 */
int CCLProgramCompiler::n_CompileProgram(cl_context h_context, cl_program *p_program,
	const char *p_s_source, size_t n_device_num, const cl_device_id *p_device_list,
	const char *p_s_build_options, const char *p_s_cache_file, int *p_status_word)
{
	int n_dummy_status_word;
	if(!p_status_word)
		p_status_word = &n_dummy_status_word;
	// makes sure p_status_word is not null

	*p_status_word = 0;

	TSHA1 t_hash = t_Hash_ProgramSource_BuildOptions(p_s_source, p_s_build_options);
	// calculate program and build options hash

	if(p_s_cache_file) {
		*p_status_word |= cache_ReadAttempted;

		int n_result;
		if((n_result = n_ReadProgramBinaries(p_s_cache_file, t_hash, h_context,
		   p_program, n_device_num, p_device_list)) == cache_ReadSucceeded) {
			*p_status_word |= cache_ReadSucceeded;

			return CL_SUCCESS;
		}

		*p_status_word |= n_result;
		// remember the reason of the failure
	}
	// attempt to load program binaries from file

	*p_status_word |= prog_CompiledFromSource;

	cl_int n_result;
	cl_program h_prog = clCreateProgramWithSource(h_context, 1, &p_s_source, NULL, &n_result);
	if(n_result != CL_SUCCESS)
		return n_result;
	// create program

	*p_status_word |= prog_CreateSucceeded;

	if((n_result = clBuildProgram(h_prog, cl_uint(n_device_num), p_device_list,
	   p_s_build_options, NULL, NULL)) != CL_SUCCESS) {
#ifdef CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS
		{
			std::string s_build_log, s_device_name;
			for(size_t i = 0; i < n_device_num; ++ i) {
				cl_build_status n_build_status;

				if(Get_BuildLog(s_build_log, n_build_status, h_prog, p_device_list[i]) != CL_SUCCESS)
					break;
				// get build log and build status (give up on the logs if that fails)

				if(CCLDeviceParams::n_GetDeviceInfoString(s_device_name, p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS)
					s_device_name.erase();
				const char *p_s_device_name = (s_device_name.empty())? "(null)" : s_device_name.c_str();
				// get device name

				if(n_build_status == CL_BUILD_NONE) {
					fprintf(stderr, "error: program wasn't built for device %d (%s)\n", int(i), p_s_device_name);
					continue;
				} else if(n_build_status == CL_BUILD_ERROR) {
					fprintf(stderr, "error: there were errors while building program for device %d (%s)\n",
						int(i), p_s_device_name);
				} else if(n_build_status != CL_BUILD_SUCCESS)
					fprintf(stderr, "error: unknown program build status for device %d (%s)\n", int(i), p_s_device_name);
				// show build status

				if(n_build_status != CL_BUILD_SUCCESS || !s_build_log.empty()) {
					fprintf(stderr, "=== OpenCL build log for device %d (%s) ===\n%s\n",
						int(i), p_s_device_name, s_build_log.c_str());
				}
				// show build log
			}
		}
		// show build error(s)
#endif //CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS

		clReleaseProgram(h_prog);
		// the program is not passed to the caller, don't leak it

		return n_result;
	}
	// build program

#ifdef CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
	{
		std::string s_build_log, s_device_name;
		for(size_t i = 0; i < n_device_num; ++ i) {
			cl_build_status n_build_status;

			if(Get_BuildLog(s_build_log, n_build_status, h_prog, p_device_list[i]) != CL_SUCCESS)
				break;
			// get build log and build status (the build succeeded; failing
			// to get the log only means there is nothing to show)

			if(CCLDeviceParams::n_GetDeviceInfoString(s_device_name, p_device_list[i], CL_DEVICE_NAME) != CL_SUCCESS)
				s_device_name.erase();
			const char *p_s_device_name = (s_device_name.empty())? "(null)" : s_device_name.c_str();
			// get device name

			if(n_build_status == CL_BUILD_NONE) {
				fprintf(stderr, "warning: program wasn't built for device %d (%s)\n", int(i), p_s_device_name);
				continue;
			} else if(n_build_status == CL_BUILD_ERROR) {
				fprintf(stderr, "warning: there were errors while building program for device %d (%s)\n",
					int(i), p_s_device_name);
			} else if(n_build_status != CL_BUILD_SUCCESS)
				fprintf(stderr, "warning: unknown program build status for device %d (%s)\n", int(i), p_s_device_name);
			// show build status

			if(n_build_status != CL_BUILD_SUCCESS || !s_build_log.empty()) {
				fprintf(stderr, "=== OpenCL build log for device %d (%s) ===\n%s\n",
					int(i), p_s_device_name, s_build_log.c_str());
			}
			// show build log
		}
	}
	// show warnings
#endif //CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS

	*p_status_word |= prog_BuildSucceeded;
	// must be outside of the conditional section above, the build
	// succeeded regardless of whether the warnings are displayed

	*p_program = h_prog;
	// output

	if(p_s_cache_file) {
		*p_status_word |= cache_WriteAttempted;

		if(WriteProgramBinaries(h_prog, t_hash, p_s_cache_file))
			*p_status_word |= cache_WriteSucceeded;
		else {
#if defined(_WIN32) || defined(_WIN64)
			DeleteFile(p_s_cache_file);
#else // _WIN32 || _WIN64
			remove(p_s_cache_file);
#endif // _WIN32 || _WIN64
			// in case it didn't save correctly, do not leave it around
		}
	}
	// write file with binaries

	return CL_SUCCESS;
}

int CCLProgramCompiler::Get_BuildLog(std::string &r_s_build_log, cl_build_status &r_n_build_status,
	cl_program h_program, cl_device_id h_device)
{
	r_s_build_log.erase();
	r_n_build_status = CL_BUILD_NONE;
	// clear output

	for(;;) {
		cl_int n_result;
		if((n_result = clGetProgramBuildInfo(h_program, h_device, CL_PROGRAM_BUILD_STATUS,
		   sizeof(cl_build_status), &r_n_build_status, NULL)) != CL_SUCCESS)
			return n_result;
		if(r_n_build_status != CL_BUILD_IN_PROGRESS)
			break;
#if defined(_WIN32) || defined(_WIN64)
		Sleep(100);
#else //_WIN32 || _WIN64
		sleep(1);
#endif //_WIN32 || _WIN64
	}
	// wait while build is in progress

	if(r_n_build_status == CL_BUILD_NONE)
		return CL_SUCCESS;
	// check build status

	size_t n_build_log_size;
	{
		cl_int n_result;
		if((n_result = clGetProgramBuildInfo(h_program, h_device,
		   CL_PROGRAM_BUILD_LOG, 0, NULL, &n_build_log_size)) != CL_SUCCESS)
			return n_result;
	}
	if(!stl_ut::Resize_To_N(r_s_build_log, n_build_log_size + 1)) {
		fprintf(stderr, "error: not enough memory for build log\n");
		return CL_OUT_OF_HOST_MEMORY;
	}
	// get build log length, allocate string buffer

	{
		cl_int n_result;
		if((n_result = clGetProgramBuildInfo(h_program, h_device, CL_PROGRAM_BUILD_LOG,
		   n_build_log_size, &r_s_build_log[0], NULL)) != CL_SUCCESS)
			return n_result;
	}
	r_s_build_log.resize(strlen(r_s_build_log.c_str()));
	// get build log, erase terminating null from string

	return CL_SUCCESS;
}

/**
 *	@brief compiles an OpenCL program stored in a file
 *
 *	Reads the whole file to memory and passes it to n_CompileProgram().
 *
 *	@param[in] h_context is the OpenCL context
 *	@param[out] p_program receives the program
 *	@param[in] p_s_source_file is the name of the file with the source code
 *	@param[in] n_device_num is the number of devices in p_device_list
 *	@param[in] p_device_list is the list of devices to build the program for
 *	@param[in] p_s_build_options are the build options
 *	@param[in] p_s_cache_file is the cache file name
 *	@param[out] p_status_word receives the status word (can be null; set
 *		to zero if the source file can't be read)
 *
 *	@return Returns the result of n_CompileProgram(), CL_BUILD_PROGRAM_FAILURE if
 *		the file can't be opened or read, or CL_OUT_OF_HOST_MEMORY if there is
 *		not enough memory for the source code.
 */
int CCLProgramCompiler::n_CompileProgramFile(cl_context h_context, cl_program *p_program,
	const char *p_s_source_file, size_t n_device_num, const cl_device_id *p_device_list,
	const char *p_s_build_options, const char *p_s_cache_file, int *p_status_word)
{
	if(p_status_word)
		*p_status_word = 0;
	// in case i/o errors occur below (null is legal, n_CompileProgram() accepts it as well)

	std::string s_program;
	{
		FILE *p_fr;
		if(!(p_fr = fopen(p_s_source_file, "rb")))
			return CL_BUILD_PROGRAM_FAILURE; // not really great, but there's no better error code

		long n_end_pos = -1;
		if(fseek(p_fr, 0, SEEK_END) || (n_end_pos = ftell(p_fr)) < 0 ||
		   fseek(p_fr, 0, SEEK_SET)) {
			fclose(p_fr);
			return CL_BUILD_PROGRAM_FAILURE;
		}
		const size_t n_file_size = size_t(n_end_pos);
		// get file size; ftell() returns -1 on failure, which must not
		// be mistaken for a (huge) size

		try {
			s_program.resize(n_file_size);
		} catch(std::bad_alloc&) {
			fclose(p_fr);
			return CL_OUT_OF_HOST_MEMORY;
		}
		if(n_file_size && fread(&s_program[0], 1, n_file_size, p_fr) != n_file_size) {
			fclose(p_fr);
			return CL_BUILD_PROGRAM_FAILURE; // not really great, but there's no better error code
		}
		fclose(p_fr);
	}
	// read program from a file

	return n_CompileProgram(h_context, p_program, s_program.c_str(),
		n_device_num, p_device_list, p_s_build_options, p_s_cache_file, p_status_word);
	// use the other function
}

/**
 *	@brief calculates the hash of program source code and build options
 *
 *	The source code is hashed first (without the terminating null), followed
 *	by the build options, unless those are null or empty.
 *
 *	@param[in] p_s_source is null-terminated program source code
 *	@param[in] p_s_build_options are null-terminated build options (can be null)
 *
 *	@return Returns the hash of the concatenated data.
 */
TSHA1 CCLProgramCompiler::t_Hash_ProgramSource_BuildOptions(const char *p_s_source, const char *p_s_build_options)
{
	CStreamHash<TSHA1> hasher;

	const size_t n_source_bytes = strlen(p_s_source) * sizeof(char);
	hasher.Process_Data(p_s_source, n_source_bytes);
	// the source

	const bool b_have_options = p_s_build_options != NULL && p_s_build_options[0] != 0;
	if(b_have_options) {
		const size_t n_options_bytes = strlen(p_s_build_options) * sizeof(char);
		hasher.Process_Data(p_s_build_options, n_options_bytes);
	}
	// the options, if there are any

	return hasher.t_Result();
}

/**
 *	@brief loads a program from a file with cached binaries
 *
 *	The file consists of the hash of the source code and build options (five
 *	32-bit words), the number of binaries (32 bits), the size of each binary
 *	(32 bits each), the binaries themselves and the hash of the binaries
 *	(five 32-bit words).
 *
 *	@param[in] p_s_filename is the name of the cache file
 *	@param[in] t_hash is the expected hash of the source code and build options
 *	@param[in] h_context is the OpenCL context
 *	@param[out] p_program receives the program (written only on success)
 *	@param[in] n_device_num is the number of devices in p_device_list
 *	@param[in] p_device_list is the list of devices to create the program for
 *
 *	@return Returns cache_ReadSucceeded on success, otherwise one of the
 *		cache_ReadFailed_* values: FileNotFound if the file can't be opened,
 *		IO if it can't be read, SourceChecksum if it was made for a different
 *		source code or build options, OutOfMemory, BinaryChecksum if the
 *		binaries are corrupt, or CreateProgram if the file contains fewer
 *		binaries than there are devices or OpenCL refuses the binaries.
 */
int CCLProgramCompiler::n_ReadProgramBinaries(const char *p_s_filename, TSHA1 t_hash, cl_context h_context,
	cl_program *p_program, size_t n_device_num, const cl_device_id *p_device_list)
{
	FILE *p_fr;
	if(!(p_fr = fopen(p_s_filename, "rb")))
		return cache_ReadFailed_FileNotFound;
	// open the file

	TSHA1 t_ref_hash;
	if(fread(&t_ref_hash[0], sizeof(uint32_t), 5, p_fr) != 5) {
		fclose(p_fr);
		return cache_ReadFailed_IO;
	}
	// read reference hash

	if(memcmp(&t_hash[0], &t_ref_hash[0], 5 * sizeof(uint32_t))) {
		fclose(p_fr);
		return cache_ReadFailed_SourceChecksum;
	}
	// compare it to current hash

	size_t n_binary_num;
	size_t n_binaries_size = 0;
	std::vector<size_t> binary_size_list;
	{
		uint32_t n_binary_num32;
		if(fread(&n_binary_num32, sizeof(uint32_t), 1, p_fr) != 1 || n_binary_num32 > SIZE_MAX) {
			fclose(p_fr);
			return cache_ReadFailed_IO;
		}
		n_binary_num = size_t(n_binary_num32);

		if(!n_binary_num || n_binary_num < n_device_num) {
			fclose(p_fr);
			return cache_ReadFailed_CreateProgram;
		}
		// clCreateProgramWithBinary() reads one binary per device; with fewer
		// binaries in the file it would read past the end of the lists

		try {
			binary_size_list.resize(n_binary_num);
		} catch(std::bad_alloc&) {
			fclose(p_fr);
			return cache_ReadFailed_OutOfMemory;
		}
		for(size_t i = 0; i < n_binary_num; ++ i) {
			uint32_t n_binary_size32;
			if(fread(&n_binary_size32, sizeof(uint32_t), 1, p_fr) != 1 || n_binary_size32 > SIZE_MAX) {
				fclose(p_fr);
				return cache_ReadFailed_IO;
			}
			const size_t n_binary_size = size_t(n_binary_size32);

			if(n_binaries_size > SIZE_MAX - n_binary_size) {
				fclose(p_fr);
				return cache_ReadFailed_OutOfMemory;
			}
			// the binaries wouldn't fit in the address space

			binary_size_list[i] = n_binary_size;
			n_binaries_size += n_binary_size;
		}
	}
	// read binary sizes, sum them up

	TSHA1 t_binaries_hash;
	std::vector<const unsigned char*> binary_ptr_list;
	std::vector<unsigned char> binaries; // owns the data, released on any return path
	{
		try {
			binary_ptr_list.resize(n_binary_num);
			binaries.resize(n_binaries_size);
		} catch(std::bad_alloc&) {
			fclose(p_fr);
			return cache_ReadFailed_OutOfMemory;
		}
		// alloc buffers

		CStreamHash<TSHA1> hash;

		unsigned char *p_binary_ptr = (binaries.empty())? NULL : &binaries[0];
		for(size_t i = 0; i < n_binary_num; ++ i) {
			binary_ptr_list[i] = p_binary_ptr;

			const size_t n_binary_size = binary_size_list[i];

			if(!n_binary_size || fread(p_binary_ptr, n_binary_size, 1, p_fr) != 1) {
				fclose(p_fr);
				return cache_ReadFailed_IO;
			}
			// an empty binary is not usable (and fread() of zero-size
			// items reports zero items read)

			hash.Process_Data(p_binary_ptr, n_binary_size);

			p_binary_ptr += n_binary_size;
		}
		// get pointers, read binaries (so read block size is 32-bit), calculate their hashes

		t_binaries_hash = hash.t_Result();
	}
	// read binaries

	TSHA1 t_binaries_hash_ref;
	if(fread(&t_binaries_hash_ref[0], sizeof(uint32_t), 5, p_fr) != 5) {
		fclose(p_fr);
		return cache_ReadFailed_IO;
	}
	// read checksum

	fclose(p_fr);
	// close file

	if(memcmp(&t_binaries_hash[0], &t_binaries_hash_ref[0], 5 * sizeof(uint32_t)))
		return cache_ReadFailed_BinaryChecksum;
	// make sure checksum is correct (calculated hash against the stored one)

#if defined CL_COMPILER_UTIL_BOGUS_DATA_DETECTION && defined(__DIR_INCLUDED)
	TFileInfo t_file(p_s_filename);
	if(t_file.n_Size64() != 2 * 5 * sizeof(uint32_t) + sizeof(uint32_t) + // size of two hashes, number of binaries
	   n_binary_num * sizeof(uint32_t) + n_binaries_size) // lengths of binaries, binaries data
		return cache_ReadFailed_BinaryChecksum; // ...
	// make sure file doesn't contain bogus data (paranoid)
#endif // CL_COMPILER_UTIL_BOGUS_DATA_DETECTION && __DIR_INCLUDED

	cl_int n_result;
	cl_program h_prog = clCreateProgramWithBinary(h_context, cl_uint(n_device_num), p_device_list,
		&binary_size_list[0], &binary_ptr_list[0], NULL, &n_result);
	if(n_result != CL_SUCCESS)
		return cache_ReadFailed_CreateProgram;
	if((n_result = clBuildProgram(h_prog, 0, NULL, NULL, NULL, NULL)) != CL_SUCCESS) {
		clReleaseProgram(h_prog); // not passed to the caller, don't leak it
		return cache_ReadFailed_CreateProgram;
	}
	// use loaded binaries to create the program ...

	*p_program = h_prog;
	// write output

	return cache_ReadSucceeded;
}

/**
 *	@brief writes binaries of a program to a cache file
 *
 *	The file consists of the given hash (five 32-bit words), the number of
 *	binaries (32 bits), the size of each binary (32 bits each), the binaries
 *	themselves and the hash of the binaries (five 32-bit words).
 *
 *	@param[in] h_program is the (built) program
 *	@param[in] t_hash is the hash of the program source code and build options
 *	@param[in] p_s_filename is the name of the output file
 *
 *	@return Returns true on success, false on failure (a program with no devices
 *		or no binary data, a binary too large for a 32-bit size field, not
 *		enough memory, OpenCL or i/o error).
 *
 *	@note The binaries are fetched before the file is opened. An incomplete
 *		file may still remain after an i/o error; the caller is responsible
 *		for deleting it.
 */
bool CCLProgramCompiler::WriteProgramBinaries(cl_program h_program, TSHA1 t_hash, const char *p_s_filename)
{
	cl_uint n_device_num;
	if(clGetProgramInfo(h_program, CL_PROGRAM_NUM_DEVICES,
	   sizeof(n_device_num), &n_device_num, NULL) != CL_SUCCESS)
		return false;
	// get number of devices (the size passed must be that of the variable)

	if(!n_device_num)
		return false;
	// we can't query binaries without devices

	const size_t n_binary_num = n_device_num;

	std::vector<size_t> binary_size_list;
	try {
#ifdef _DEBUG
		binary_size_list.resize(n_binary_num + 1);
		binary_size_list[n_binary_num] = 0xbaadf00d; // "magic" word
#else //_DEBUG
		binary_size_list.resize(n_binary_num);
#endif //_DEBUG
	} catch(std::bad_alloc&) {
		return false;
	}
	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARY_SIZES,
	   n_binary_num * sizeof(size_t), &binary_size_list[0], NULL) != CL_SUCCESS)
		return false;
#ifdef _DEBUG
	_ASSERTE(binary_size_list[n_binary_num] == 0xbaadf00d); // make sure it's returning the right amount of data
#endif //_DEBUG
	// get binary sizes

	size_t n_binaries_size = 0;
	for(size_t i = 0; i < n_binary_num; ++ i) {
		const size_t n_binary_size = binary_size_list[i];
		if(n_binary_size > UINT32_MAX || n_binaries_size > SIZE_MAX - n_binary_size)
			return false;
		// sizes are stored as 32-bit numbers; a truncated size would make the file unreadable
		n_binaries_size += n_binary_size;
	}
	if(!n_binaries_size)
		return false;
	// sum binary sizes up (there is nothing to cache if there's no data)

	std::vector<unsigned char> binaries; // owns the data, released on any return path
	std::vector<unsigned char*> binary_ptr_list;
	try {
		binaries.resize(n_binaries_size);
		binary_ptr_list.resize(n_binary_num);
	} catch(std::bad_alloc&) {
		return false;
	}
	// alloc buffers

	{
		unsigned char *p_binary_ptr = &binaries[0];
		for(size_t i = 0; i < n_binary_num; ++ i) {
			binary_ptr_list[i] = p_binary_ptr;
			p_binary_ptr += binary_size_list[i];
		}
	}
	// get pointers; the binaries are stored one right after another

	if(clGetProgramInfo(h_program, CL_PROGRAM_BINARIES,
	   n_binary_num * sizeof(unsigned char*), &binary_ptr_list[0], NULL) != CL_SUCCESS)
		return false;
	// get binaries (the size passed is that of the array of pointers, not of the data)

	TSHA1 t_binaries_hash;
	{
		CStreamHash<TSHA1> hash;
		hash.Process_Data(&binaries[0], n_binaries_size);
		t_binaries_hash = hash.t_Result();
	}
	// calculate hash of the binaries

	FILE *p_fw;
	if(!(p_fw = fopen(p_s_filename, "wb")))
		return false;
	// open file for writing ...

	const uint32_t n_binary_num32 = uint32_t(n_binary_num);
	bool b_result = fwrite(&t_hash[0], sizeof(uint32_t), 5, p_fw) == 5 &&
		fwrite(&n_binary_num32, sizeof(uint32_t), 1, p_fw) == 1;
	// write hash, write number of binaries

	for(size_t i = 0; b_result && i < n_binary_num; ++ i) {
		const uint32_t n_binary_size32 = uint32_t(binary_size_list[i]);
		b_result = fwrite(&n_binary_size32, sizeof(uint32_t), 1, p_fw) == 1;
	}
	// write size of each binary

	b_result = b_result && fwrite(&binaries[0], n_binaries_size, 1, p_fw) == 1 &&
		fwrite(&t_binaries_hash[0], sizeof(uint32_t), 5, p_fw) == 5;
	// write binaries, write their hash

	if(fclose(p_fw))
		b_result = false;
	// close output file (buffered data may fail to be written only now)

	return b_result;
}

/*
 *								=== ~CCLProgramCompiler ===
 */

/*
 *								=== CCLArgLoaderHelper ===
 */

/**
 *	@brief checks an OpenCL result; terminates the program on failure
 *
 *	@param[in] n_error_code is the result to check
 *	@param[in] p_s_file is the name of the source file where the call was made
 *	@param[in] n_line is the line number where the call was made
 *
 *	@return Returns n_error_code if it equals CL_SUCCESS. Otherwise prints
 *		an error message to stderr and calls exit(-1), never returning.
 */
int CCLArgLoaderHelper::__SafeCall(int n_error_code, const char *p_s_file, int n_line)
{
	if(n_error_code == CL_SUCCESS)
		return n_error_code;
	// nothing to report

	fprintf(stderr, "error: clParamSet*() failed : error code %04d : "
		"file \'%s\', line %d.\n", n_error_code, p_s_file, n_line);
	exit(-1);
	// report the failure and quit

	return n_error_code; // not reached
}

/*
 *								=== ~CCLArgLoaderHelper ===
 */
