/** 
 *	@file Main.cpp
 *	@brief similarity matrix calculation demo
 *	@date 2013
 *	@author -tHE SWINe-
 */

#include <stdio.h> // printf()
#include <math.h> // sqrt() for the CPU versions of the routines
#include "VectorKernels.h" // the library
#include "Timer.h" // for timing the results

/**
 *	@def SLIM
 *	@brief enables "slim" memory model, making better use of GPU memory so tests on larger data may be run
 *	@note This effectively disables error checking in results.
 */
//#define SLIM

/**
 *	@def GPU_LENGTHS_5
 *	@def GPU_CROSSCORREL_1
 *	@def GPU_CROSSCORREL_11
 *	@def GPU_CROSSCORREL_12
 *	@def GPU_CROSSCORREL_2
 *	@def GPU_CROSSCORREL_21
 *	@def GPU_CROSSCORREL_22
 *	@def GPU_CORREL_MN_1
 *	@def GPU_CORREL_MN_2
 *	@def GPU_SCALE_3
 *	@brief compile-time switches selecting which GPU algorithms are run (and timed) by main()
 *
 *	Each macro guards one benchmark section of main(); commenting a macro out
 *	removes the corresponding section from the build. The guarded calls are:
 *	- GPU_LENGTHS_5: CCLVectorKernels::Calculate_VectorReduction()
 *	- GPU_CROSSCORREL_1: CCLVectorKernels::Calculate_VectorsCrossCorelation()
 *	- GPU_CROSSCORREL_11: CCLVectorKernels::Calculate_VectorsCrossCorelation_Upper()
 *	- GPU_CROSSCORREL_12: CCLVectorKernels::Calculate_VectorsCrossCorelation_UpperPacked()
 *	- GPU_CROSSCORREL_2: CCLVectorKernels::Calculate_VectorsCrossCorelation_Cached()
 *	- GPU_CROSSCORREL_21: CCLVectorKernels::Calculate_VectorsCrossCorelation_Upper_Cached()
 *	- GPU_CROSSCORREL_22: CCLVectorKernels::Calculate_VectorsCrossCorelation_UpperPacked_Cached()
 *	- GPU_CORREL_MN_1: CCLVectorKernels::Calculate_VectorsCorelation() (two different sets of vectors)
 *	- GPU_CORREL_MN_2, GPU_SCALE_3: presumably a second correlation variant and the
 *	  vector scaling, respectively (TODO confirm against the sections they guard)
 *
 *	@note To perform error-checking of results correctly, SLIM must not be defined
 *		(the comparisons of the GPU results against the CPU results are compiled
 *		only if SLIM is not defined).
 */
#define GPU_LENGTHS_5
#define GPU_CROSSCORREL_1
#define GPU_CROSSCORREL_11
#define GPU_CROSSCORREL_12
#define GPU_CROSSCORREL_2
#define GPU_CROSSCORREL_21
#define GPU_CROSSCORREL_22
#define GPU_CORREL_MN_1
#define GPU_CORREL_MN_2
#define GPU_SCALE_3

/**
 *	@def USE_DOUBLE
 *	@brief selects the floating point precision of the demo; if defined, double
 *		precision is used, otherwise single precision is used
 */
#define USE_DOUBLE

#if !defined(USE_DOUBLE)
typedef float _TyScalar; // scalar type of the host-side arrays
static const char *_TyScalarName = "float"; // name of the scalar type, handed to the kernels as text
#else // !USE_DOUBLE
typedef double _TyScalar; // scalar type of the host-side arrays
static const char *_TyScalarName = "double"; // name of the scalar type, handed to the kernels as text
#endif // !USE_DOUBLE
// the typedef exists only so that the GPU results can be verified against results computed on CPU;
// the library itself can have both flavors coexisting next to each other, and can even choose at runtime

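/**
 *	@brief the main routine of the program
 *
 *	Recognized commandline arguments are --help, --verbose, --no-verbose,
 *	--vector-length \<n\>, --vector-num \<n\> and --vector-num2 \<n\>; an unknown
 *	argument or an argument with a missing value is reported as an error.
 *
 *	@param[in] n_arg_num is number of commandline arguments
 *	@param[in] p_arg_list is a list of commandline arguments
 *
 *	@return Returns 0 on success, -1 on failure.
 */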
int main(int UNUSED(n_arg_num), const char **UNUSED(p_arg_list))
{
	bool b_verbose = true;
	bool b_inplace_normalize = true;

	// note that for small (mobile) GPUs, pass size must be small, otherwise a timeout occurs
	// in windows 7 or above, see http://msdn.microsoft.com/en-us/windows/hardware/gg487368.aspx
	// or [HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Control\GraphicsDrivers] "TdrLevel"=dword:00000000

	size_t n_feature_num = 4353;//4353;
	size_t n_feature2_num = 4460;//4460;
	size_t n_vector_length = 192; // 192 = SIFT

	for(int i = 1; i < n_arg_num; ++ i) {
		if(!strcmp(p_arg_list[i], "--help")) {
			printf("no help for you, muhaha!\n");
			return 0;
		} else if(!strcmp(p_arg_list[i], "--verbose")) {
			b_verbose = true;
		} else if(!strcmp(p_arg_list[i], "--no-verbose")) {
			b_verbose = false;
		} else if(i + 1 == n_arg_num) {
			fprintf(stderr, "error: argument \'%s\': unknown argument or missing a value\n", p_arg_list[i]);
			return -1;
		} else if(!strcmp(p_arg_list[i], "--vector-length")) {
			n_vector_length = atol(p_arg_list[++ i]);
			if(n_vector_length < 128 || n_vector_length > (1 << 22)) {
				fprintf(stderr, "error: invalid vector length, reseting to default 192\n");
				n_vector_length = 192;
			}
		} else if(!strcmp(p_arg_list[i], "--vector-num")) {
			n_feature_num = atol(p_arg_list[++ i]);
			if(n_feature_num <= 0 || n_feature_num > 4500) {
				fprintf(stderr, "error: invalid vector number, reseting to default 4500\n");
				n_feature_num = 4500;
			}
		} else if(!strcmp(p_arg_list[i], "--vector-num2")) {
			n_feature2_num = atol(p_arg_list[++ i]);
			if(n_feature_num <= 0 || n_feature_num > 4500) {
				fprintf(stderr, "error: invalid vector number, reseting to default 4500\n");
				n_feature_num = 4500;
			}
		} else {
			fprintf(stderr, "error: argument \'%s\': unknown argument\n", p_arg_list[i]);
			return -1;
		}
	}
	// parse commandline

	size_t n_feature_num_align = std::min(n_Align_Up_POT(n_feature_num, size_t(1024)), n_Make_POT(n_feature_num));
	_ASSERTE(n_feature_num_align >= n_feature_num);
	// align number of vectors to the nearest power of two, but only up to 1024 away (don't waste memory)

	_TyScalar *p_feature_vectors;
	_TyScalar *p_feature2_vectors;
	_TyScalar *p_vector_lengths;
	_TyScalar *p_vector_correlations;
	_TyScalar *p_normalized_vectors;
#ifndef SLIM
	_TyScalar *p_vector_lengths_gpu;
	_TyScalar *p_vector_correlations_gpu;
	_TyScalar *p_normalized_vectors_gpu;
	if(!(p_feature_vectors = new(std::nothrow) _TyScalar[(n_feature_num + n_feature2_num) * n_vector_length])) {
		fprintf(stderr, "error: not enough memory\n");
		return -1;
	}
	p_feature2_vectors = p_feature_vectors + n_feature_num * n_vector_length;
	if(b_inplace_normalize) {
		if(!(p_normalized_vectors = new(std::nothrow) _TyScalar[n_feature_num * n_vector_length])) {
			fprintf(stderr, "error: not enough memory\n");
			delete[] p_feature_vectors;
			return -1;
		}
		p_normalized_vectors_gpu = (_TyScalar*)p_feature_vectors; // !! loading normalized vectors from GPU will invalidate term vectors !!
	} else {
		if(!(p_normalized_vectors = new(std::nothrow) _TyScalar[n_feature_num * n_vector_length * 2])) {
			fprintf(stderr, "error: not enough memory\n");
			delete[] p_feature_vectors;
			return -1;
		}
		p_normalized_vectors_gpu = p_normalized_vectors + (n_feature_num * n_vector_length);
	}
	if(!(p_vector_lengths = new(std::nothrow) _TyScalar[2 * n_feature_num])) {
		fprintf(stderr, "error: not enough memory\n");
		delete[] p_feature_vectors;
		delete[] p_normalized_vectors;
		return -1;
	}
	p_vector_lengths_gpu = p_vector_lengths + n_feature_num;
	if(!(p_vector_correlations = new(std::nothrow) _TyScalar[(n_feature_num + n_feature_num_align) * std::max(n_feature2_num, n_feature_num)])) { // might be big, might only calculate a small portion of that at a time
		fprintf(stderr, "error: not enough memory\n");
		delete[] p_feature_vectors;
		delete[] p_normalized_vectors;
		delete[] p_vector_lengths;
		return -1;
	}
	p_vector_correlations_gpu = p_vector_correlations + n_feature_num * std::max(n_feature2_num, n_feature_num);
	// allocate input / output arrays
#else //SLIM
	if(!(p_feature_vectors = new(std::nothrow) _TyScalar[(n_feature_num + n_feature2_num) * n_vector_length])) {
		fprintf(stderr, "error: not enough memory\n");
		return -1;
	}
	p_feature2_vectors = p_feature_vectors + n_feature_num * n_vector_length;
	p_normalized_vectors = p_feature_vectors;
	if(!(p_vector_lengths = new(std::nothrow) _TyScalar[n_feature_num])) {
		fprintf(stderr, "error: not enough memory\n");
		delete[] p_feature_vectors;
		return -1;
	}
	if(!(p_vector_correlations = new(std::nothrow) _TyScalar[(n_feature_num + n_feature_num_align) * std::max(n_feature2_num, n_feature_num)])) { // might be big, might only calculate a small portion of that at a time
		fprintf(stderr, "error: not enough memory\n");
		delete[] p_feature_vectors;
		delete[] p_vector_lengths;
		return -1;
	}
#endif //SLIM

	for(size_t i = 0; i < n_feature_num + n_feature2_num; ++ i) {
		for(size_t d = 0; d < n_vector_length; ++ d)
			p_feature_vectors[i * n_vector_length + d] = _TyScalar(float(rand()) / RAND_MAX * 22) - 10;
	}
	// generate random vectors with real-valued (not integer) elements in the range [-10, 12]

	size_t n_data_size = n_feature_num * n_vector_length * sizeof(_TyScalar);

	if(b_verbose) {
		printf("data type: %s\n", _TyScalarName);
		printf("GPU normalize: %s\n", (b_inplace_normalize)? "inplace" : "to separate array");
		printf("working set size: " PRIsizeB "B\n", PRIsizeBparams(n_data_size));
		printf("output size for cross-corel: " PRIsizeB "B\n",
			PRIsizeBparams(n_feature_num * n_feature_num * sizeof(_TyScalar)));
		printf("output size for cross-corel: " PRIsizeB "B (POT aligned)\n",
			PRIsizeBparams(n_feature_num * n_feature_num_align * sizeof(_TyScalar)));
	} else {
		printf("%d\n", sizeof(_TyScalar));
		printf("%d\n", (b_inplace_normalize)? 1 : 0);
		printf("%d\n", n_data_size);
	}

	CTimer timer;
	// timer to measure performance

	{
		double f_cpu_lengths_start = timer.f_Time();

		for(size_t i = 0; i < n_feature_num; ++ i) {
			_TyScalar n_sum = 0;
			for(size_t d = 0; d < n_vector_length; ++ d)
				n_sum += (p_feature_vectors[i * n_vector_length + d] * p_feature_vectors[i * n_vector_length + d]);
			p_vector_lengths[i] = _TyScalar(1.0 / _TyScalar(sqrt(double(n_sum))));
		}

		double f_cpu_lengths_end = timer.f_Time();

		for(size_t j = 0; j < n_feature_num; ++ j) {
			for(size_t i = 0; i < n_feature_num; ++ i) {
				if(i >= j) {
					_TyScalar n_sum = 0;
					for(size_t d = 0; d < n_vector_length; ++ d) {
						n_sum += (p_feature_vectors[i * n_vector_length + d] - p_feature_vectors[j * n_vector_length + d]) *
							(p_feature_vectors[i * n_vector_length + d] - p_feature_vectors[j * n_vector_length + d]);
					}
					p_vector_correlations[i + n_feature_num * j] =
						p_vector_correlations[j + n_feature_num * i] = _TyScalar(sqrt(double(n_sum)));
				}
				// it is symmetric
			}
		}

		double f_cpu_ccorrel_end = timer.f_Time();

		for(size_t i = 0; i < n_feature_num; ++ i) {
			_TyScalar f_scale = /*1 /*/ p_vector_lengths[i]; // it is inverse already
			for(size_t d = 0; d < n_vector_length; ++ d)
				p_normalized_vectors[i * n_vector_length + d] = f_scale * p_feature_vectors[i * n_vector_length + d];
		}

		double f_cpu_scaling_end = timer.f_Time();

		if(b_verbose) {
			printf("CPU lengths: %.5f\n", f_cpu_lengths_end - f_cpu_lengths_start);
			printf("CPU c-corel: %.5f\n", f_cpu_ccorrel_end - f_cpu_lengths_end);
			printf("CPU scaling: %.5f\n", f_cpu_scaling_end - f_cpu_ccorrel_end);
			//printf("CPU   total: %.5f\n", f_cpu_scaling_end - f_cpu_lengths_start);
		} else {
			printf("%g\n", f_cpu_lengths_end - f_cpu_lengths_start);
			printf("%g\n", f_cpu_ccorrel_end - f_cpu_lengths_end);
			printf("%g\n", f_cpu_scaling_end - f_cpu_ccorrel_end);
			//printf("%g\n", f_cpu_scaling_end - f_cpu_lengths_start);
		}
	}
	// CPU version of the algorithm

	{
		if(b_verbose)
			printf("initializing OpenCL ...\n");

		cl_context h_context;
		if(CCLUtils::n_OpenCL_Init(&h_context) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to initialize OpenCL\n");
			return -1;
		}
		// create OpenCL context

		CCLDeviceParams device_params(h_context, CCLUtils::n_Get_MaxGFlops_DeviceId(h_context));
		cl_device_id h_device = device_params.h_Device();
		// get the fastest device

		if(b_verbose) {
			device_params.Dump();
			printf("this device has SIMD width of %d\n", device_params.t_Properties().SIMDWidth); // NVIDIA specific (will print -1 on ATI)
		}
		// show some info about the device

		cl_command_queue h_cmd_queue;
		{
			cl_int n_result;
			h_cmd_queue = clCreateCommandQueue(h_context, h_device,
				CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to create OpenCL command queue\n");
				return -1;
			}
		}
		// create command queue

		CCLVectorKernels kernels;
		// the library; it must either reside in block, or be allocated on heap using new (not copyable)

		kernels.Set_DataTypes(_TyScalarName, sizeof(_TyScalar)); // this is fixed to float at the moment, lot of hard-coded constants
		kernels.Configure_ReductionOp("((x)*(x))", "((x)+(y))", "(1.0/sqrt((x)))"); // inverse norm
		kernels.Configure_ScalingOp("((x)*(y))"); // scaling by inverse norm to compute normalized vectors
		// configure the kernels here
		// note that one limitation is that the strings must not contain spaces (they are passed in commandline)

		if(!kernels.Compile(h_context, h_device, "Kernels.c", b_verbose)) {
			fprintf(stderr, "error: failed to compile vector kernels with the current configuration\n");
			return -1;
		}
		// compile the function kernels; file Kernels.c must be in path

		cl_mem dp_vectors, dp_vectors2, dp_vector_lengths,
			dp_vectors_normalized, dp_vector_correlations;
		{
			cl_int n_result;
			dp_vector_lengths = clCreateBuffer(h_context, CL_MEM_READ_WRITE, n_feature_num * sizeof(_TyScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return -1;
			}
			dp_vector_correlations = clCreateBuffer(h_context, CL_MEM_READ_WRITE,
				std::max(n_feature_num, n_feature2_num) * n_feature_num_align * sizeof(_TyScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return -1;
			}
			dp_vectors = clCreateBuffer(h_context, (b_inplace_normalize)? CL_MEM_READ_WRITE : CL_MEM_READ_ONLY,
				n_feature_num * n_vector_length * sizeof(_TyScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return -1;
			}
			dp_vectors2 = clCreateBuffer(h_context, (b_inplace_normalize)? CL_MEM_READ_WRITE : CL_MEM_READ_ONLY,
				n_feature2_num * n_vector_length * sizeof(_TyScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return -1;
			}
			if(!b_inplace_normalize) {
				dp_vectors_normalized = clCreateBuffer(h_context, CL_MEM_WRITE_ONLY,
					n_feature_num * n_vector_length * sizeof(_TyScalar), NULL, &n_result); // normalize into separate array
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return -1;
				}
			} else
				dp_vectors_normalized = dp_vectors; // normalize inplace
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return -1;
			}
		}
		// allocate OpenCL buffer(s)

		if(clEnqueueWriteBuffer(h_cmd_queue, dp_vectors, true, 0, n_feature_num * n_vector_length * sizeof(_TyScalar),
		   p_feature_vectors, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data to the device\n");
			return -1;
		}
		if(clEnqueueWriteBuffer(h_cmd_queue, dp_vectors2, true, 0, n_feature2_num * n_vector_length * sizeof(_TyScalar),
		   p_feature2_vectors, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data to the device\n");
			return -1;
		}
		// copy data from host to device

#ifdef GPU_LENGTHS_5
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorReduction(h_cmd_queue, n_vector_length,
			   n_feature_num, dp_vector_lengths, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose)
				printf("GPU lengths5: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start, PRIsizeBparams(n_data_size / (f_gpu_lengths_end - f_gpu_lengths_start)));
			else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_lengths, true, 0, n_feature_num * sizeof(_TyScalar),
		   p_vector_lengths_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			for(size_t i = 0, n_err_num = 0; i < n_feature_num && n_err_num < 10; ++ i) {
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_lengths_gpu[i] - p_vector_lengths[i])));
				_TyScalar f_mag = std::max(p_vector_lengths_gpu[i], p_vector_lengths[i]);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					fprintf(stderr, "warning: vector %d: length difference %g (correct magnitude %g, GPU %g)\n",
						i, f_diff, p_vector_lengths[i], p_vector_lengths_gpu[i]);
					b_differences = true;
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else
				fprintf(stderr, "error: gpu results do NOT match cpu results\n");
		}
		// check results
#endif //SLIM
#endif //GPU_LENGTHS_5

#ifdef GPU_CROSSCORREL_1
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed1\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl1: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num_align;
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - p_vector_correlations[i])));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], p_vector_correlations[i]);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff,
							p_vector_correlations[i], p_vector_correlations_gpu[idx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_1

#ifdef GPU_CROSSCORREL_11
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation_Upper(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed2\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl1.1: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size * .5 / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num_align;
				_TyScalar f_correct;
				if(i % n_feature_num < i / n_feature_num) {
					f_correct = 0; // lower diagonal
					continue; // don't care about lower diag
				} else
					f_correct = p_vector_correlations[i];
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - f_correct)));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], f_correct);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff, f_correct, p_vector_correlations_gpu[idx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_11

#ifdef GPU_CROSSCORREL_12
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation_UpperPacked(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed3\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl1.2: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size * .5 / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_TriangularSize2(n_feature_num_align, n_feature_num) * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num;
				size_t triidx = n_TriangularOffset(i / n_feature_num, i % n_feature_num, n_feature_num_align); // it is densely packed in an triangular array
				_TyScalar f_correct;
				if(i % n_feature_num < i / n_feature_num) {
					f_correct = 0; // lower diagonal
					continue; // don't care about lower diag
				} else
					f_correct = p_vector_correlations[idx];
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[triidx] - f_correct)));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[triidx], f_correct);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff, f_correct, p_vector_correlations_gpu[triidx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_12

#ifdef GPU_CROSSCORREL_2
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation_Cached(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed4\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl2: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num_align;
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - p_vector_correlations[i])));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], p_vector_correlations[i]);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff,
							p_vector_correlations[i], p_vector_correlations_gpu[idx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_2

#ifdef GPU_CROSSCORREL_21
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation_Upper_Cached(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed5\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl2.1: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size * .5 / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num_align;
				_TyScalar f_correct;
				if(i % n_feature_num < i / n_feature_num) {
					f_correct = 0; // lower diagonal
					continue; // don't care about lower diag
				} else
					f_correct = p_vector_correlations[i];
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - f_correct)));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], f_correct);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff, f_correct, p_vector_correlations_gpu[idx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_21

#ifdef GPU_CROSSCORREL_22
		{
			double f_gpu_lengths_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCrossCorelation_UpperPacked_Cached(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, dp_vector_correlations, dp_vectors)) {
				fprintf(stderr, "error: calculating reduction failed6\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish

			double f_gpu_lengths_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU c-correl2.2: %.5f (" PRIsizeB "B/sec)\n", f_gpu_lengths_end - f_gpu_lengths_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size * .5 / (f_gpu_lengths_end - f_gpu_lengths_start)));
			} else
				printf("%g\n", f_gpu_lengths_end - f_gpu_lengths_start);
		}

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_TriangularSize2(n_feature_num_align, n_feature_num) * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			for(size_t i = 0; i < n_feature_num * n_feature_num; ++ i) {
				size_t idx = i % n_feature_num + (i / n_feature_num) * n_feature_num;
				size_t triidx = n_TriangularOffset(i / n_feature_num, i % n_feature_num, n_feature_num_align); // it is densely packed in an triangular array
				_TyScalar f_correct;
				if(i % n_feature_num < i / n_feature_num) {
					f_correct = 0; // lower diagonal
					continue; // don't care about lower diag
				} else
					f_correct = p_vector_correlations[idx];
				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[triidx] - f_correct)));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[triidx], f_correct);
				if(f_diff > f_mag * 1e-5f) { // results are good to 5 places
					if(n_err_num < 10) {
						fprintf(stderr, "warning: correlation %d (%d, %d): length difference %g (correct value %g, GPU %g)\n",
							i, i % n_feature_num, i / n_feature_num, f_diff, f_correct, p_vector_correlations_gpu[triidx]);
						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%d out of %d different)\n", n_err_num, n_feature_num * n_feature_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CROSSCORREL_22

		{
			double f_cpu_correl_start = timer.f_Time();

			for(size_t j = 0; j < n_feature2_num; ++ j) {
				for(size_t i = 0; i < n_feature_num; ++ i) {
					_TyScalar n_sum = 0;
					for(size_t d = 0; d < n_vector_length; ++ d) {
						n_sum += (p_feature_vectors[i * n_vector_length + d] - p_feature2_vectors[j * n_vector_length + d]) *
							(p_feature_vectors[i * n_vector_length + d] - p_feature2_vectors[j * n_vector_length + d]);
					}
					p_vector_correlations[i + n_feature_num * j] = _TyScalar(sqrt(double(n_sum)));
				}
			}

			double f_cpu_correl_end = timer.f_Time();

			if(b_verbose)
				printf("CPU correlation: %.5f\n", f_cpu_correl_end - f_cpu_correl_start);
			else
				printf("%g\n", f_cpu_correl_end - f_cpu_correl_start);
		}
		// CPU version of the algorithm

#ifdef GPU_CORREL_MN_1
		{
			double f_gpu_correl_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCorelation(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, n_feature2_num, dp_vector_correlations, dp_vectors, dp_vectors2)) {
				fprintf(stderr, "error: calculating reduction failed7\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish, so that the end time is taken after the work completed

			double f_gpu_correl_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU correl1: %.5f (" PRIsizeB "B/sec)\n", f_gpu_correl_end - f_gpu_correl_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size / (f_gpu_correl_end - f_gpu_correl_start)));
			} else
				printf("%g\n", f_gpu_correl_end - f_gpu_correl_start);
		}
		// run and time the first GPU variant of the M x N correlation

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature2_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host (blocking read)

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			const size_t n_correl_num = n_feature_num * n_feature2_num; // number of compared entries
			for(size_t i = 0; i < n_correl_num; ++ i) {
				const size_t n_col = i % n_feature_num, n_row = i / n_feature_num;
				size_t idx = n_col + n_row * n_feature_num_align;
				// the CPU result is dense (index i), the GPU result is read with a row
				// stride of n_feature_num_align (index idx)

				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - p_vector_correlations[i])));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], p_vector_correlations[i]);
				if(!(f_diff <= f_mag * 1e-5f)) { // results are good to 5 places
					// the negated comparison also reports NaN results, which would
					// compare false with '>' and pass unnoticed

					if(n_err_num < 10) { // print only the first few mismatches
						fprintf(stderr, "warning: correlation %zu (%zu, %zu): length difference %g (correct value %g, GPU %g)\n",
							i, n_col, n_row, f_diff,
							p_vector_correlations[i], p_vector_correlations_gpu[idx]);
						// the indices are size_t, need %zu rather than %d

						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%zu out of %zu different)\n", n_err_num, n_correl_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CORREL_MN_1

#ifdef GPU_CORREL_MN_2
		{
			double f_gpu_correl_start = timer.f_Time();

			if(!kernels.Calculate_VectorsCorelation_Cached(h_cmd_queue, n_vector_length,
				n_feature_num, n_feature_num_align, n_feature2_num, dp_vector_correlations, dp_vectors, dp_vectors2)) {
				fprintf(stderr, "error: calculating reduction failed8\n");
				return -1;
			}
			// execute the operation (is asynchronous - returns immediately)

			{
				int n_result = clFinish(h_cmd_queue);
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: clFinish() failed: %d\n", n_result);
					return -1;
				}
			}
			// wait for the GPU to finish, so that the end time is taken after the work completed

			double f_gpu_correl_end = timer.f_Time();

			if(b_verbose) {
				printf("GPU correl2: %.5f (" PRIsizeB "B/sec)\n", f_gpu_correl_end - f_gpu_correl_start,
					PRIsizeBparams(double(n_feature_num) * n_data_size / (f_gpu_correl_end - f_gpu_correl_start)));
			} else
				printf("%g\n", f_gpu_correl_end - f_gpu_correl_start);
		}
		// run and time the second ("cached") GPU variant of the M x N correlation

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vector_correlations,
		   true, 0, n_feature2_num * n_feature_num_align * sizeof(_TyScalar),
		   p_vector_correlations_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host (blocking read)

		{
			bool b_differences = false;
			size_t n_err_num = 0;
			const size_t n_correl_num = n_feature_num * n_feature2_num; // number of compared entries
			for(size_t i = 0; i < n_correl_num; ++ i) {
				const size_t n_col = i % n_feature_num, n_row = i / n_feature_num;
				size_t idx = n_col + n_row * n_feature_num_align;
				// the CPU result is dense (index i), the GPU result is read with a row
				// stride of n_feature_num_align (index idx)

				_TyScalar f_diff = _TyScalar(fabs(double(p_vector_correlations_gpu[idx] - p_vector_correlations[i])));
				_TyScalar f_mag = std::max(p_vector_correlations_gpu[idx], p_vector_correlations[i]);
				if(!(f_diff <= f_mag * 1e-5f)) { // results are good to 5 places
					// the negated comparison also reports NaN results, which would
					// compare false with '>' and pass unnoticed

					if(n_err_num < 10) { // print only the first few mismatches
						fprintf(stderr, "warning: correlation %zu (%zu, %zu): length difference %g (correct value %g, GPU %g)\n",
							i, n_col, n_row, f_diff,
							p_vector_correlations[i], p_vector_correlations_gpu[idx]);
						// the indices are size_t, need %zu rather than %d

						b_differences = true;
					}
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else {
				fprintf(stderr, "error: gpu results do NOT match cpu results"
					" (%zu out of %zu different)\n", n_err_num, n_correl_num);
			}
		}
		// check results
#endif //SLIM
#endif // GPU_CORREL_MN_2

#ifdef GPU_SCALE_3
		{
			// run and time the GPU vector scaling kernel
			const double f_scale_begin = timer.f_Time();

			// enqueue the operation (is asynchronous - returns immediately)
			// note that dp_vectors_normalized and dp_vectors might point to the same array - can work inplace
			if(!kernels.Calculate_ScaledVectors(h_context, h_cmd_queue, n_vector_length,
			   n_feature_num, dp_vectors_normalized, dp_vectors, dp_vector_lengths)) {
				fprintf(stderr, "error: calculating reduction failed9\n");
				return -1;
			}

			// block until the GPU finishes, so that the end time is meaningful
			const int n_finish_status = clFinish(h_cmd_queue);
			if(n_finish_status != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed: %d\n", n_finish_status);
				return -1;
			}

			const double f_scale_end = timer.f_Time();
			const double f_scale_time = f_scale_end - f_scale_begin;

			// terse mode prints the bare time, verbose mode adds a label and the throughput
			if(!b_verbose)
				printf("%g\n", f_scale_time);
			else
				printf("GPU scaling3: %.5f (" PRIsizeB "B/sec)\n", f_scale_time, PRIsizeBparams(n_data_size * 2 / f_scale_time));
		}
#endif //GPU_SCALE_3

#ifndef SLIM
		if(clEnqueueReadBuffer(h_cmd_queue, dp_vectors_normalized, true, 0, n_feature_num * n_vector_length * sizeof(_TyScalar),
		   p_normalized_vectors_gpu, 0, NULL, NULL) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to copy data from the device\n");
			return -1;
		}
		// copy results from device back to host (blocking read)
		// in case b_inplace_normalize is set, p_normalized_vectors_gpu == p_feature_vectors !!!! (but this is the end of the program)

		{
			bool b_differences = false;
			for(size_t i = 0, n_err_num = 0; i < (n_feature_num * n_vector_length) && n_err_num < 10; ++ i) {
				// the loop stops after the first 10 mismatches were reported

				_TyScalar f_diff = _TyScalar(fabs(double(p_normalized_vectors_gpu[i] - p_normalized_vectors[i])));
				_TyScalar f_mag = std::max(_TyScalar(1), std::max(p_normalized_vectors_gpu[i], p_normalized_vectors[i]));
				// the tolerance is relative to the larger value, but never below 1

				if(!(f_diff <= f_mag * 1e-5f)) { // results are good to 5 places
					// the negated comparison also reports NaN results, which would
					// compare false with '>' and pass unnoticed

					fprintf(stderr, "warning: vector %zu, elem %zu: difference %g (correct %g, GPU %g)\n",
						i / n_vector_length, i % n_vector_length, f_diff, p_normalized_vectors[i], p_normalized_vectors_gpu[i]);
					// the indices are size_t, need %zu rather than %d

					b_differences = true;
					++ n_err_num;
				}
			}
			if(!b_differences) {
				if(b_verbose)
					printf("ok, gpu results match cpu results\n");
			} else
				fprintf(stderr, "error: gpu results do NOT match cpu results\n");
		}
		// check results
#endif //SLIM

		// release the device buffers (the return values of the release calls are not checked)
		clReleaseMemObject(dp_vector_lengths);
		clReleaseMemObject(dp_vector_correlations);
		clReleaseMemObject(dp_vectors);
		clReleaseMemObject(dp_vectors2);
		if(!b_inplace_normalize)
			clReleaseMemObject(dp_vectors_normalized);
		// NOTE(review): presumably dp_vectors_normalized is the same object as dp_vectors
		// in the in-place mode, and the condition avoids releasing it twice - confirm
		// at the place where the buffers are allocated
		// free memory

		// release the command queue before the context
		clReleaseCommandQueue(h_cmd_queue);
		clReleaseContext(h_context);
		// cleanup

		if(b_verbose)
			printf("finished ...\n");
	}

	// free the host-side arrays
	delete[] p_vector_lengths;
	delete[] p_feature_vectors;
#ifndef SLIM
	delete[] p_normalized_vectors;
	// NOTE(review): the remark originally attached to the line above read "this equals
	// p_feature_vectors and has just been freed"; if that were true, this would be
	// a double delete[]. The allocation is not visible from here - verify that
	// p_normalized_vectors is a separate array (it may be p_normalized_vectors_gpu
	// that aliases p_feature_vectors in the in-place mode).
#endif //SLIM
	delete[] p_vector_correlations;
	// host cleanup
	// NOTE(review): p_feature2_vectors, p_vector_correlations_gpu and p_normalized_vectors_gpu
	// are not freed here - confirm whether they are released elsewhere or alias other arrays

	return 0;
}
