/**
 *	@file test.cl
 *	@author -tHE SWINe-
 *	@brief testing OpenCL vector reduction
 */

//#define REDUCTION_LOCAL_WORK_SIZE 64 // defined by the compiler
// for kernels, processing short vectors

#pragma OPENCL EXTENSION cl_khr_fp64 : enable
// enables double precision support, in case any of the *_TYPE macros below is set to double

typedef SCALAR_TYPE _TyScalar; // type of the elements of the input vectors
typedef REDUCTION_ACCUM_TYPE _TyReductionAccum; // type of the partial results, both per work-item and in local memory (someone might consider using doubles here)
typedef REDUCTION_TYPE _TyReduction; // type of the final results, written to the output array
typedef FACTOR_TYPE _TyFactor; // not referenced in the visible part of this file
// the *_TYPE macros are not defined in this file, they are expected
// to be defined by the compiler (supplied at program build time)

/*#define ScaleElemOp ((x) / (y))

#define CrossCorelElemOp (((x) - (y)) * ((x) - (y)))
#define CrossCorelReduceOp ((x) + (y))
#define CrossCorelFinalOp ((_TyReduction)sqrt((float)x))

#define CorelElemOp (((x) - (y)) * ((x) - (y)))
#define CorelReduceOp ((x) + (y))
#define CorelFinalOp ((_TyReduction)sqrt((float)x))

#define ReductionElemOp ((x) * (x))
#define ReductionReduceOp ((x) + (y))
#define ReductionFinalOp ((_TyReduction)sqrt((float)x))*/
// defined by the compiler

/**
 *	@def n_TriangularSize
 *	@brief calculates the number of stored elements of a square
 *		triangular matrix (the diagonal is included)
 *	@param[in] n_cols is number of columns (and rows) of the matrix, in elements
 *	@note The argument is expanded twice, do not pass expressions with side effects.
 */
#define n_TriangularSize(n_cols) ((((n_cols) + 1) * (n_cols)) >> 1)

/**
 *	@def n_TriangularSize2
 *
 *	@brief calculates the number of stored elements in the first few rows
 *		of an upper triangular matrix (the diagonal is included)
 *
 *	@param[in] n_cols is number of columns of the (square) upper triangular matrix, in elements
 *	@param[in] n_rows is number of the leading rows to count, in elements, must not exceed n_cols
 *
 *	@note Both arguments are expanded twice, do not pass expressions with side effects.
 */
#define n_TriangularSize2(n_cols,n_rows) (((n_rows) * ((n_cols) + (n_cols) + 1 - (n_rows))) >> 1)

/**
 *	@def n_TriangularOffset
 *
 *	@brief calculates the offset of an element in a densely packed buffer
 *		storing an upper triangular matrix (the diagonal is included)
 *
 *	@param[in] n_row is zero-based index of the row, in elements
 *	@param[in] n_col is zero-based index of the column, in elements
 *	@param[in] n_cols is number of columns of the square triangular matrix, in elements
 *
 *	@note The part that depends on the matrix size can be precalculated,
 *		see n_TriangularOffset2().
 */
#define n_TriangularOffset(n_row,n_col,n_cols) ((((n_row) * ((n_cols) + (n_cols) - 1 - (n_row))) >> 1) + (n_col))

/**
 *	@def n_TriangularOffset2
 *
 *	@brief calculates the offset of an element in a densely packed buffer
 *		storing an upper triangular matrix (the diagonal is included),
 *		using a precalculated matrix size term
 *
 *	@param[in] n_row is zero-based index of the row, in elements
 *	@param[in] n_col is zero-based index of the column, in elements
 *	@param[in] n_2cols_m1 is twice the number of columns of the square
 *		triangular matrix (in elements), minus one
 *
 *	@note The part of the offset which only depends on the row
 *		can be calculated separately, see n_TriangularRowOffset().
 */
#define n_TriangularOffset2(n_row,n_col,n_2cols_m1) ((((n_row) * ((n_2cols_m1) - (n_row))) >> 1) + (n_col))

/**
 *	@def n_TriangularRowOffset
 *
 *	@brief calculates the row-dependent part of an element offset in a densely
 *		packed buffer storing an upper triangular matrix (the diagonal is included)
 *
 *	@param[in] n_row is zero-based index of the row, in elements
 *	@param[in] n_2cols_m1 is twice the number of columns of the square
 *		triangular matrix (in elements), minus one
 *
 *	@note To calculate an element offset, add the zero-based index of the column
 *		(the column index, not the position inside the stored part of the row).
 */
#define n_TriangularRowOffset(n_row,n_2cols_m1) (((n_row) * ((n_2cols_m1) - (n_row))) >> 1)

#ifndef WANT_SECOND_CONST

/**
 *	@brief reduces an array in local memory down to a single value
 *
 *	The array is repeatedly folded in halves: in each step, the work-item l
 *	combines the elements l and l + stride, for strides going from
 *	REDUCTION_LOCAL_WORK_SIZE / 2 down to 1.
 *
 *	@param[in] l is local id of the calling work-item
 *	@param[in,out] p_sh_mem is pointer to local memory with one partial result per
 *		work-item (only REDUCTION_LOCAL_WORK_SIZE elements are accessed)
 *
 *	@note The result is left in p_sh_mem[0], the other elements are overwritten.
 *	@note This version uses the ReductionReduceOp operation.
 *	@note This contains barriers if REDUCTION_LOCAL_WORK_SIZE is greater than 32,
 *		it must be called by all the work-items of the work-group.
 *	@note The steps with stride 32 and below are separated only by memory fences, not by
 *		barriers. This relies on the participating work-items running in lockstep
 *		(presumably a single warp of at least 32 work-items - confirm for the target device).
 *	@note REDUCTION_LOCAL_WORK_SIZE is assumed to be a power of two, the folding
 *		does not reach all the elements otherwise.
 */
inline void ReduceInLocalMemory(const unsigned int l, __local _TyReductionAccum *p_sh_mem)
{
	// folds the element l + n_fold_stride into the element l; x and y are the names
	// which the compiler-supplied ReductionReduceOp expression is expected to refer to
#define ReduceInLocalMemory_Fold(n_fold_stride) \
	do { \
		_TyReductionAccum x = p_sh_mem[l]; \
		_TyReductionAccum y = p_sh_mem[l + (n_fold_stride)]; \
		p_sh_mem[l] = (ReductionReduceOp); \
	} while(0)

#if REDUCTION_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256) {
		ReduceInLocalMemory_Fold(256);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128) {
		ReduceInLocalMemory_Fold(128);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64) {
		ReduceInLocalMemory_Fold(64);
	}
#endif
	// reduce down to 64 elements stored in the local memory, using barriers

	// the section below is meant to run within a single warp: to avoid divergence,
	// all the work-items that enter it perform all the remaining steps (even though
	// some of the reductions are unnecessary) and no barriers are used,
	// just memory fences to avoid compiler optimization
#if REDUCTION_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCTION_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCTION_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCTION_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCTION_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCTION_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // with two elements, the work-item 1 would read p_sh_mem[2], past the end of the array
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{ // a single element, nothing to fold
#endif
		// the guard above is half of the largest stride used below,
		// so that l + stride stays below REDUCTION_LOCAL_WORK_SIZE in every step
#if REDUCTION_LOCAL_WORK_SIZE >= 64
		ReduceInLocalMemory_Fold(32);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 32
		ReduceInLocalMemory_Fold(16);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 16
		ReduceInLocalMemory_Fold(8);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 8
		ReduceInLocalMemory_Fold(4);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 4
		ReduceInLocalMemory_Fold(2);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 2
		ReduceInLocalMemory_Fold(1);
#endif
	}
	// reduce results down to one result per work group

#undef ReduceInLocalMemory_Fold
}

/**
 *	@brief reduces an array in local memory down to a single value
 *
 *	The array is repeatedly folded in halves: in each step, the work-item l
 *	combines the elements l and l + stride, for strides going from
 *	REDUCTION_LOCAL_WORK_SIZE / 2 down to 1.
 *
 *	@param[in] l is local id of the calling work-item
 *	@param[in,out] p_sh_mem is pointer to local memory with one partial result per
 *		work-item (only REDUCTION_LOCAL_WORK_SIZE elements are accessed)
 *
 *	@note The result is left in p_sh_mem[0], the other elements are overwritten.
 *	@note This version uses the CorelReduceOp operation.
 *	@note This contains barriers if REDUCTION_LOCAL_WORK_SIZE is greater than 32,
 *		it must be called by all the work-items of the work-group.
 *	@note The steps with stride 32 and below are separated only by memory fences, not by
 *		barriers. This relies on the participating work-items running in lockstep
 *		(presumably a single warp of at least 32 work-items - confirm for the target device).
 *	@note REDUCTION_LOCAL_WORK_SIZE is assumed to be a power of two, the folding
 *		does not reach all the elements otherwise.
 */
inline void CorelReduceInLocalMemory(const unsigned int l, __local _TyReductionAccum *p_sh_mem)
{
	// folds the element l + n_fold_stride into the element l; x and y are the names
	// which the compiler-supplied CorelReduceOp expression is expected to refer to
#define CorelReduceInLocalMemory_Fold(n_fold_stride) \
	do { \
		_TyReductionAccum x = p_sh_mem[l]; \
		_TyReductionAccum y = p_sh_mem[l + (n_fold_stride)]; \
		p_sh_mem[l] = (CorelReduceOp); \
	} while(0)

#if REDUCTION_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256) {
		CorelReduceInLocalMemory_Fold(256);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128) {
		CorelReduceInLocalMemory_Fold(128);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64) {
		CorelReduceInLocalMemory_Fold(64);
	}
#endif
	// reduce down to 64 elements stored in the local memory, using barriers

	// the section below is meant to run within a single warp: to avoid divergence,
	// all the work-items that enter it perform all the remaining steps (even though
	// some of the reductions are unnecessary) and no barriers are used,
	// just memory fences to avoid compiler optimization
#if REDUCTION_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCTION_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCTION_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCTION_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCTION_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCTION_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // with two elements, the work-item 1 would read p_sh_mem[2], past the end of the array
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{ // a single element, nothing to fold
#endif
		// the guard above is half of the largest stride used below,
		// so that l + stride stays below REDUCTION_LOCAL_WORK_SIZE in every step
#if REDUCTION_LOCAL_WORK_SIZE >= 64
		CorelReduceInLocalMemory_Fold(32);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 32
		CorelReduceInLocalMemory_Fold(16);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 16
		CorelReduceInLocalMemory_Fold(8);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 8
		CorelReduceInLocalMemory_Fold(4);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 4
		CorelReduceInLocalMemory_Fold(2);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 2
		CorelReduceInLocalMemory_Fold(1);
#endif
	}
	// reduce results down to one result per work group

#undef CorelReduceInLocalMemory_Fold
}

/**
 *	@brief reduces an array in local memory down to a single value
 *
 *	The array is repeatedly folded in halves: in each step, the work-item l
 *	combines the elements l and l + stride, for strides going from
 *	REDUCTION_LOCAL_WORK_SIZE / 2 down to 1.
 *
 *	@param[in] l is local id of the calling work-item
 *	@param[in,out] p_sh_mem is pointer to local memory with one partial result per
 *		work-item (only REDUCTION_LOCAL_WORK_SIZE elements are accessed)
 *
 *	@note The result is left in p_sh_mem[0], the other elements are overwritten.
 *	@note This version uses the CrossCorelReduceOp operation.
 *	@note This contains barriers if REDUCTION_LOCAL_WORK_SIZE is greater than 32,
 *		it must be called by all the work-items of the work-group.
 *	@note The steps with stride 32 and below are separated only by memory fences, not by
 *		barriers. This relies on the participating work-items running in lockstep
 *		(presumably a single warp of at least 32 work-items - confirm for the target device).
 *	@note REDUCTION_LOCAL_WORK_SIZE is assumed to be a power of two, the folding
 *		does not reach all the elements otherwise.
 */
inline void CrossCorelReduceInLocalMemory(const unsigned int l, __local _TyReductionAccum *p_sh_mem)
{
	// folds the element l + n_fold_stride into the element l; x and y are the names
	// which the compiler-supplied CrossCorelReduceOp expression is expected to refer to
#define CrossCorelReduceInLocalMemory_Fold(n_fold_stride) \
	do { \
		_TyReductionAccum x = p_sh_mem[l]; \
		_TyReductionAccum y = p_sh_mem[l + (n_fold_stride)]; \
		p_sh_mem[l] = (CrossCorelReduceOp); \
	} while(0)

#if REDUCTION_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256) {
		CrossCorelReduceInLocalMemory_Fold(256);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128) {
		CrossCorelReduceInLocalMemory_Fold(128);
	}
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64) {
		CrossCorelReduceInLocalMemory_Fold(64);
	}
#endif
	// reduce down to 64 elements stored in the local memory, using barriers

	// the section below is meant to run within a single warp: to avoid divergence,
	// all the work-items that enter it perform all the remaining steps (even though
	// some of the reductions are unnecessary) and no barriers are used,
	// just memory fences to avoid compiler optimization
#if REDUCTION_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCTION_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCTION_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCTION_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCTION_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCTION_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // with two elements, the work-item 1 would read p_sh_mem[2], past the end of the array
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{ // a single element, nothing to fold
#endif
		// the guard above is half of the largest stride used below,
		// so that l + stride stays below REDUCTION_LOCAL_WORK_SIZE in every step
#if REDUCTION_LOCAL_WORK_SIZE >= 64
		CrossCorelReduceInLocalMemory_Fold(32);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 32
		CrossCorelReduceInLocalMemory_Fold(16);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 16
		CrossCorelReduceInLocalMemory_Fold(8);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 8
		CrossCorelReduceInLocalMemory_Fold(4);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 4
		CrossCorelReduceInLocalMemory_Fold(2);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCTION_LOCAL_WORK_SIZE >= 2
		CrossCorelReduceInLocalMemory_Fold(1);
#endif
	}
	// reduce results down to one result per work group

#undef CrossCorelReduceInLocalMemory_Fold
}

/**
 *	@brief simple vector reduction kernel, each kernel launch calculates several reductions
 *		of several vectors (each work-group reduces one vector at a time)
 *
 *	The work-group with id k processes the vectors n_first_vector + k,
 *	n_first_vector + k + get_num_groups(0), ... below n_last_vector. The work-items
 *	of the group stride over the vector elements, apply ReductionElemOp to each
 *	and accumulate using ReductionReduceOp, then the per-work-item results are
 *	reduced in local memory and ReductionFinalOp of the result is stored.
 *
 *	@param[out] p_length is the output array of reductions
 *		(indexed n_first_vector through n_last_vector)
 *	@param[in] p_vector is the array with dense vectors
 *		(indexed n_first_vector * n_vector_length through n_last_vector * n_vector_length)
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *
 *	@note The local memory array has REDUCTION_LOCAL_WORK_SIZE elements and is indexed
 *		by the local id; this assumes that the kernel is launched with the local
 *		work size equal to REDUCTION_LOCAL_WORK_SIZE.
 */
__kernel void VectorLengths_v3_multi(__global _TyReduction *p_length,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (one partial result per work-item)

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	const unsigned int n_ls = get_local_size(0);
	const unsigned int n_group_num = get_num_groups(0);
	// strides of the loop over the elements and of the loop over the vectors

	for(unsigned int g = get_group_id(0) + n_first_vector; g < n_last_vector; g += n_group_num) {
		// each work-group processes a single vector at a time; g only depends on the group id
		// and on the kernel arguments, so all the work-items of a group reach the same barriers

		_TyReductionAccum f_length2 = 0;
		{
			__global const _TyScalar *p_vec_ptr = p_vector + g * n_vector_length;
			for(unsigned int i = l; i < n_vector_length; i += n_ls) { // !! l might be greater than n_vector_length already
				_TyReductionAccum y;
				{
					_TyScalar x = p_vec_ptr[i];
					y = (_TyReductionAccum)(ReductionElemOp);
				}
				{
					_TyReductionAccum x = f_length2;
					f_length2 = (ReductionReduceOp);
				}
			}
		}
		// read (several) vector elements, accumulate them in a register

		barrier(CLK_LOCAL_MEM_FENCE);
		// the work-items that take no part in the last steps of the reduction leave it early;
		// make sure that the reduction of the previous vector has finished reading
		// the local memory before it is overwritten below

		p_sh_mem[l] = f_length2;
		// write result to shared memory for further reduction

		ReduceInLocalMemory(l, p_sh_mem);

		if(!l) {
			_TyReductionAccum x = p_sh_mem[0];
			p_length[g] = (ReductionFinalOp);
		}
		// the first work-item stores the result for this vector
	}
}

/**
 *	@brief calculates a tile of cross-corelation matrix
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Note that this includes vectors in both triangular parts, although the matrix
 *	may be symmetric.
 *
 *	The (vector g, vector h) pairs of the tile are distributed over the work-groups;
 *	the result for a pair is stored at p_length[n_length_off + (g - n_first_vector) +
 *	(h - n_first_vector2) * n_length_stride]. Nothing is done if the range of the
 *	first vectors is empty (n_last_vector <= n_first_vector).
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *
 *	@note The local memory array is indexed by the local id; this assumes that the kernel
 *		is launched with the local work size equal to REDUCTION_LOCAL_WORK_SIZE.
 *
 *	@t_odo Provide versions with two different vector sets (correlation instead of crosscorelation).
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegent way of storing diagonal matrix in dense array.
 *	@t_odo Provide a version that keeps one of the vectors in local memory for as long as possible.
 */
__kernel void VectorCorelN_to_N(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (one partial result per work-item)

	if(n_last_vector <= n_first_vector)
		return;
	// nothing to do; an empty range would divide by zero below and a reversed one
	// would never advance h (the condition only depends on the kernel arguments,
	// so all the work-items return together)

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the first elem written is on the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	const unsigned int n_ls = get_local_size(0);
	const unsigned int n_group_num = get_num_groups(0);
	// strides of the loop over the elements and of the loop over the vector pairs

	const unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process

	unsigned int g = n_first_vector + get_group_id(0) % m; // index of the first vector
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // index of the second vector
	// 2D loop counters (the same for all the work-items of a work-group,
	// so the barriers below are reached uniformly)

	while(h < n_last_vector2) {
		while(g < n_last_vector) {
			_TyReductionAccum f_length2 = 0;
			{
				__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
				__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // this assumes that a copy of vector h doesn't fit in local memory
				for(unsigned int i = l; i < n_vector_length; i += n_ls) { // !! l might be greater than n_vector_length already
					_TyReductionAccum z;
					{
						_TyScalar x = p_vec0_ptr[i];
						_TyScalar y = p_vec1_ptr[i];
						z = (_TyReductionAccum)(CrossCorelElemOp);
					}
					{
						_TyReductionAccum x = f_length2, y = z;
						f_length2 = (CrossCorelReduceOp);
					}
				}
			}
			// read (several) elements of both vectors, accumulate them in a register

			barrier(CLK_LOCAL_MEM_FENCE);
			// the work-items that take no part in the last steps of the reduction leave it early;
			// make sure that the reduction of the previous pair has finished reading
			// the local memory before it is overwritten below

			p_sh_mem[l] = f_length2;
			// write result to shared memory for further reduction

			CrossCorelReduceInLocalMemory(l, p_sh_mem);

			if(!l) {
				_TyReductionAccum x = p_sh_mem[0];
				p_length[g + n_length_stride * h] = (CrossCorelFinalOp);
			}
			// the first work-item stores the result for this pair

			g += n_group_num;
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of corelation matrix of two sets of vectors
 *
 *	This function calculates a tile of corelation matrix on vectors
 *	(n_first_vector - n_last_vector) of p_vector x
 *	(n_first_vector2 - n_last_vector2) of p_vector2.
 *
 *	The (vector g, vector h) pairs of the tile are distributed over the work-groups;
 *	the result for a pair is stored at p_length[n_length_off + (g - n_first_vector) +
 *	(h - n_first_vector2) * n_length_stride]. Nothing is done if the range of the
 *	first vectors is empty (n_last_vector <= n_first_vector).
 *
 *	@param[out] p_length is the corelation array
 *	@param[in] n_length_off is offset to the corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_vector_length is number of vector elements (the same in both arrays)
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] p_vector2 is pointer to the second dense vector array
 *	@param[in] n_first_vector2 is zero-based index of the first vector
 *		to be processed in the second array
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector
 *		to be processed in the second array
 *
 *	@note The local memory array is indexed by the local id; this assumes that the kernel
 *		is launched with the local work size equal to REDUCTION_LOCAL_WORK_SIZE.
 *
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegent way of storing diagonal matrix in dense array.
 *	@t_odo Provide a version that keeps one of the vectors in local memory for as long as possible.
 */
__kernel void VectorCorelN_to_M(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	__global const _TyScalar *p_vector2, const unsigned int n_first_vector2,
	const unsigned int n_last_vector2)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (one partial result per work-item)

	if(n_last_vector <= n_first_vector)
		return;
	// nothing to do; an empty range would divide by zero below and a reversed one
	// would never advance h (the condition only depends on the kernel arguments,
	// so all the work-items return together)

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the first elem written is on the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	const unsigned int n_ls = get_local_size(0);
	const unsigned int n_group_num = get_num_groups(0);
	// strides of the loop over the elements and of the loop over the vector pairs

	const unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process

	unsigned int g = n_first_vector + get_group_id(0) % m; // index of the vector in p_vector
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // index of the vector in p_vector2
	// 2D loop counters (the same for all the work-items of a work-group,
	// so the barriers below are reached uniformly)

	while(h < n_last_vector2) {
		while(g < n_last_vector) {
			_TyReductionAccum f_length2 = 0;
			{
				__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
				__global const _TyScalar *p_vec1_ptr = p_vector2 + h * n_vector_length; // this assumes that a copy of vector h doesn't fit in local memory
				for(unsigned int i = l; i < n_vector_length; i += n_ls) { // !! l might be greater than n_vector_length already
					_TyReductionAccum z;
					{
						_TyScalar x = p_vec0_ptr[i];
						_TyScalar y = p_vec1_ptr[i];
						z = (_TyReductionAccum)(CorelElemOp);
					}
					{
						_TyReductionAccum x = f_length2, y = z;
						f_length2 = (CorelReduceOp);
					}
				}
			}
			// read (several) elements of both vectors, accumulate them in a register

			barrier(CLK_LOCAL_MEM_FENCE);
			// the work-items that take no part in the last steps of the reduction leave it early;
			// make sure that the reduction of the previous pair has finished reading
			// the local memory before it is overwritten below

			p_sh_mem[l] = f_length2;
			// write result to shared memory for further reduction

			CorelReduceInLocalMemory(l, p_sh_mem);

			if(!l) {
				_TyReductionAccum x = p_sh_mem[0];
				p_length[g + n_length_stride * h] = (CorelFinalOp);
			}
			// the first work-item stores the result for this pair

			g += n_group_num;
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of upper-triangular part of cross-corelation matrix
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Only the (vector g, vector h) pairs with h <= g are calculated, the elements
 *	of the tile that correspond to the remaining pairs are not written.
 *
 *	The result for a pair is stored at p_length[n_length_off + (g - n_first_vector) +
 *	(h - n_first_vector2) * n_length_stride] (the full rectangular layout is used).
 *	Nothing is done if the range of the first vectors is empty
 *	(n_last_vector <= n_first_vector).
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *
 *	@note The local memory array is indexed by the local id; this assumes that the kernel
 *		is launched with the local work size equal to REDUCTION_LOCAL_WORK_SIZE.
 *
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegent way of storing diagonal matrix in dense array.
 *	@t_odo Provide a version that keeps one of the vectors in local memory for as long as possible.
 */
__kernel void VectorCorelN_to_N_upper(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (one partial result per work-item)

	if(n_last_vector <= n_first_vector)
		return;
	// nothing to do; an empty range would divide by zero below and a reversed one
	// would never advance h (the condition only depends on the kernel arguments,
	// so all the work-items return together)

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the first elem written is on the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	const unsigned int n_ls = get_local_size(0);
	const unsigned int n_group_num = get_num_groups(0);
	// strides of the loop over the elements and of the loop over the vector pairs

	const unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process

	unsigned int g = n_first_vector + get_group_id(0) % m; // index of the first vector (column)
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // index of the second vector (row)
	// 2D loop counters (the same for all the work-items of a work-group,
	// so the branch and the barriers below are taken uniformly)

	while(h < n_last_vector2) {
		while(g < n_last_vector) {
			if(h <= g) { // only the diagonal and the elements to the right of it
				_TyReductionAccum f_length2 = 0;
				{
					__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
					__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // this assumes that a copy of vector h doesn't fit in local memory
					for(unsigned int i = l; i < n_vector_length; i += n_ls) { // !! l might be greater than n_vector_length already
						_TyReductionAccum z;
						{
							_TyScalar x = p_vec0_ptr[i];
							_TyScalar y = p_vec1_ptr[i];
							z = (_TyReductionAccum)(CrossCorelElemOp);
						}
						{
							_TyReductionAccum x = f_length2, y = z;
							f_length2 = (CrossCorelReduceOp);
						}
					}
				}
				// read (several) elements of both vectors, accumulate them in a register

				barrier(CLK_LOCAL_MEM_FENCE);
				// the work-items that take no part in the last steps of the reduction leave it early;
				// make sure that the reduction of the previous pair has finished reading
				// the local memory before it is overwritten below

				p_sh_mem[l] = f_length2;
				// write result to shared memory for further reduction

				CrossCorelReduceInLocalMemory(l, p_sh_mem);

				if(!l) {
					_TyReductionAccum x = p_sh_mem[0];
					p_length[g + n_length_stride * h] = (CrossCorelFinalOp);
				}
				// the first work-item stores the result for this pair
			}

			g += n_group_num;
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of upper-triangular part of cross-corelation matrix
 *		where only the upper triangular part is stored
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Only the (vector g, vector h) pairs with h <= g are calculated and stored.
 *
 *	The output uses the densely packed upper triangular layout of a matrix with
 *	n_length_stride columns (see n_TriangularOffset2()); the element which corresponds
 *	to the pair (n_first_vector, n_first_vector2) is at p_length[n_length_off].
 *	Nothing is done if the range of the first vectors is empty
 *	(n_last_vector <= n_first_vector).
 *
 *	@param[out] p_length is the cross-corelation array storing only the upper triangle
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the number of columns of the (square) cross-corelation matrix
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *
 *	@note The local memory array is indexed by the local id; this assumes that the kernel
 *		is launched with the local work size equal to REDUCTION_LOCAL_WORK_SIZE.
 *
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegent way of storing diagonal matrix in dense array.
 *	@t_odo Provide a version that keeps one of the vectors in local memory for as long as possible.
 */
__kernel void VectorCorelN_to_N_upper_packed(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (one partial result per work-item)

	if(n_last_vector <= n_first_vector)
		return;
	// nothing to do; an empty range would divide by zero below and a reversed one
	// would never advance h (the condition only depends on the kernel arguments,
	// so all the work-items return together)

	p_length += n_length_off;
	// shift by offset

	const unsigned int _2n1 = 2 * n_length_stride - 1;
	// precalculate for determining row offsets

	p_length -= n_TriangularOffset2(n_first_vector2, n_first_vector, _2n1);
	// shift back so that the first elem written is on the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	const unsigned int n_ls = get_local_size(0);
	const unsigned int n_group_num = get_num_groups(0);
	// strides of the loop over the elements and of the loop over the vector pairs

	const unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process

	unsigned int g = n_first_vector + get_group_id(0) % m; // index of the first vector (column)
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // index of the second vector (row)
	// 2D loop counters (the same for all the work-items of a work-group,
	// so the branch and the barriers below are taken uniformly)

	while(h < n_last_vector2) {
		const unsigned int n_store_off = n_TriangularRowOffset(h, _2n1);
		// calculate an offset where to store the elements of the row h
		// (the column index g is added to it when storing)

		while(g < n_last_vector) {
			if(h <= g) { // only the diagonal and the elements to the right of it are stored
				_TyReductionAccum f_length2 = 0;
				{
					__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
					__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // this assumes that a copy of vector h doesn't fit in local memory
					for(unsigned int i = l; i < n_vector_length; i += n_ls) { // !! l might be greater than n_vector_length already
						_TyReductionAccum z;
						{
							_TyScalar x = p_vec0_ptr[i];
							_TyScalar y = p_vec1_ptr[i];
							z = (_TyReductionAccum)(CrossCorelElemOp);
						}
						{
							_TyReductionAccum x = f_length2, y = z;
							f_length2 = (CrossCorelReduceOp);
						}
					}
				}
				// read (several) elements of both vectors, accumulate them in a register

				barrier(CLK_LOCAL_MEM_FENCE);
				// the work-items that take no part in the last steps of the reduction leave it early;
				// make sure that the reduction of the previous pair has finished reading
				// the local memory before it is overwritten below

				p_sh_mem[l] = f_length2;
				// write result to shared memory for further reduction

				CrossCorelReduceInLocalMemory(l, p_sh_mem);

				if(!l) {
					_TyReductionAccum x = p_sh_mem[0];
					p_length[n_store_off + g] = (CrossCorelFinalOp);
				}
				// the first work-item stores the result for this pair
			}

			g += n_group_num;
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of cross-corelation matrix with second vector storage
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Note that this includes vectors in both triangular parts, although the matrix
 *	may be symmetric.
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *	@param[in,out] p_vector2 is space to store the second vector during the computation
 *
 *	@t_odo Provide versions with two different vector sets (correlation instead of crosscorelation).
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegent way of storing diagonal matrix in dense array.
 */
__kernel void VectorCorelN_to_N_v2(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2,
	__local _TyScalar *p_vector2)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (each work-item stores its partial result at p_sh_mem[l])

	if(n_last_vector <= n_first_vector)
		return;
	// an empty (or inverted) range of vectors means there is nothing to do; bail out
	// before m is used as a divisor below (with m = 0 the mapping of groups to the
	// tile is unspecified and h might never advance). the test depends only on
	// kernel arguments, so either all the work-items leave here or none does

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the element (n_first_vector, n_first_vector2) is written at the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process (m is nonzero here)

	unsigned int g = n_first_vector + get_group_id(0) % m; // n_first_vector to n_last_vector - 1
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // n_first_vector2 and up (checked by the loop)
	// 2D loop counters; g selects the vector read from global memory,
	// h selects the vector cached in p_vector2 (and the output line)

	while(h < n_last_vector2) {
		//copy(p_vector2, p_vector + h * n_vector_length, n_vector_length * sizeof(_TyScalar));
		{
			__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // todo - might want to align indices for coalesced reads (applies to strange-sized vectors)
			unsigned int i = l; // local id
			const unsigned int n_ls = get_local_size(0);
			while(i < n_vector_length) { // !! i might be greater than n_vector_length already
				p_vector2[i] = p_vec1_ptr[i];
				i += n_ls;
			}
		}
		// copies one vector data to local memory
		// this assumes that a copy of vector h does fit in local memory (SIFTs should)
		// note that each work-item only ever reads back the elements it wrote itself
		// (the loop below uses the same start index and stride), so no barrier is needed here

		while(g < n_last_vector) {
			_TyReductionAccum f_length2 = 0;
			{
				__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
				__local const _TyScalar *p_vec1_ptr = p_vector2;
				unsigned int i = l; // local id
				const unsigned int n_ls = get_local_size(0);
				while(i < n_vector_length) { // !! i might be greater than n_vector_length already
					_TyReductionAccum z;
					{
						_TyScalar x = p_vec0_ptr[i];
						_TyScalar y = p_vec1_ptr[i];
						z = (_TyReductionAccum)(CrossCorelElemOp); // the macro refers to x and y by name
					}
					{
						_TyReductionAccum x = f_length2, y = z;
						f_length2 = (CrossCorelReduceOp); // the macro refers to x and y by name
					}
					i += n_ls;
				}
			}
			// read (several) vector elements, accumulate the per-element results

			p_sh_mem[l] = f_length2;
			// write result to shared memory for further reduction

			CrossCorelReduceInLocalMemory(l, p_sh_mem);
			// defined elsewhere; presumably leaves the reduced value in p_sh_mem[0]
			// (that is the only element read below) - TODO confirm

			if(!l) {
				_TyReductionAccum x = p_sh_mem[0];
				p_length[g + n_length_stride * h] = (CrossCorelFinalOp); // the macro refers to x by name
			}
			// the first work-item stores the result for the pair (g, h)

			g += get_num_groups(0);
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of corelation matrix with second vector storage
 *
 *	This function calculates a tile of corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Note that this includes vectors in both triangular parts, although the matrix
 *	may be symmetric.
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] p_vector2 is pointer to the second dense vector array
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *	@param[in,out] p_vec_cache is space to store the second vector during the computation
 *
 *	@t_odo Provide versions with two different vector sets (correlation instead of crosscorelation).
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegant way of storing diagonal matrix in dense array.
 */
__kernel void VectorCorelN_to_M_v2(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	__global const _TyScalar *p_vector2, const unsigned int n_first_vector2,
	const unsigned int n_last_vector2, __local _TyScalar *p_vec_cache)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (each work-item stores its partial result at p_sh_mem[l])

	if(n_last_vector <= n_first_vector)
		return;
	// an empty (or inverted) range of vectors means there is nothing to do; bail out
	// before m is used as a divisor below (with m = 0 the mapping of groups to the
	// tile is unspecified and h might never advance). the test depends only on
	// kernel arguments, so either all the work-items leave here or none does

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the element (n_first_vector, n_first_vector2) is written at the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process (m is nonzero here)

	unsigned int g = n_first_vector + get_group_id(0) % m; // n_first_vector to n_last_vector - 1
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // n_first_vector2 and up (checked by the loop)
	// 2D loop counters; g selects the vector read from p_vector,
	// h selects the vector of p_vector2 cached in p_vec_cache (and the output line)

	while(h < n_last_vector2) {
		//copy(p_vec_cache, p_vector2 + h * n_vector_length, n_vector_length * sizeof(_TyScalar));
		{
			__global const _TyScalar *p_vec1_ptr = p_vector2 + h * n_vector_length; // todo - might want to align indices for coalesced reads (applies to strange-sized vectors)
			unsigned int i = l; // local id
			const unsigned int n_ls = get_local_size(0);
			while(i < n_vector_length) { // !! i might be greater than n_vector_length already
				p_vec_cache[i] = p_vec1_ptr[i];
				i += n_ls;
			}
		}
		// copies one vector data to local memory
		// this assumes that a copy of vector h does fit in local memory (SIFTs should)
		// note that each work-item only ever reads back the elements it wrote itself
		// (the loop below uses the same start index and stride), so no barrier is needed here

		while(g < n_last_vector) {
			_TyReductionAccum f_length2 = 0;
			{
				__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
				__local const _TyScalar *p_vec1_ptr = p_vec_cache;
				unsigned int i = l; // local id
				const unsigned int n_ls = get_local_size(0);
				while(i < n_vector_length) { // !! i might be greater than n_vector_length already
					_TyReductionAccum z;
					{
						_TyScalar x = p_vec0_ptr[i];
						_TyScalar y = p_vec1_ptr[i];
						z = (_TyReductionAccum)(CorelElemOp); // the macro refers to x and y by name
					}
					{
						_TyReductionAccum x = f_length2, y = z;
						f_length2 = (CorelReduceOp); // the macro refers to x and y by name
					}
					i += n_ls;
				}
			}
			// read (several) vector elements, accumulate the per-element results

			p_sh_mem[l] = f_length2;
			// write result to shared memory for further reduction

			CorelReduceInLocalMemory(l, p_sh_mem);
			// defined elsewhere; presumably leaves the reduced value in p_sh_mem[0]
			// (that is the only element read below) - TODO confirm

			if(!l) {
				_TyReductionAccum x = p_sh_mem[0];
				p_length[g + n_length_stride * h] = (CorelFinalOp); // the macro refers to x by name
			}
			// the first work-item stores the result for the pair (g, h)

			g += get_num_groups(0);
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates the upper triangular part of a tile of cross-corelation
 *		matrix with second vector storage
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Only the elements where the index of the second vector does not exceed
 *	the index of the first vector (the diagonal and above) are calculated
 *	and written, the remaining elements of the output array are not touched.
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *	@param[in,out] p_vec_cache is space to store the second vector during the computation
 *
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegant way of storing diagonal matrix in dense array.
 */
__kernel void VectorCorelN_to_N_v2_upper(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2,
	__local _TyScalar *p_vec_cache)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (each work-item stores its partial result at p_sh_mem[l])

	if(n_last_vector <= n_first_vector)
		return;
	// an empty (or inverted) range of vectors means there is nothing to do; bail out
	// before m is used as a divisor below (with m = 0 the mapping of groups to the
	// tile is unspecified and h might never advance). the test depends only on
	// kernel arguments, so either all the work-items leave here or none does

	p_length += n_length_off;
	// shift by offset

	p_length -= n_first_vector + n_first_vector2 * n_length_stride;
	// shift back so that the element (n_first_vector, n_first_vector2) is written at the index 0

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process (m is nonzero here)

	unsigned int g = n_first_vector + get_group_id(0) % m; // n_first_vector to n_last_vector - 1
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // n_first_vector2 and up (checked by the loop)
	// 2D loop counters; g selects the vector read from global memory,
	// h selects the vector cached in p_vec_cache (and the output line)

	while(h < n_last_vector2) {
		//copy(p_vec_cache, p_vector + h * n_vector_length, n_vector_length * sizeof(_TyScalar));
		{
			__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // todo - might want to align indices for coalesced reads (applies to strange-sized vectors)
			unsigned int i = l; // local id
			const unsigned int n_ls = get_local_size(0);
			while(i < n_vector_length) { // !! i might be greater than n_vector_length already
				p_vec_cache[i] = p_vec1_ptr[i];
				i += n_ls;
			}
		}
		// copies one vector data to local memory
		// this assumes that a copy of vector h does fit in local memory (SIFTs should)
		// note that each work-item only ever reads back the elements it wrote itself
		// (the loop below uses the same start index and stride), so no barrier is needed here

		while(g < n_last_vector) {
			if(h <= g) {
				// only the diagonal and the elements above it are calculated; g and h are
				// derived from the group id and the kernel arguments only, so the whole
				// work-group takes the same branch and reaches the reduction together

				_TyReductionAccum f_length2 = 0;
				{
					__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
					__local const _TyScalar *p_vec1_ptr = p_vec_cache;
					unsigned int i = l; // local id
					const unsigned int n_ls = get_local_size(0);
					while(i < n_vector_length) { // !! i might be greater than n_vector_length already
						_TyReductionAccum z;
						{
							_TyScalar x = p_vec0_ptr[i];
							_TyScalar y = p_vec1_ptr[i];
							z = (_TyReductionAccum)(CrossCorelElemOp); // the macro refers to x and y by name
						}
						{
							_TyReductionAccum x = f_length2, y = z;
							f_length2 = (CrossCorelReduceOp); // the macro refers to x and y by name
						}
						i += n_ls;
					}
				}
				// read (several) vector elements, accumulate the per-element results

				p_sh_mem[l] = f_length2;
				// write result to shared memory for further reduction

				CrossCorelReduceInLocalMemory(l, p_sh_mem);
				// defined elsewhere; presumably leaves the reduced value in p_sh_mem[0]
				// (that is the only element read below) - TODO confirm

				if(!l) {
					_TyReductionAccum x = p_sh_mem[0];
					p_length[g + n_length_stride * h] = (CrossCorelFinalOp); // the macro refers to x by name
				}
				// the first work-item stores the result for the pair (g, h)
			}

			g += get_num_groups(0);
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief calculates a tile of cross-corelation matrix with second vector storage
 *		where only the upper triangular part of the matrix is stored
 *
 *	This function calculates a tile of cross-corelation matrix on vectors
 *	(n_first_vector - n_last_vector) x (n_first_vector2 - n_last_vector2).
 *	Only the elements where the index of the second vector does not exceed
 *	the index of the first vector (the diagonal and above) are calculated
 *	and written to the packed output array.
 *
 *	@param[out] p_length is the cross-corelation array
 *	@param[in] n_length_off is offset to the cross-corelation array where to put output
 *	@param[in] n_length_stride is the length of each line in the cross-corelation array
 *	@param[in] p_vector is pointer to the dense vector array
 *	@param[in] n_first_vector is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector is zero-based index of one past the last vector to be processed
 *	@param[in] n_vector_length is number of vector elements
 *	@param[in] n_first_vector2 is zero-based index of the first vector to be processed
 *	@param[in] n_last_vector2 is zero-based index of one past the last vector to be processed
 *	@param[in,out] p_vec_cache is space to store the second vector during the computation
 *
 *	@t_odo Provide version for symmetric matrix (some workgroups just write null and skip quickly).
 *	@t_odo See if there is an elegant way of storing diagonal matrix in dense array.
 */
__kernel void VectorCorelN_to_N_v2_upper_pack(__global _TyReduction *p_length,
	const unsigned int n_length_off, const unsigned int n_length_stride,
	__global const _TyScalar *p_vector, const unsigned int n_first_vector,
	const unsigned int n_vector_length, const unsigned int n_last_vector,
	const unsigned int n_first_vector2, const unsigned int n_last_vector2,
	__local _TyScalar *p_vec_cache)
{
	__local _TyReductionAccum p_sh_mem[REDUCTION_LOCAL_WORK_SIZE];
	// local memory known at compile time (each work-item stores its partial result at p_sh_mem[l])

	if(n_last_vector <= n_first_vector)
		return;
	// an empty (or inverted) range of vectors means there is nothing to do; bail out
	// before m is used as a divisor below (with m = 0 the mapping of groups to the
	// tile is unspecified and h might never advance). the test depends only on
	// kernel arguments, so either all the work-items leave here or none does

	p_length += n_length_off;
	// shift by offset

	const unsigned int _2n1 = 2 * n_length_stride - 1;
	// precalculate for determining row offsets

	p_length -= n_TriangularOffset2(n_first_vector2, n_first_vector, _2n1);
	// shift back so that the first elem written is on the index 0
	// (n_TriangularOffset2 is defined elsewhere; presumably it equals
	// n_TriangularRowOffset(row, _2n1) + column as used for the stores below - TODO confirm)

	const unsigned int l = get_local_id(0);
	// get local thread id (each thread processes (several) vector elements)

	unsigned int m = n_last_vector - n_first_vector;
	//unsigned int n = n_last_vector2 - n_first_vector2; // unused
	// get dimension of the 2D tile we want to process (m is nonzero here)

	unsigned int g = n_first_vector + get_group_id(0) % m; // n_first_vector to n_last_vector - 1
	unsigned int h = n_first_vector2 + get_group_id(0) / m; // n_first_vector2 and up (checked by the loop)
	// 2D loop counters; g selects the vector read from global memory,
	// h selects the vector cached in p_vec_cache (and the output row)

	while(h < n_last_vector2) {
		unsigned int n_store_off = n_TriangularRowOffset(h, _2n1);
		// calculate an offset where to store the elements of row h

		//copy(p_vec_cache, p_vector + h * n_vector_length, n_vector_length * sizeof(_TyScalar));
		{
			__global const _TyScalar *p_vec1_ptr = p_vector + h * n_vector_length; // todo - might want to align indices for coalesced reads (applies to strange-sized vectors)
			unsigned int i = l; // local id
			const unsigned int n_ls = get_local_size(0);
			while(i < n_vector_length) { // !! i might be greater than n_vector_length already
				p_vec_cache[i] = p_vec1_ptr[i];
				i += n_ls;
			}
		}
		// copies one vector data to local memory
		// this assumes that a copy of vector h does fit in local memory (SIFTs should)
		// note that each work-item only ever reads back the elements it wrote itself
		// (the loop below uses the same start index and stride), so no barrier is needed here

		while(g < n_last_vector) {
			if(h <= g) {
				// only the diagonal and the elements above it are calculated; g and h are
				// derived from the group id and the kernel arguments only, so the whole
				// work-group takes the same branch and reaches the reduction together

				_TyReductionAccum f_length2 = 0;
				{
					__global const _TyScalar *p_vec0_ptr = p_vector + g * n_vector_length;
					__local const _TyScalar *p_vec1_ptr = p_vec_cache;
					unsigned int i = l; // local id
					const unsigned int n_ls = get_local_size(0);
					while(i < n_vector_length) { // !! i might be greater than n_vector_length already
						_TyReductionAccum z;
						{
							_TyScalar x = p_vec0_ptr[i];
							_TyScalar y = p_vec1_ptr[i];
							z = (_TyReductionAccum)(CrossCorelElemOp); // the macro refers to x and y by name
						}
						{
							_TyReductionAccum x = f_length2, y = z;
							f_length2 = (CrossCorelReduceOp); // the macro refers to x and y by name
						}
						i += n_ls;
					}
				}
				// read (several) vector elements, accumulate the per-element results

				p_sh_mem[l] = f_length2;
				// write result to shared memory for further reduction

				CrossCorelReduceInLocalMemory(l, p_sh_mem);
				// defined elsewhere; presumably leaves the reduced value in p_sh_mem[0]
				// (that is the only element read below) - TODO confirm

				if(!l) {
					_TyReductionAccum x = p_sh_mem[0];
					p_length[n_store_off + g] = (CrossCorelFinalOp); // the macro refers to x by name
				}
				// the first work-item stores the result for the pair (g, h)
			}

			g += get_num_groups(0);
			// go to another set of vectors
		}

		g -= n_first_vector; // must be zero-based for this to work
		h += g / m; // at least one (do *not* try to simplify, you will screw up easily)
		g %= m; // move back
		g += n_first_vector;
		// go to another set of vectors (note that simpler code would suffice if get_num_groups(0) < 2 * m)
	}
}

/**
 *	@brief coalesced vector scaling routine
 *
 */
__kernel void ScaleVectors_v2_const_NPOT(__constant const _TyFactor *p_length,
	const unsigned int n_first_length, __global _TyScalar *p_vector_dst,
	__global const _TyScalar *p_vector_src, const unsigned int n_first_vector_elem,
	const unsigned int n_vector_length, const unsigned int n_last_vector_elem)
{
	const unsigned int n_step = get_global_size(0);
	// every work-item advances by the global size; the original author notes this is
	// a multiple of the local work size, which should keep the reads coalesced

	for(unsigned int i = n_first_vector_elem + get_global_id(0);
	   i < n_last_vector_elem; i += n_step) {
		_TyScalar x = p_vector_src[i];
		// the element to be scaled

		const unsigned int n_factor = (i / n_vector_length) - n_first_length;
		// index of the vector this element belongs to, relative to the first factor
		// (costs an integer division per element)

		_TyReduction y = p_length[n_factor];
		// the scaling factor of that vector

		p_vector_dst[i] = (ScaleElemOp);
		// the macro combines x and y (it refers to them by name)
	}
}

#else //WANT_SECOND_CONST

/**
 *	@brief coalesced vector scaling routine for power-of-two dimensional vector
 *
 */
__kernel void ScaleVectors_v2_const_POT(__constant const _TyFactor *p_length,
	const unsigned int n_first_length, __global _TyScalar *p_vector_dst,
	__global const _TyScalar *p_vector_src, const unsigned int n_first_vector_elem,
	const unsigned int n_vector_length_log2, const unsigned int n_last_vector_elem)
{
	const unsigned int n_step = get_global_size(0);
	// every work-item advances by the global size; the original author notes this is
	// a multiple of the local work size, which should keep the reads coalesced

	for(unsigned int i = n_first_vector_elem + get_global_id(0);
	   i < n_last_vector_elem; i += n_step) {
		_TyScalar x = p_vector_src[i];
		// the element to be scaled

		const unsigned int n_factor = (i >> n_vector_length_log2) - n_first_length;
		// index of the vector this element belongs to, relative to the first factor
		// (a shift replaces the division as the vector length is a power of two)

		_TyReduction y = p_length[n_factor];
		// the scaling factor of that vector

		p_vector_dst[i] = (ScaleElemOp);
		// the macro combines x and y (it refers to them by name)
	}
}

#endif //WANT_SECOND_CONST
