#ifndef __CL_KERNEL_INTEGER_UNIT_TESTS_INCLUDED
#define __CL_KERNEL_INTEGER_UNIT_TESTS_INCLUDED

/**
 *	@file gpgpu/kernel_utils/Integer_UnitTests.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief unit tests for bit hacks found in gpgpu/kernel_utils/Integer.h
 *
 *	You can use:
 *	@code
 *	bool Integer_UnitTests(cl_context h_context, cl_device_id h_device)
 *	{
 *		CCLUniqueProgram program(h_context,
 *			"../UberLame_src/gpgpu/kernel_utils/Integer_UnitTests.h",
 *			CCLUniqueProgram::from_file, "-I ../UberLame_src/gpgpu/kernel_utils");
 *		if(!program.b_Status())
 *			return false;
 *		// build kernel
 *
 *		CCLContextWrapper context(h_context);
 *		CCLUniqueKernel h_ut(program.h_Get_Kernel("BitHacks_UnitTests"));
 *		CCLUniqueKernel h_ut64(program.h_Get_Kernel("BitHacks_UnitTests64"));
 *		const int n_string_size = 1048576;
 *		CCLUniqueMem dp_string_buffer(context.h_CreateBuffer(n_string_size * sizeof(char)));
 *		uint32_t n_string_length = 0;
 *		CCLUniqueMem dp_string_ptr(context.h_CreateBuffer(sizeof(uint32_t),
 *			&n_string_length, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR));
 *		CCLUniqueCommandqueue cmd_queue(clCreateCommandQueue(h_context,
 *			h_device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0));
 *
 *		const uint32_t n_batch_size = 1048576 * 10; // "* 100" is making the mouse stuck
 *		printf("now 32-bit:\n");
 *		for(uint32_t i = 0;; i = min(i, UINT32_MAX - n_batch_size) + n_batch_size) { // saturated arithmetics
 *			printf("%u\r", i);
 *			int n_batch = min(UINT32_MAX - i, n_batch_size);
 *			int n_local_size = 256;
 *			int n_global_size = n_local_size * 64;
 *			_ASSERTE(!(n_batch % n_global_size)); // must be divisible
 *			CLresult n_result = clCall1D(cmd_queue, h_ut, n_global_size, n_local_size,
 *				dp_string_buffer, dp_string_ptr, n_string_size, int(i), n_batch / n_global_size);
 *			if(n_result != cl_Success) {
 *				fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *				return false;
 *			}
 *			n_result = cmd_queue.n_Finish();
 *			if(n_result != cl_Success) {
 *				fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *				return false;
 *			}
 *			n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&n_string_length, dp_string_ptr, 0, sizeof(uint32_t));
 *			if(n_result != cl_Success) {
 *				fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *				return false;
 *			}
 *			if(n_string_length) {
 *				if(n_string_length > n_string_size - 1024) {
 *					fprintf(stderr, "warning: there may not be enough space in the printf buffer\n");
 *					n_string_length = min(int(n_string_length), n_string_size);
 *				}
 *				std::string s_string;
 *				s_string.resize(n_string_length);
 *				n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&s_string[0], dp_string_buffer, 0,
 *					n_string_length * sizeof(char));
 *				if(n_result != cl_Success) {
 *					fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *					return false;
 *				}
 *				n_string_length = 0;
 *				n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(dp_string_ptr, 0, &n_string_length, sizeof(uint32_t));
 *				if(n_result != cl_Success) {
 *					fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *					return false;
 *				}
 *				printf("%s", s_string.c_str());
 *			}
 *			if(i == UINT32_MAX)
 *				break;
 *		}
 *		for(int n_pass = 0; n_pass < 2; ++ n_pass) {
 *			printf("now 64-bit (%s):\n", (n_pass)? "high range" : "low range");
 *			uint64_t n_max = (n_pass)? UINT64_MAX : (uint64_t(UINT32_MAX) * 4);
 *			uint64_t n_min = (n_pass)? UINT64_MAX - UINT32_MAX : 0;
 *			for(uint64_t i = n_min;; i = min(i, n_max - n_batch_size) + n_batch_size) { // saturated arithmetics
 *				printf("%" PRIu64 "\r", i);
 *				int n_batch = (int)min(UINT64_MAX - i, uint64_t(n_batch_size));
 *				int n_local_size = 256;
 *				int n_global_size = n_local_size * 64;
 *				_ASSERTE(!(n_batch % n_global_size)); // must be divisible
 *				CLresult n_result = clCall1D(cmd_queue, h_ut64, n_global_size, n_local_size,
 *					dp_string_buffer, dp_string_ptr, n_string_size, int(i & 0xffffffffU), int(i >> 32), n_batch / n_global_size);
 *				if(n_result != cl_Success) {
 *					fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *					return false;
 *				}
 *				n_result = cmd_queue.n_Finish();
 *				if(n_result != cl_Success) {
 *					fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *					return false;
 *				}
 *				n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&n_string_length, dp_string_ptr, 0, sizeof(uint32_t));
 *				if(n_result != cl_Success) {
 *					fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *					return false;
 *				}
 *				if(n_string_length) {
 *					if(n_string_length > n_string_size - 1024) {
 *						fprintf(stderr, "warning: there may not be enough space in the printf buffer\n");
 *						n_string_length = min(int(n_string_length), n_string_size);
 *					}
 *					std::string s_string;
 *					s_string.resize(n_string_length);
 *					n_result = cmd_queue.n_Enqueue_Memcpy_DtoH(&s_string[0], dp_string_buffer, 0,
 *						n_string_length * sizeof(char));
 *					if(n_result != cl_Success) {
 *						fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *						return false;
 *					}
 *					n_string_length = 0;
 *					n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(dp_string_ptr, 0, &n_string_length, sizeof(uint32_t));
 *					if(n_result != cl_Success) {
 *						fprintf(stderr, "OpenCL error: %d in %s %d\n", n_result, __FILE__, __LINE__);
 *						return false;
 *					}
 *					printf("%s", s_string.c_str());
 *				}
 *				if(i == n_max)
 *					break;
 *			}
 *		}
 *
 *		return true;
 *	}
 *	@endcode
 */

#include "Integer.h"

#pragma OPENCL EXTENSION cl_khr_fp64: enable
// needs double log()

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
// needs atomics for my_printf1 emulation

/**
 *	@brief output string stream, used by the my_printf1() printf emulation
 *
 *	The stream is a character buffer in global memory, along with a write
 *	cursor that my_printf1() advances atomically in order to reserve space
 *	for its output.
 */
typedef struct {
	__global char *p_s_dest; /**< @brief pointer to the beginning of the output character buffer */
	__global uint32_t *p_ptr; /**< @brief pointer to the write cursor (offset of the next free character in p_s_dest), advanced using atom_add() */
	uint32_t n_max_size; /**< @brief size of the output buffer, in characters; nothing is written at or beyond this offset */
} TStringStream;

inline uint32_t numlen(int n_value)
{
	uint32_t n_number_length = (n_value < 0)? 2 : 1; // zero has one digit, negative numbers have an extra digit
	for(int v = abs(n_value); v >= 10; v /= 10)
		++ n_number_length;
	// calculate the length of the number

	return n_number_length;
}

inline uint32_t numlen_u(unsigned int n_value)
{
	uint32_t n_number_length = 1; // zero has one digit
	for(unsigned int v = n_value; v >= 10; v /= 10)
		++ n_number_length;
	// calculate the length of the number

	return n_number_length;
}

/**
 *	@brief minimal printf() emulation with a single integer argument
 *
 *	The length of the output is calculated first and the space for it is
 *	reserved by a single atomic add on the stream write cursor, so that
 *	outputs of concurrently running work-items do not overlap.
 *
 *	@param[in] ostream is the output stream to write to
 *	@param[in] p_s_pattern is null-terminated format string; "%d" prints n_value
 *		as a signed decimal number, "%u" prints n_value reinterpreted as an unsigned
 *		decimal number, any other characters are copied to the output verbatim
 *	@param[in] n_value is the value to be printed (may be referenced in the
 *		pattern any number of times)
 *
 *	@return Returns true if the whole output was written, false if it did
 *		not fit in the buffer (the output is then truncated).
 *
 *	@note The write cursor is advanced by the full length of the output even if
 *		it does not fit, the value of the cursor may therefore exceed n_max_size.
 *	@note Numbers are written either whole or not at all. The digits are generated
 *		from the least significant one and stored back to front, a partially fitting
 *		number would therefore be written past the end of the buffer.
 */
inline bool my_printf1(TStringStream ostream,
	__constant const char *p_s_pattern, int n_value)
{
	const bool b_negative = n_value < 0;
	const unsigned int n_magnitude = (b_negative)?
		0u - (unsigned int)n_value : (unsigned int)n_value;
	// absolute value in unsigned arithmetic (negating the most negative int would overflow)

	const uint32_t n_length_d = numlen_u(n_magnitude) + ((b_negative)? 1 : 0);
	const uint32_t n_length_u = numlen_u((unsigned int)n_value);
	// number of characters that "%d" and "%u" expand to

	uint32_t n_length = 0;
	__constant const char *e = p_s_pattern;
	while(*e != 0) {
		if(*e == '%' && e[1] == 'd') {
			n_length += n_length_d;
			++ e; // skip '%' and the below increment will skip 'd'
		} else if(*e == '%' && e[1] == 'u') {
			n_length += n_length_u;
			++ e; // skip '%' and the below increment will skip 'u'
		} else
			++ n_length;
		++ e;
	}
	// find end, calculate the length of the output

	uint32_t n_dest = atom_add(ostream.p_ptr, n_length);
	// allocate space in the output

	if(n_dest >= ostream.n_max_size)
		return false;
	uint32_t n_space_left = ostream.n_max_size - n_dest;
	// don't write past the boundary

	__global char *p_s_dest = ostream.p_s_dest + n_dest;
	for(__constant const char *p = p_s_pattern; p != e; ++ p) {
		const bool b_signed = *p == '%' && p[1] == 'd';
		const bool b_unsigned = *p == '%' && p[1] == 'u';
		if(b_signed || b_unsigned) {
			const uint32_t n_number_length = (b_signed)? n_length_d : n_length_u;
			if(n_space_left < n_number_length)
				return false;
			// the digits are stored back to front, the whole number must fit

			__global char *p_s_digits = p_s_dest;
			uint32_t n_digit_num = n_number_length;
			if(b_signed && b_negative) {
				*p_s_digits = '-';
				++ p_s_digits;
				-- n_digit_num;
			}
			// minus sign

			for(unsigned int v = (b_signed)? n_magnitude : (unsigned int)n_value;
			   n_digit_num > 0; v /= 10)
				p_s_digits[-- n_digit_num] = (char)('0' + v % 10);
			// digits (zero has one digit and so it is handled here as well)

			p_s_dest += n_number_length;
			n_space_left -= n_number_length;
			++ p; // skip one more
		} else {
			if(!n_space_left)
				return false;
			*p_s_dest = *p; // copy the character to output
			++ p_s_dest;
			-- n_space_left;
		}
	}
	// copy the data to the output

	return true;
}

/**
 *	@brief adds two numbers, clamping the result to a maximum instead of overflowing
 *
 *	Evaluates to a + b if a <= max - b, otherwise evaluates to max
 *	(this holds as long as max - b itself can be represented, i.e. for
 *	unsigned types if b <= max).
 *
 *	@param[in] a is the first addend
 *	@param[in] b is the second addend (note that it is evaluated twice)
 *	@param[in] max is the maximum value of the result
 */
#define saturated_add(a,b,max) (min((a), (max) - (b)) + (b))

/**
 *	@brief runs the unit tests of the bit hacks on a range of 32-bit unsigned integers
 *
 *	Each work-item tests its own range of values, which starts at
 *	n_min + get_global_id(0) * n_vpt and ends at that value plus n_vpt
 *	(inclusive; both bounds saturate at UINT32_MAX). Each of the tests
 *	stops at its first failure, which is reported using my_printf1().
 *
 *	@param[in] ostream is the output stream for the failure reports
 *	@param[in] n_min is the first value to be tested by the work-item with global id 0
 *	@param[in] n_vpt is number of values per work-item
 *
 *	@return Returns true if all the tests passed, otherwise returns false
 *		(mismatches reported as "acceptable" do not count as failures).
 */
inline bool UnitTests32u(TStringStream ostream, uint32_t n_min, uint32_t n_vpt)
{
	n_min = saturated_add(n_min, get_global_id(0) * n_vpt, UINT32_MAX);
	const uint32_t n_max = saturated_add(n_min, n_vpt, UINT32_MAX); // inclusive maximum
	// the range of values tested by this work-item

	typedef uint32_t Int;

	uint32_t n_fail_num = 0;

	for(Int i = n_min;; ++ i) {
		bool b_is_pot = b_Is_POT(i);
		bool b_is_pot_ref = false;
		if(!i) // need to handle zero explicitly
			b_is_pot_ref = true;
		else {
			for(Int j = 1; j <= i && j != 0; j <<= 1) {
				if(i == j) {
					b_is_pot_ref = true;
					break;
				}
			}
			// search for powers of two by shifting a single one
		}
		if(b_is_pot != b_is_pot_ref) {
			my_printf1(ostream, "is_pot.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test power-of-two detection

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_Lower_POT(i); // the greatest lower or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot > i || // not lower
		   (n_pot && n_pot < n_max / 2 && n_pot * 2 <= i)) { // lower, but there is a larger lower pot
			my_printf1(ostream, "lpot.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding down to a power of two

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_POT(i); // next greater or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot < i || // not greater or equal
		   (n_pot && n_pot / 2 >= i)) { // greater or equal, but there is a smaller greater or equal pot
			Int n_lpot = n_Make_Lower_POT(i);
			if(n_lpot > n_max / 2) // the next power of two would be too high, this is not an error
				break;
			my_printf1(ostream, "pot.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding up to a power of two

	bool b_align_failed = false;
	for(Int i = max(n_min, (Int)1);; ++ i) {
		for(Int j = 0; j < n_max / i; ++ j) {
			Int n_aligned = n_Align_Up(j, i);
			if(n_aligned % i != 0 || // is not aligned
			   n_aligned < j || // is not aligned up
			   (n_aligned != 0 && n_aligned - i >= j)) { // is not the next aligned
				my_printf1(ostream, "align.32u %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
			if(b_Is_POT(i) && n_Align_Up_POT(j, i) != n_aligned) {
				my_printf1(ostream, "align_POT.32u %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
		}
		if(b_align_failed || i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test alignment (i is the alignment, j is the value being aligned)

	for(Int i = n_min;; ++ i) {
		int n_bits = n_SetBit_Num(i);
		int n_bits_ref = 0;
		for(Int j = i; j; j >>= 1)
			n_bits_ref += j & 1;
		if(n_bits != n_bits_ref) {
			my_printf1(ostream, "popcnt.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test population count against counting the bits one by one

	for(Int i = 0, n = 0;; ++ n, i = (i << 1) | 1) {
		Int n_mask = n_Mask_32(n);
		if(n_mask != i) {
			my_printf1(ostream, "mask.32u %u\n", (int)n);
			++ n_fail_num;
			break;
		}
		if(i == ((Int)-1))
			break;
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit masks (n is the number of bits, i is the reference mask of n ones)

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width(i);
		if(n_width <= 0 || // no number can be stored with 0 bits
		   (n_Mask((Int)n_width) & i) != i || // does not fit
		   (~n_Mask((Int)n_width) & i) != 0 || // does not fit
		   (n_width > 1 && (n_Mask((Int)n_width - 1) & i) == i)) { // fits even smaller
			my_printf1(ostream, "width.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit width

	for(Int i = n_min;; ++ i) {
		int n_bandwidth = (int)n_Bit_Bandwidth(i);
		Int n = i;
		if(!(n & 0xffff))
			n >>= 16;
		if(!(n & 0xff))
			n >>= 8;
		if(!(n & 0xf))
			n >>= 4;
		if(!(n & 0x3))
			n >>= 2;
		while(n && !(n & 1)) // in case it is wider than 32 bits (has fewer loops for 32 bits this way, functional albeit suboptimal for 64 bits - but it is not expected to test full-range of the 64-bit numbers)
			n >>= 1;
		// shift the value all the way to the right so that LSB is one

		int n_bandwidth_ref = (n)? n_Bit_Width(n) : 0;
		// calculate bandwidth

		if(n_bandwidth != n_bandwidth_ref) {
			my_printf1(ostream, "bandwidth.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit bandwidth

	for(Int i = n_min;; ++ i) {
		int n_lzcnt = (int)n_LeadingZero_Num(i);
		int n_lzcnt_naive = 8 * sizeof(Int);
		for(int j = 0; j < 8 * sizeof(Int); ++ j) {
			if(i >> j == 1) {
				n_lzcnt_naive = 8 * sizeof(Int) - j - 1;
				break;
			}
		}
		// find the position of the most significant one by shifting
		if(n_lzcnt != n_lzcnt_naive) {
			my_printf1(ostream, "lzcnt.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test leading zero count

	for(Int i = n_min;; ++ i) {
		int n_tzcnt = (int)n_TrailingZero_Num(i);
		int n_tzcnt_naive;
		uint64_t v = i; // must be unsigned otherwise would shift indefinitely
		if(v) {
			v = (v ^ (v - 1)) >> 1;  // set v's trailing 0s to 1s and zero rest
			for(n_tzcnt_naive = 0; v; ++ n_tzcnt_naive)
				v >>= 1;
		} else
			n_tzcnt_naive = 8 * sizeof(Int);
		if(n_tzcnt != n_tzcnt_naive) {
			my_printf1(ostream, "tzcnt.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test trailing zero count

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width(i);
		Int n_rfo = n_RightFill_32(i);
		if((i && n_Mask((Int)n_width) != n_rfo) || (!i && n_rfo != 0)) { // must equal the mask as wide as the number (zero stays zero)
			my_printf1(ostream, "rfo.32u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test right fill with ones

	for(Int i = max(n_min, (Int)1);; ++ i) {
		Int n_log2floor = (i)? (Int)(floor(log((double)i) / log(2.0))) : 0;
		if(n_log2floor != n_Log2(i)) {
			if(i < 1e+9) { // otherwise acceptable
				my_printf1(ostream, "log.32u %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log.32u %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded down

	for(Int i = max(n_min, (Int)1);; ++ i) {
		Int n_log2ceil = (i)? (Int)(ceil(log((double)i) / log(2.0))) : 0;
		if(n_log2ceil != n_Log2_Ceil(i)) {
			if(i > UINT32_MAX / 2)
				break;
			if(i < 0.5e+9) { // otherwise acceptable
				my_printf1(ostream, "log_ceil.32u %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log_ceil.32u %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded up

	// note that printing a summary here would swamp the buffer

	return !n_fail_num;
}

/**
 *	@brief runs the unit tests of the bit hacks on a range of 32-bit signed integers
 *
 *	Each work-item tests its own range of values, which starts at
 *	n_min + get_global_id(0) * n_vpt and ends at that value plus n_vpt
 *	(inclusive; both bounds saturate at INT32_MAX). Each of the tests
 *	stops at its first failure, which is reported using my_printf1().
 *
 *	@param[in] ostream is the output stream for the failure reports
 *	@param[in] n_min is the first value to be tested by the work-item with global id 0
 *	@param[in] n_vpt is number of values per work-item
 *
 *	@return Returns true if all the tests passed, otherwise returns false
 *		(mismatches reported as "acceptable" do not count as failures).
 */
inline bool UnitTests32s(TStringStream ostream, int32_t n_min, int32_t n_vpt)
{
	n_min = saturated_add(n_min, (int32_t)(get_global_id(0) * n_vpt), INT32_MAX);
	const int32_t n_max = saturated_add(n_min, n_vpt, INT32_MAX); // inclusive maximum
	// the range of values tested by this work-item

	typedef int32_t Int;

	uint32_t n_fail_num = 0;

	for(Int i = n_min;; ++ i) {
		bool b_is_pot = b_Is_POT(i);
		bool b_is_pot_ref = false;
		if(!i) // need to handle zero explicitly
			b_is_pot_ref = true;
		else {
			for(Int j = 1; j <= i && j != 0; j <<= 1) {
				if(i == j) {
					b_is_pot_ref = true;
					break;
				}
			}
			// search for powers of two by shifting a single one
		}
		if(b_is_pot != b_is_pot_ref) {
			my_printf1(ostream, "is_pot.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test power-of-two detection

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_Lower_POT(i); // the greatest lower or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot > i || // not lower
		   (n_pot && n_pot < n_max / 2 && n_pot * 2 <= i)) { // lower, but there is a larger lower pot
			my_printf1(ostream, "lpot.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding down to a power of two

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_POT(i); // next greater or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot < i || // not greater or equal
		   (n_pot && n_pot / 2 >= i)) { // greater or equal, but there is a smaller greater or equal pot
			Int n_lpot = n_Make_Lower_POT(i);
			if(n_lpot > n_max / 2) // the next power of two would be too high, this is not an error
				break;
			my_printf1(ostream, "pot.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding up to a power of two

	bool b_align_failed = false;
	for(Int i = max(n_min, (Int)1);; ++ i) {
		for(Int j = 0; j < n_max / i; ++ j) {
			Int n_aligned = n_Align_Up(j, i);
			if(n_aligned % i != 0 || // is not aligned
			   n_aligned < j || // is not aligned up
			   (n_aligned != 0 && n_aligned - i >= j)) { // is not the next aligned
				my_printf1(ostream, "align.32s %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
			if(b_Is_POT(i) && n_Align_Up_POT(j, i) != n_aligned) {
				my_printf1(ostream, "align_POT.32s %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
		}
		if(b_align_failed || i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test alignment (i is the alignment, j is the value being aligned)

	for(Int i = n_min;; ++ i) {
		int n_bits = n_SetBit_Num(i);
		int n_bits_ref = 0;
		for(Int j = i; j; j >>= 1)
			n_bits_ref += j & 1;
		if(n_bits != n_bits_ref) {
			my_printf1(ostream, "popcnt.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test population count against counting the bits one by one

	for(Int i = 0, n = 0;; ++ n, i = (i << 1) | 1) {
		Int n_mask = n_Mask_32(n);
		if(n_mask != i) {
			my_printf1(ostream, "mask.32s %u\n", (int)n);
			++ n_fail_num;
			break;
		}
		if(i == ((Int)-1))
			break;
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit masks (n is the number of bits, i is the reference mask of n ones)

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width(i);
		if(n_width <= 0 || // no number can be stored with 0 bits
		   (n_Mask((Int)n_width) & i) != i || // does not fit
		   (~n_Mask((Int)n_width) & i) != 0 || // does not fit
		   (n_width > 1 && (n_Mask((Int)n_width - 1) & i) == i)) { // fits even smaller
			my_printf1(ostream, "width.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit width

	for(Int i = n_min;; ++ i) {
		int n_bandwidth = (int)n_Bit_Bandwidth(i);
		Int n = i;
		if(!(n & 0xffff))
			n >>= 16;
		if(!(n & 0xff))
			n >>= 8;
		if(!(n & 0xf))
			n >>= 4;
		if(!(n & 0x3))
			n >>= 2;
		while(n && !(n & 1)) // in case it is wider than 32 bits (has fewer loops for 32 bits this way, functional albeit suboptimal for 64 bits - but it is not expected to test full-range of the 64-bit numbers)
			n >>= 1;
		// shift the value all the way to the right so that LSB is one

		int n_bandwidth_ref = (n)? n_Bit_Width(n) : 0;
		// calculate bandwidth

		if(n_bandwidth != n_bandwidth_ref) {
			my_printf1(ostream, "bandwidth.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit bandwidth

	for(Int i = n_min;; ++ i) {
		int n_lzcnt = (int)n_LeadingZero_Num(i);
		int n_lzcnt_naive = 8 * sizeof(Int);
		for(int j = 0; j < 8 * sizeof(Int); ++ j) {
			if(i >> j == 1) {
				n_lzcnt_naive = 8 * sizeof(Int) - j - 1;
				break;
			}
		}
		// find the position of the most significant one by shifting
		if(n_lzcnt != n_lzcnt_naive) {
			my_printf1(ostream, "lzcnt.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test leading zero count

	for(Int i = n_min;; ++ i) {
		int n_tzcnt = (int)n_TrailingZero_Num(i);
		int n_tzcnt_naive;
		uint64_t v = i; // must be unsigned otherwise would shift indefinitely
		if(v) {
			v = (v ^ (v - 1)) >> 1;  // set v's trailing 0s to 1s and zero rest
			for(n_tzcnt_naive = 0; v; ++ n_tzcnt_naive)
				v >>= 1;
		} else
			n_tzcnt_naive = 8 * sizeof(Int);
		if(n_tzcnt != n_tzcnt_naive) {
			my_printf1(ostream, "tzcnt.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test trailing zero count

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width(i);
		Int n_rfo = n_RightFill_32(i);
		if((i && n_Mask((Int)n_width) != n_rfo) || (!i && n_rfo != 0)) { // must equal the mask as wide as the number (zero stays zero)
			my_printf1(ostream, "rfo.32s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test right fill with ones

	for(Int i = max(n_min, (Int)1);; ++ i) {
		Int n_log2floor = (i)? (Int)(floor(log((double)i) / log(2.0))) : 0;
		if(n_log2floor != n_Log2(i)) {
			if(i < 1e+9) { // otherwise acceptable
				my_printf1(ostream, "log.32s %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log.32s %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded down

	for(Int i = max(n_min, (Int)1);; ++ i) {
		Int n_log2ceil = (i)? (Int)(ceil(log((double)i) / log(2.0))) : 0;
		if(n_log2ceil != n_Log2_Ceil(i)) {
			if(i > UINT32_MAX / 2)
				break;
			if(i < 0.5e+9) { // otherwise acceptable
				my_printf1(ostream, "log_ceil.32s %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log_ceil.32s %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded up

	// note that printing a summary here would swamp the buffer

	return !n_fail_num;
}

/**
 *	@brief kernel entry point of the 32-bit bit hack unit tests
 *
 *	Runs the unsigned tests and, if n_min is below INT32_MAX, also the signed
 *	tests. The first work-item additionally prints a greeting if the tested range
 *	starts at zero, and a closing message if the range of this launch reaches
 *	UINT32_MAX.
 *
 *	@param[out] p_s_dest is pointer to the output character buffer
 *	@param[in,out] p_ptr is pointer to the write cursor in the output buffer
 *	@param[in] n_max_size is size of the output buffer, in characters
 *	@param[in] n_min is the first value to be tested in this launch
 *	@param[in] n_vpt is number of values tested by each work-item
 */
__kernel void BitHacks_UnitTests(__global char *p_s_dest, __global uint32_t *p_ptr,
	const uint32_t n_max_size, uint32_t n_min, uint32_t n_vpt)
{
	TStringStream stream;
	stream.p_s_dest = p_s_dest;
	stream.p_ptr = p_ptr;
	stream.n_max_size = n_max_size;
	// wrap the output buffer in a stream

	const bool b_first_thread = get_global_id(0) == 0;

	if(b_first_thread && n_min == 0)
		my_printf1(stream, "hello from thread %d: unit test 32 starting\n", 0);
	// just to see that there is something going on

	UnitTests32u(stream, n_min, n_vpt); // uint32_t
	if(n_min < INT32_MAX)
		UnitTests32s(stream, n_min, n_vpt); // int32_t
	// run the tests

	if(b_first_thread && n_min >= UINT32_MAX - n_vpt * get_global_size(0))
		my_printf1(stream, "hello from thread %d: unit test 32 finished\n", 0);
	// announce the end of the tested range
}

/**
 *	@brief runs the unit tests of the bit hacks on a range of 64-bit unsigned integers
 *
 *	Each work-item tests its own range of values, which starts at
 *	n_min + get_global_id(0) * n_vpt and ends at that value plus n_vpt
 *	(inclusive; both bounds saturate at UINT64_MAX). Each of the tests
 *	stops at its first failure, which is reported using my_printf1().
 *
 *	@param[in] ostream is the output stream for the failure reports
 *	@param[in] n_min is the first value to be tested by the work-item with global id 0
 *	@param[in] n_vpt is number of values per work-item
 *
 *	@return Returns true if all the tests passed, otherwise returns false
 *		(mismatches reported as "acceptable" do not count as failures).
 *
 *	@note The failing values are converted to int for printing,
 *		the reports therefore only contain their low bits.
 */
inline bool UnitTests64u(TStringStream ostream, uint64_t n_min, uint32_t n_vpt)
{
	n_min = saturated_add(n_min, get_global_id(0) * n_vpt, UINT64_MAX);
	const uint64_t n_max = saturated_add(n_min, n_vpt, UINT64_MAX); // inclusive maximum
	// the range of values tested by this work-item

	typedef uint64_t Int;

	uint32_t n_fail_num = 0;

	for(Int i = n_min;; ++ i) {
		bool b_is_pot = b_Is_POT(i);
		bool b_is_pot_ref = false;
		if(!i) // need to handle zero explicitly
			b_is_pot_ref = true;
		else {
			for(Int j = 1; j <= i && j != 0; j <<= 1) {
				if(i == j) {
					b_is_pot_ref = true;
					break;
				}
			}
			// search for powers of two by shifting a single one
		}
		if(b_is_pot != b_is_pot_ref) {
			my_printf1(ostream, "is_pot.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test power-of-two detection

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_Lower_POT_64(i); // the greatest lower or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot > i || // not lower
		   (n_pot && n_pot < n_max / 2 && n_pot * 2 <= i)) { // lower, but there is a larger lower pot
			my_printf1(ostream, "lpot.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding down to a power of two

	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_POT_64(i); // next greater or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot < i || // not greater or equal
		   (n_pot && n_pot / 2 >= i)) { // greater or equal, but there is a smaller greater or equal pot
			Int n_lpot = n_Make_Lower_POT_64(i);
			if(n_lpot > n_max / 2) // the next power of two would be too high, this is not an error
				break;
			my_printf1(ostream, "pot.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test rounding up to a power of two

	bool b_align_failed = false;
	for(Int i = max(n_min, (Int)1);; ++ i) {
		for(Int j = 0; j < n_max / i; ++ j) {
			Int n_aligned = n_Align_Up(j, i);
			if(n_aligned % i != 0 || // is not aligned
			   n_aligned < j || // is not aligned up
			   (n_aligned != 0 && n_aligned - i >= j)) { // is not the next aligned
				my_printf1(ostream, "align.64u %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
			if(b_Is_POT(i) && n_Align_Up_POT(j, i) != n_aligned) {
				my_printf1(ostream, "align_POT.64u %u\n", (int)j);
				++ n_fail_num;
				b_align_failed = true; // break out of the outer loop as well
				break;
			}
		}
		if(b_align_failed || i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test alignment (i is the alignment, j is the value being aligned)

	for(Int i = n_min;; ++ i) {
		int n_bits = n_SetBit_Num_64(i);
		int n_bits_ref = 0;
		for(Int j = i; j; j >>= 1)
			n_bits_ref += j & 1;
		if(n_bits != n_bits_ref) {
			my_printf1(ostream, "popcnt.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test population count against counting the bits one by one

	for(Int i = 0, n = 0;; ++ n, i = (i << 1) | 1) {
		Int n_mask = n_Mask_64(n);
		if(n_mask != i) {
			my_printf1(ostream, "mask.64u %u\n", (int)n);
			++ n_fail_num;
			break;
		}
		if(i == ((Int)-1))
			break;
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit masks (n is the number of bits, i is the reference mask of n ones)

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width_64(i);
		if(n_width <= 0 || // no number can be stored with 0 bits
		   (n_Mask_64((Int)n_width) & i) != i || // does not fit
		   (~n_Mask_64((Int)n_width) & i) != 0 || // does not fit
		   (n_width > 1 && (n_Mask_64((Int)n_width - 1) & i) == i)) { // fits even smaller
			my_printf1(ostream, "width.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit width

	for(Int i = n_min;; ++ i) {
		int n_bandwidth = (int)n_Bit_Bandwidth_64(i);
		Int n = i;
		if(!(n & 0xffff))
			n >>= 16;
		if(!(n & 0xff))
			n >>= 8;
		if(!(n & 0xf))
			n >>= 4;
		if(!(n & 0x3))
			n >>= 2;
		while(n && !(n & 1)) // in case it is wider than 32 bits (has fewer loops for 32 bits this way, functional albeit suboptimal for 64 bits - but it is not expected to test full-range of the 64-bit numbers)
			n >>= 1;
		// shift the value all the way to the right so that LSB is one

		int n_bandwidth_ref = (n)? n_Bit_Width_64(n) : 0;
		// calculate bandwidth

		if(n_bandwidth != n_bandwidth_ref) {
			my_printf1(ostream, "bandwidth.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test bit bandwidth

	for(Int i = n_min;; ++ i) {
		int n_lzcnt = (int)n_LeadingZero_Num_64(i);
		int n_lzcnt_naive = 8 * sizeof(Int);
		for(int j = 0; j < 8 * sizeof(Int); ++ j) {
			if(i >> j == 1) {
				n_lzcnt_naive = 8 * sizeof(Int) - j - 1;
				break;
			}
		}
		// find the position of the most significant one by shifting
		if(n_lzcnt != n_lzcnt_naive) {
			my_printf1(ostream, "lzcnt.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test leading zero count

	for(Int i = n_min;; ++ i) {
		int n_tzcnt = (int)n_TrailingZero_Num_64(i);
		int n_tzcnt_naive;
		uint64_t v = i; // must be unsigned otherwise would shift indefinitely
		if(v) {
			v = (v ^ (v - 1)) >> 1;  // set v's trailing 0s to 1s and zero rest
			for(n_tzcnt_naive = 0; v; ++ n_tzcnt_naive)
				v >>= 1;
		} else
			n_tzcnt_naive = 8 * sizeof(Int);
		if(n_tzcnt != n_tzcnt_naive) {
			my_printf1(ostream, "tzcnt.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test trailing zero count

	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width_64(i);
		Int n_rfo = n_RightFill_64(i);
		if((i && n_Mask_64((Int)n_width) != n_rfo) || (!i && n_rfo != 0)) { // must equal the mask as wide as the number (zero stays zero)
			my_printf1(ostream, "rfo.64u %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// test right fill with ones

	for(Int i = max(n_min, (Int)1);; ++ i) {
		if(n_Bit_Width_64(i) > 53) {
			if(i >= n_max)
				break;
			continue; // can't be represented as double precisely
		}
		Int n_log2floor = (i)? (Int)(floor(log((double)i) / log(2.0))) : 0;
		if(n_log2floor != n_Log2_64(i)) {
			if(i < 1e+9) { // otherwise acceptable
				my_printf1(ostream, "log.64u %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log.64u %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded down

	for(Int i = max(n_min, (Int)1);; ++ i) {
		if(n_Bit_Width_64(i) > 53) {
			if(i >= n_max)
				break;
			continue; // can't be represented as double precisely
		}
		Int n_log2ceil = (i)? (Int)(ceil(log((double)i) / log(2.0))) : 0;
		if(n_log2ceil != n_Log2_Ceil_64(i)) {
			if(i > UINT64_MAX / 2)
				break;
			if(i < 0.5e+9) { // otherwise acceptable
				my_printf1(ostream, "log_ceil.64u %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log_ceil.64u %u (acceptable)\n", (int)i);
			break;
		}
		if(i >= n_max) // inclusive maximum, need to handle overflows (i starts at 1 and so it can be above n_max if n_max is zero)
			break;
	}
	// test base two logarithm, rounded up

	// note that printing a summary here would swamp the buffer

	return !n_fail_num;
}

/**
 *	@brief runs the signed 64-bit bit hack unit tests on one work-item's slice of values
 *
 *	The slice starts at n_min advanced by get_global_id(0) * n_vpt and ends at
 *	that start plus n_vpt (inclusive); both sums are formed by saturated_add()
 *	with INT64_MAX passed as the limit. Every test loop stops at its first
 *	mismatch, writes one line to the stream and counts one failure.
 *
 *	@param[in] ostream is the string stream the failure messages are written to
 *	@param[in] n_min is the first value of the whole batch (the kernel in this file
 *		only calls this function with values below INT64_MAX, i.e. non-negative)
 *	@param[in] n_vpt is the number of values per work-item
 *
 *	@return Returns true if no test failed, otherwise returns false.
 *
 *	@note The messages receive the offending value cast to int, so only the
 *		low bits of values that do not fit in an int are reported.
 *	@note NOTE(review): the loops starting at max(n_min, 1) only terminate via
 *		i == n_max, this presumably requires n_max >= 1 - confirm with the host code.
 */
inline bool UnitTests64s(TStringStream ostream, int64_t n_min, int32_t n_vpt)
{
	n_min = saturated_add(n_min, (int64_t)(get_global_id(0) * n_vpt), INT64_MAX);
	const int64_t n_max = saturated_add(n_min, n_vpt, INT64_MAX); // inclusive maximum

	typedef int64_t Int;

	uint32_t n_fail_num = 0;

	// b_Is_POT() against a reference that shifts a single one bit
	for(Int i = n_min;; ++ i) {
		bool b_is_pot = b_Is_POT(i);
		bool b_is_pot_ref = false;
		if(!i) // need to handle zero explicitly
			b_is_pot_ref = true;
		else {
			for(Int j = 1; j <= i && j != 0; j <<= 1) {
				if(i == j) {
					b_is_pot_ref = true;
					break;
				}
			}
			// search for powers of two by shifting a single one
		}
		if(b_is_pot != b_is_pot_ref) {
			my_printf1(ostream, "is_pot.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Make_Lower_POT_64()
	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_Lower_POT_64(i); // next lower or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot > i || // not lower
		   (n_pot && n_pot < INT64_MAX / 2 && n_pot * 2 <= i)) { // lower, but there is a larger lower pot
			// the middle term only keeps n_pot * 2 from overflowing, so it is
			// compared to the limit of the type rather than to the end of the slice
			my_printf1(ostream, "lpot.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Make_POT_64()
	for(Int i = n_min;; ++ i) {
		Int n_pot = n_Make_POT_64(i); // next greater or equal
		if(!b_Is_POT(n_pot) || // not pot
		   n_pot < i || // not greater or equal
		   (n_pot && n_pot / 2 >= i)) { // greater or equal, but there is a smaller greater or equal pot
			Int n_lpot = n_Make_Lower_POT_64(i);
			if(n_lpot > INT64_MAX / 2) // the next power of two would be too high, this is not an error
				break;
			// only values whose next power of two does not fit in Int are excused;
			// comparing to n_max / 2 would also excuse failures far below that limit
			my_printf1(ostream, "pot.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Align_Up() and n_Align_Up_POT(), i is the alignment and j the aligned value
	for(Int i = max(n_min, (Int)1);; ++ i) {
		for(Int j = 0; j < n_max / i; ++ j) {
			Int n_aligned = n_Align_Up(j, i);
			if(n_aligned % i != 0 || // is not aligned
			   n_aligned < j || // is not aligned up
			   (n_aligned != 0 && n_aligned - i >= j)) { // is not the next aligned
				my_printf1(ostream, "align.64s %u\n", (int)j);
				i = n_max; // the check below the inner loop now leaves the outer loop
				++ n_fail_num;
				break;
			}
			if(b_Is_POT(i) && n_Align_Up_POT(j, i) != n_aligned) {
				my_printf1(ostream, "align_POT.64s %u\n", (int)j);
				i = n_max; // the check below the inner loop now leaves the outer loop
				++ n_fail_num;
				break;
			}
		}
		// on failure i is set to n_max rather than n_max - 1: the latter runs one
		// more pass with i == n_max and never ends if that pass fails as well
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_SetBit_Num_64() against a bit-by-bit count
	for(Int i = n_min;; ++ i) {
		int n_bits = n_SetBit_Num_64(i);
		int n_bits_ref = 0;
		for(Int j = i; j; j >>= 1)
			n_bits_ref += j & 1;
		if(n_bits != n_bits_ref) {
			my_printf1(ostream, "popcnt.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Mask_64(), i is the expected mask of n low bits (0, 1, 3, 7, ...)
	for(Int i = 0, n = 0;; ++ n, i = (i << 1) | 1) {
		Int n_mask = n_Mask_64(n);
		if(n_mask != i) {
			my_printf1(ostream, "mask.64s %u\n", (int)n);
			++ n_fail_num;
			break;
		}
		if(i == ((Int)-1)) // all bits set, there is no wider mask
			break;
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Bit_Width_64(), the value must fit in n_width bits but not in n_width - 1
	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width_64(i);
		if(n_width <= 0 || // no number can be stored with 0 bits
		   (n_Mask_64((Int)n_width) & i) != i || // does not fit
		   (~n_Mask_64((Int)n_width) & i) != 0 || // does not fit
		   (n_width > 1 && (n_Mask_64((Int)n_width - 1) & i) == i)) { // fits even smaller
			my_printf1(ostream, "width.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Bit_Bandwidth_64(), reference is the bit width after dropping the trailing zeros
	for(Int i = n_min;; ++ i) {
		int n_bandwidth = (int)n_Bit_Bandwidth_64(i);
		Int n = i;
		if(!(n & 0xffff))
			n >>= 16;
		if(!(n & 0xff))
			n >>= 8;
		if(!(n & 0xf))
			n >>= 4;
		if(!(n & 0x3))
			n >>= 2;
		while(n && !(n & 1)) // in case it is wider than 32 bits (has fewer loops for 32 bits this way, functional albeit suboptimal for 64 bits - but it is not expected to test full-range of the 64-bit numbers)
			n >>= 1;
		// shift the value all the way to the right so that LSB is one

		int n_bandwidth_ref = (n)? n_Bit_Width_64(n) : 0;
		// calculate bandwidth

		if(n_bandwidth != n_bandwidth_ref) {
			my_printf1(ostream, "bandwidth.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_LeadingZero_Num_64(), reference looks for the shift that leaves only the top one
	for(Int i = n_min;; ++ i) {
		int n_lzcnt = (int)n_LeadingZero_Num_64(i);
		int n_lzcnt_naive = 8 * sizeof(Int); // stays at the full width if no shift yields 1 (e.g. for zero)
		for(int j = 0; j < 8 * sizeof(Int); ++ j) {
			if(i >> j == 1) {
				n_lzcnt_naive = 8 * sizeof(Int) - j - 1;
				break;
			}
		}
		if(n_lzcnt != n_lzcnt_naive) { // differs from the naive count
			my_printf1(ostream, "lzcnt.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_TrailingZero_Num_64()
	for(Int i = n_min;; ++ i) {
		int n_tzcnt = (int)n_TrailingZero_Num_64(i);
		int n_tzcnt_naive;
		uint64_t v = i; // must be unsigned otherwise would shift indefinitely
		if(v) {
			v = (v ^ (v - 1)) >> 1;  // set v's trailing 0s to 1s and zero rest
			for(n_tzcnt_naive = 0; v; ++ n_tzcnt_naive)
				v >>= 1;
		} else
			n_tzcnt_naive = 8 * sizeof(Int);
		if(n_tzcnt != n_tzcnt_naive) {
			my_printf1(ostream, "tzcnt.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_RightFill_64(), expected to equal the mask of n_Bit_Width_64(i) bits, or zero for zero
	for(Int i = n_min;; ++ i) {
		int n_width = (int)n_Bit_Width_64(i);
		Int n_rfo = n_RightFill_64(i);
		if((i && n_Mask_64((Int)n_width) != n_rfo) || (!i && n_rfo != 0)) { // not the expected fill
			my_printf1(ostream, "rfo.64s %u\n", (int)i);
			++ n_fail_num;
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Log2_64() against floor(log2(i)) evaluated in double precision
	for(Int i = max(n_min, (Int)1);; ++ i) {
		if(n_Bit_Width_64(i) > 53) {
			if(i == n_max)
				break;
			continue; // can't be represented as double precisely
		}
		Int n_log2floor = (i)? (Int)(floor(log((double)i) / log(2.0))) : 0;
		if(n_log2floor != n_Log2_64(i)) {
			if(i < 1e+9) { // otherwise acceptable
				my_printf1(ostream, "log.64s %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log.64s %u (acceptable)\n", (int)i);
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}

	// n_Log2_Ceil_64() against ceil(log2(i)) evaluated in double precision
	for(Int i = max(n_min, (Int)1);; ++ i) {
		if(n_Bit_Width_64(i) > 53) {
			if(i == n_max)
				break;
			continue; // can't be represented as double precisely
		}
		Int n_log2ceil = (i)? (Int)(ceil(log((double)i) / log(2.0))) : 0;
		if(n_log2ceil != n_Log2_Ceil_64(i)) {
			if(i > UINT64_MAX / 2)
				break;
			if(i < 0.5e+9) { // otherwise acceptable
				my_printf1(ostream, "log_ceil.64s %u\n", (int)i);
				++ n_fail_num;
			} else
				my_printf1(ostream, "log_ceil.64s %u (acceptable)\n", (int)i);
			break;
		}
		if(i == n_max) // inclusive maximum, need to handle overflows
			break;
	}
	// no summary line is written here, one line per work-item would swamp the buffer

	return !n_fail_num;
}

/**
 *	@brief kernel entry point of the 64-bit bit hack unit tests
 *
 *	Assembles the 64-bit start of the batch from its two 32-bit halves and runs
 *	UnitTests64u() on it; UnitTests64s() is run as well if the start is below
 *	INT64_MAX. The results returned by both functions are not used here, the
 *	failures are only visible as text in the stream.
 *
 *	Work-item 0 writes a "starting" line if the batch starts at zero and
 *	a "finished" line if the batch start is at or above
 *	UINT64_MAX - n_vpt * get_global_size(0).
 *
 *	@param[out] p_s_dest is the first member of the output TStringStream
 *		(presumably the character buffer - see the TStringStream declaration)
 *	@param[in,out] p_ptr is the second member of the output TStringStream
 *		(presumably the write position shared by all work-items - confirm)
 *	@param[in] n_max_size is the third member of the output TStringStream
 *		(presumably the buffer capacity - confirm)
 *	@param[in] n_min_lo is the low 32 bits of the first value of the batch
 *	@param[in] n_min_hi is the high 32 bits of the first value of the batch
 *	@param[in] n_vpt is the number of values per work-item
 */
__kernel void BitHacks_UnitTests64(__global char *p_s_dest, __global uint32_t *p_ptr,
	const uint32_t n_max_size, uint32_t n_min_lo, uint32_t n_min_hi, uint32_t n_vpt)
{
	TStringStream ostream = {p_s_dest, p_ptr, n_max_size};

	const uint64_t n_batch_start = ((uint64_t)n_min_hi << 32) + n_min_lo;
	// the start of the batch is passed as two 32-bit halves

	const bool b_first_item = !get_global_id(0);
	// only the first work-item writes the progress messages

	if(b_first_item && !n_batch_start)
		my_printf1(ostream, "hello from thread %d: unit test 64 starting\n", 0);
	// just to see that there is something going on

	UnitTests64u(ostream, n_batch_start, n_vpt); // unsigned 64-bit tests

	if(n_batch_start < INT64_MAX)
		UnitTests64s(ostream, n_batch_start, n_vpt); // signed 64-bit tests, only for starts below INT64_MAX

	if(b_first_item) {
		if(n_batch_start >= UINT64_MAX - n_vpt * get_global_size(0))
			my_printf1(ostream, "hello from thread %d: unit test 64 finished\n", 0);
	}
}

#endif // !__CL_KERNEL_INTEGER_UNIT_TESTS_INCLUDED
