/*
								+--------------------------------+
								|                                |
								|  ***  OpenCL auto-tuning  ***  |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|           Autotune.h           |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_AUTOTUNING_HELPERS_INCLUDED
#define __OPENCL_AUTOTUNING_HELPERS_INCLUDED

/**
 *	@file gpgpu/Autotune.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief auto-tuning data structures for OpenCL and also some general API-agnostic classes
 */

//#pragma warning(disable:4503) // C4503: 'std::map ...' : decorated name length exceeded
// not needed anymore

#include <vector>
#include <string>
#include <map>
#include <set>
#include <algorithm>
#include <numeric>
#include "../StlUtils.h"
#include "../StdIOUtils.h"
#include "../FormatPrefix.h"
#include "AutotuneInfo.h"

#pragma warning(disable: 4250)
// disable the MSVC inheritance via dominance warning

/**
 *	@brief a simple interface for OpenCL autotuning
 */
class CAutotuneInterface {
public:
	/**
	 *	@brief virtual destructor; makes destroying an implementation
	 *		through a pointer to this interface well-defined
	 */
	virtual ~CAutotuneInterface()
	{}

	/**
	 *	@brief gets benchmark identifier
	 *	@return Returns the benchmark id string.
	 */
	virtual const char *p_s_Benchmark_Id() const = 0;

	/**
	 *	@brief initializes enumeration of the input configurations
	 *
	 *	@param[out] r_input is filled with the first input configuration
	 *	@param[out] r_hidden is filled with implementation-defined enumeration
	 *		state, to be passed to \ref Next_InputVector() unchanged
	 */
	virtual void Init_InputVector(std::vector<size_t> &r_input, std::vector<size_t> &r_hidden) const = 0;

	/**
	 *	@brief advances to the next input configuration
	 *
	 *	@param[in,out] r_input is the input configuration
	 *	@param[in,out] r_hidden is the enumeration state from \ref Init_InputVector()
	 *
	 *	@return Returns true if there is a next input configuration,
	 *		false if the enumeration is finished.
	 */
	virtual bool Next_InputVector(std::vector<size_t> &r_input, std::vector<size_t> &r_hidden) const = 0;

	/**
	 *	@brief initializes enumeration of the tuning configurations
	 *
	 *	@param[out] r_tuning is filled with the first tuning configuration
	 *	@param[out] r_hidden is filled with implementation-defined enumeration
	 *		state, to be passed to \ref Next_TuningVector() unchanged
	 */
	virtual void Init_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const = 0;

	/**
	 *	@brief advances to the next tuning configuration
	 *
	 *	@param[in,out] r_tuning is the tuning configuration
	 *	@param[in,out] r_hidden is the enumeration state from \ref Init_TuningVector()
	 *
	 *	@return Returns true if there is a next tuning configuration,
	 *		false if the enumeration is finished.
	 */
	virtual bool Next_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const = 0;

	/**
	 *	@brief prepares the benchmark for the given input configuration
	 *	@param[in] r_input is the input configuration
	 *	@return Presumably returns true on success, false on failure (the semantics
	 *		are defined by the implementations, which are not part of this file).
	 */
	virtual bool Prepare_Input(const std::vector<size_t> &r_input) = 0;
};

/**
 *	@brief implementation of input / tuning model over input size, workgroup size and tile size
 */
class CAutotuneModel_DataSize_WorkGroupSize_TileSize : public virtual CAutotuneInterface { // todo - there should be also data type size (4/8) in addition to the input size
public:
	// lookup tables to be supplied by the user; indices start at 0
	// and each table is terminated by returning 0
	virtual const size_t n_InputSize_Lookup(size_t n_index) const = 0;
	virtual const size_t n_WorkGroupSize_Lookup(size_t n_index) const = 0;
	virtual const size_t n_ItemsPerThread_Lookup(size_t n_index) const = 0;

	/**
	 *	@brief starts the enumeration of input sizes
	 *
	 *	@param[out] r_input is set to a single element, the first input size
	 *	@param[out] r_hidden is set to a single element, the index into the input size table
	 */
	virtual void Init_InputVector(std::vector<size_t> &r_input, std::vector<size_t> &r_hidden) const // throw(std::bad_alloc)
	{
		r_input.resize(1);
		r_hidden.resize(1);

		size_t &r_n_size_index = r_hidden.front();
		r_n_size_index = 0;
		r_input.front() = n_InputSize_Lookup(r_n_size_index);
	}

	/**
	 *	@brief advances to the next input size
	 *	@return Returns true if there is another input size, false if the table ended
	 *		(in which case the input size is set to zero).
	 */
	virtual bool Next_InputVector(std::vector<size_t> &r_input, std::vector<size_t> &r_hidden) const
	{
		_ASSERTE(r_input.size() == 1 && r_hidden.size() == 1);

		const size_t n_next_index = ++ r_hidden.front();
		const size_t n_next_size = n_InputSize_Lookup(n_next_index);
		r_input.front() = n_next_size;
		return n_next_size != 0; // zero terminates the table
	}

	/**
	 *	@brief starts the enumeration of tunings
	 *
	 *	@param[out] r_tuning is set to two elements; the workgroup size and the tile
	 *		size (the workgroup size times the number of items per thread)
	 *	@param[out] r_hidden is set to two elements; the index into the workgroup
	 *		size table and the index into the items per thread table
	 */
	virtual void Init_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const // throw(std::bad_alloc)
	{
		r_tuning.resize(2);
		r_hidden.resize(2);

		const size_t n_first_wg_index = 0, n_first_ipt_index = 0;
		r_hidden[0] = n_first_wg_index;
		r_hidden[1] = n_first_ipt_index;

		r_tuning[0] = n_WorkGroupSize_Lookup(n_first_wg_index);
		r_tuning[1] = n_WorkGroupSize_Lookup(n_first_wg_index) *
			n_ItemsPerThread_Lookup(n_first_ipt_index);
	}

	/**
	 *	@brief advances to the next tuning; the workgroup size changes the fastest
	 *	@return Returns true if there is another tuning, false if all
	 *		the combinations were enumerated.
	 */
	virtual bool Next_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const
	{
		_ASSERTE(r_tuning.size() == 2 && r_hidden.size() == 2);

		size_t &r_n_wg_index = r_hidden[0];
		size_t &r_n_ipt_index = r_hidden[1];

		const size_t n_next_wg_size = n_WorkGroupSize_Lookup(++ r_n_wg_index);
		r_tuning[0] = n_next_wg_size;
		if(n_next_wg_size) {
			r_tuning[1] = n_next_wg_size * n_ItemsPerThread_Lookup(r_n_ipt_index);
			// the tile size depends on the workgroup size, keep it up to date
			return true;
		}
		// there is another workgroup size

		r_n_wg_index = 0;
		r_tuning[0] = n_WorkGroupSize_Lookup(r_n_wg_index);
		// ran out of workgroup sizes; wrap around to the first one

		r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(++ r_n_ipt_index);
		// carry to the next number of items per thread

		return r_tuning[1] != 0; // zero means that the items per thread table ended as well
	}
};

/**
 *	@brief a simple permissive parser which can read results generated by \ref CCLSimpleAutotuneDriver
 */
class CAutotuneResultParser {
public:
	/**
	 *	@brief a minimal parser of unsigned decimal integers separated by single characters
	 *	@note This only keeps pointers into the string passed to the constructor;
	 *		that string must stay alive and unmodified while the parser is in use.
	 */
	class CIntParser {
	protected:
		const char *b, *e; /**< @brief the current position and the end of the parsed string */

	public:
		/**
		 *	@brief constructor; starts parsing at the beginning of the string
		 *	@param[in] a is the string to parse
		 */
		CIntParser(const std::string &a)
			:b(a.c_str()), e(a.c_str() + a.length())
		{}

		/**
		 *	@brief skips past the next opening square bracket
		 *	@return Returns true if a bracket was found (the position is then right
		 *		after it), false if the end of the string was reached.
		 */
		bool SkipToBracket() // unused
		{
			while(b < e && *b != '[')
				++ b;
			if(b == e)
				return false; // no bracket up to the end of the string
			++ b;
			return true;
		}

		/**
		 *	@brief reads a decimal integer at the current position
		 *
		 *	@param[out] r_n_value is filled with the value (zero on failure)
		 *
		 *	@return Returns true on success, false if there is no digit
		 *		at the current position (including the end of the string).
		 *
		 *	@note On success, this also skips one character following the number
		 *		(the separator), if there is any. The value is not checked for overflow.
		 */
		bool ReadInt(size_t &r_n_value)
		{
			r_n_value = 0;
			if(b == e || !isdigit(uint8_t(*b)))
				return false;
			while(b < e && isdigit(uint8_t(*b))) {
				r_n_value *= 10;
				r_n_value += *b - '0';
				++ b;
			}
			_ASSERTE(b <= e); // we should not cross past the end
			_ASSERTE(b == e || !isdigit(uint8_t(*b))); // if not at the end, then the next char is not a digit
			if(b < e)
				++ b; // skip the separator (whatever character it is)
			return true;
		}
	};

	/**
	 *	@brief less-than comparison of strings containing tuples of integers;
	 *		compares the tuples numerically, element by element
	 */
	class CNumericTupleCompare {
	public:
		inline bool operator ()(const std::string &a, const std::string &b) const
		{
			CIntParser ap(a), bp(b);
			// the compared strings are bare tuples with no brackets, no need to SkipToBracket()

			for(;;) {
				size_t na, nb;
				bool ba = ap.ReadInt(na), bb = bp.ReadInt(nb);
				if(!ba || !bb)
					return !ba && bb; // if only a ran out of numbers then it is less, otherwise it is more or equal
				if(na != nb)
					return na < nb; // the first differing element decides
			}
			return false; // equal (not reached, the loop always returns)
		}
	};

	typedef std::map<std::string, std::vector<double>, CNumericTupleCompare> CResultMap;
	typedef std::map<std::string, CResultMap, CNumericTupleCompare> CTuning_ResultMap_LongName;
	struct CTuning_ResultMap : public CTuning_ResultMap_LongName {};
	typedef std::map<std::string, CTuning_ResultMap> CDevice_Tuning_ResultMap_LongName;
	struct CDevice_Tuning_ResultMap : public CDevice_Tuning_ResultMap_LongName {};
	typedef std::map<std::string, CDevice_Tuning_ResultMap> CKernel_Device_Tuning_ResultMap_LongName;
	struct CKernel_Device_Tuning_ResultMap : public CKernel_Device_Tuning_ResultMap_LongName {};
	// trick the compiler into forgetting the long names of std::map<> by inheriting from them
	// this also loses all the constructors but we weren't using them so far, can always be added later
	// this avoids the "C4503: 'std::map ...' : decorated name length exceeded" warning without using
	// a pragma (which does not work if <map> has been #included before)

public:
	/**
	 *	@brief determines whether there is a record of a benchmark
	 *
	 *	@param[in] r_results_map is the map of results (filled by \ref Parse())
	 *	@param[in] r_s_kernel_id is kernel id
	 *	@param[in] r_s_device_id is device id
	 *	@param[in] r_input_config is the input configuration
	 *	@param[in] r_tuning_config is the tuning configuration
	 *
	 *	@return Returns true if there is a record (possibly a record of a failed
	 *		benchmark with no times), false if there is none or if the lookup failed.
	 */
	static bool b_HaveResult(const CKernel_Device_Tuning_ResultMap &r_results_map,
		const std::string &r_s_kernel_id, const std::string &r_s_device_id,
		const std::vector<size_t> &r_input_config, const std::vector<size_t> &r_tuning_config)
	{
		return p_Find_Result(r_results_map, r_s_kernel_id, r_s_device_id,
			r_input_config, r_tuning_config) != 0;
	}

	/**
	 *	@brief gets the number of times recorded for a benchmark
	 *
	 *	@param[in] r_results_map is the map of results (filled by \ref Parse())
	 *	@param[in] r_s_kernel_id is kernel id
	 *	@param[in] r_s_device_id is device id
	 *	@param[in] r_input_config is the input configuration
	 *	@param[in] r_tuning_config is the tuning configuration
	 *
	 *	@return Returns the number of recorded times (zero for a benchmark that was
	 *		recorded as failed), or size_t(-1) if there is no record or if the lookup failed.
	 */
	static size_t n_Result_Num(const CKernel_Device_Tuning_ResultMap &r_results_map,
		const std::string &r_s_kernel_id, const std::string &r_s_device_id,
		const std::vector<size_t> &r_input_config, const std::vector<size_t> &r_tuning_config)
	{
		const std::vector<double> *p_result = p_Find_Result(r_results_map,
			r_s_kernel_id, r_s_device_id, r_input_config, r_tuning_config);
		return (p_result)? p_result->size() : size_t(-1);
	}

	/**
	 *	@brief reads benchmark results from a text file
	 *
	 *	Each line is trimmed and tokenized by stl_ut::Split() using a space separator. Only the
	 *	lines with exactly seven tokens in the form
	 *	<tt>device kernel input[...] tuning[...] time // anything</tt> are used,
	 *	where time is either a number or the word "fail". All the other lines are ignored.
	 *
	 *	@param[out] r_results_map is the map of results (the original contents are erased)
	 *	@param[in] p_s_filename is the name of the file to read
	 *
	 *	@return Returns true on success, false if the file could not
	 *		be opened or if std::bad_alloc was thrown while parsing.
	 */
	static bool Parse(CKernel_Device_Tuning_ResultMap &r_results_map, const char *p_s_filename)
	{
		r_results_map.clear(); // !!

		FILE *p_fr;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
		if(fopen_s(&p_fr, p_s_filename, "r"))
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		if(!(p_fr = fopen(p_s_filename, "r")))
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
			return false;
		CFILE_PtrGuard guard(p_fr); // close it automatically
		try {
			std::string s_line;
			std::vector<std::string> tokens;
			while(p_fr && !feof(p_fr)) {
				if(!stl_ut::ReadLine(s_line, p_fr))
					break;
				stl_ut::TrimSpace(s_line);
				if(s_line.empty())
					continue;
				stl_ut::Split(tokens, s_line, " ", 0);
				if(tokens.size() == 7 && !tokens[2].find("input[") &&
				   tokens[2][tokens[2].length() - 1] == ']' &&
				   !tokens[3].find("tuning[") && tokens[3][tokens[3].length() - 1] == ']' &&
				   tokens[5] == "//") {
					tokens[2].erase(tokens[2].length() - 1);
					tokens[2].erase(0, 6/*strlen("input[")*/);
					tokens[3].erase(tokens[3].length() - 1);
					tokens[3].erase(0, 7/*strlen("tuning[")*/);
					// strip the labels and the brackets, leave only the bare tuples

					double f_result_time;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
					if(tokens[4] != "fail" && sscanf_s(tokens[4].c_str(), "%lf", &f_result_time) == 1)
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
					if(tokens[4] != "fail" && sscanf(tokens[4].c_str(), "%lf", &f_result_time) == 1)
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
						(((r_results_map[tokens[1]])[tokens[0]])[tokens[3]])[tokens[2]].push_back(f_result_time);
					else if(tokens[4] == "fail")
						(((r_results_map[tokens[1]])[tokens[0]])[tokens[3]])[tokens[2]].size(); // insert an empty vector to show that someone tried and failed
					// kernel / device / tuning / configuration <- result
				}
			}
			// a very permissive parser, matching only the correct lines and ignoring everything else
		} catch(std::bad_alloc&) {
			return false;
		}
		return true;
	}

	/**
	 *	@brief formats a vector as comma-separated numbers, without any label or brackets
	 *
	 *	@param[out] r_storage is the string to hold the result
	 *	@param[in] r_vec is the vector to format
	 *
	 *	@return Returns pointer to the null-terminated result (stored in r_storage) on success,
	 *		or null on failure (the contents of r_storage are then incomplete).
	 */
	static const char *p_s_FormatVector_Bare(std::string &r_storage, const std::vector<size_t> &r_vec)
	{
		r_storage.clear();
		return (b_Append_Elements(r_storage, r_vec))? r_storage.c_str() : 0;
	}

	/**
	 *	@brief formats a vector as a label followed by comma-separated numbers in square brackets
	 *
	 *	@param[out] r_storage is the string to hold the result
	 *	@param[in] p_s_label is the label to put in front of the opening bracket
	 *	@param[in] r_vec is the vector to format
	 *
	 *	@return Returns pointer to the null-terminated result (stored in r_storage) on success,
	 *		or null on failure (the contents of r_storage are then incomplete).
	 */
	static const char *p_s_FormatVector(std::string &r_storage, const char *p_s_label,
		const std::vector<size_t> &r_vec)
	{
		if(!stl_ut::AssignCStr(r_storage, p_s_label) ||
		   !stl_ut::AppendCStr(r_storage, "[") ||
		   !b_Append_Elements(r_storage, r_vec) ||
		   !stl_ut::AppendCStr(r_storage, "]"))
			return 0;
		return r_storage.c_str();
	}

protected:
	/**
	 *	@brief looks up the record of a benchmark; shared by \ref b_HaveResult() and \ref n_Result_Num()
	 *	@return Returns pointer to the vector of the recorded times, or null
	 *		if there is no such record or if formatting the map keys failed.
	 */
	static const std::vector<double> *p_Find_Result(const CKernel_Device_Tuning_ResultMap &r_results_map,
		const std::string &r_s_kernel_id, const std::string &r_s_device_id,
		const std::vector<size_t> &r_input_config, const std::vector<size_t> &r_tuning_config)
	{
		CKernel_Device_Tuning_ResultMap::const_iterator p_kernel_it = r_results_map.find(r_s_kernel_id);
		if(p_kernel_it == r_results_map.end())
			return 0; // no such kernel
		CDevice_Tuning_ResultMap::const_iterator p_device_it = (*p_kernel_it).second.find(r_s_device_id);
		if(p_device_it == (*p_kernel_it).second.end())
			return 0; // no such device
		std::string s_input_str, s_tuning_str;
		if(!p_s_FormatVector_Bare(s_input_str, r_input_config) ||
		   !p_s_FormatVector_Bare(s_tuning_str, r_tuning_config))
			return 0; // failure to find out
		CTuning_ResultMap::const_iterator p_tuning_it = (*p_device_it).second.find(s_tuning_str);
		if(p_tuning_it == (*p_device_it).second.end())
			return 0; // no such tuning
		CResultMap::const_iterator p_result_it = (*p_tuning_it).second.find(s_input_str);
		if(p_result_it == (*p_tuning_it).second.end())
			return 0; // no such input configuration
		return &(*p_result_it).second;
		// kernel / device / tuning / configuration <- result
	}

	/**
	 *	@brief appends comma-separated elements of a vector to a string
	 *	@return Returns true on success, false on failure.
	 *	@note This appends using stl_ut::AppendCStr() whose return value is checked (the same as
	 *		for the brackets in p_s_FormatVector()) rather than using the throwing operator +=,
	 *		so that a failure to append is reported the same way as a failure to format.
	 */
	static bool b_Append_Elements(std::string &r_storage, const std::vector<size_t> &r_vec)
	{
		for(size_t i = 0, n = r_vec.size(); i < n; ++ i) {
			char p_s_num[256];
			if(!stl_ut::Format(p_s_num, sizeof(p_s_num), (i)? "," PRIsize : PRIsize, r_vec[i]) ||
			   !stl_ut::AppendCStr(r_storage, p_s_num))
				return false;
		}
		return true;
	}
};

/**
 *	@brief this algorithm chooses cuts of the 1D configuration space
 *		(e.g. input data sizes) and assigns optimal tuning for each of the cuts
 */
class CAutotuneCuts_1D {
public:
	/**
	 *	@brief a span of consecutive configurations sharing a single tuning
	 */
	struct TTuningSpan {
		size_t n_first_configuration; /**< @brief zero-based index of the first configuration of the span */
		size_t n_last_configuration; /**< @brief zero-based index of the last configuration of the span (inclusive) */
		size_t n_tuning; /**< @brief zero-based index of the tuning assigned to the span */

		/**
		 *	@brief default constructor; sets all the members to zero
		 *		so that no indeterminate values are ever read
		 */
		TTuningSpan()
			:n_first_configuration(0), n_last_configuration(0), n_tuning(0)
		{}

		/**
		 *	@brief constructor; fills the members
		 *
		 *	@param[in] _n_first_configuration is index of the first configuration
		 *	@param[in] _n_last_configuration is index of the last configuration (inclusive)
		 *	@param[in] _n_tuning is index of the tuning
		 */
		TTuningSpan(size_t _n_first_configuration,
			size_t _n_last_configuration, size_t _n_tuning)
			:n_first_configuration(_n_first_configuration),
			n_last_configuration(_n_last_configuration),
			n_tuning(_n_tuning)
		{}
	};

	/**
	 *	@brief value formatter; prints the value as time in seconds
	 */
	class CFormatTime {
	public:
		/**
		 *	@brief formats a time
		 *
		 *	@param[out] r_s_dest is the string to hold the result
		 *	@param[in] n_configuration is index of the configuration (unused)
		 *	@param[in] f_value is the time to format
		 */
		void operator ()(std::string &r_s_dest,
			size_t UNUSED(n_configuration), double f_value) const
		{
			// the result of Format() is deliberately not checked, errors are ignored
			stl_ut::Format(r_s_dest, PRIvalueMP "s", PRIvalueMPparams(f_value));
		}
	};

	/**
	 *	@brief value formatter; prints the amount of data of a configuration divided by the time
	 *	@note This keeps a reference to the vector of input sizes,
	 *		which must outlive this object.
	 */
	class CFormatGBpS {
	protected:
		const std::vector<size_t> &m_r_input_sizes; /**< @brief reference to the input sizes (in elements), indexed by configuration */
		const size_t m_n_elem_size; /**< @brief multiplier of the input size (size of a single element) */

	public:
		/**
		 *	@brief constructor
		 *
		 *	@param[in] r_input_sizes is the vector of input sizes, indexed by
		 *		configuration (only a reference is stored)
		 *	@param[in] n_elem_size is size of a single element
		 */
		CFormatGBpS(const std::vector<size_t> &r_input_sizes, size_t n_elem_size)
			:m_r_input_sizes(r_input_sizes), m_n_elem_size(n_elem_size)
		{}

		/**
		 *	@brief formats a throughput
		 *
		 *	@param[out] r_s_dest is the string to hold the result
		 *	@param[in] n_configuration is index of the configuration
		 *		(must be a valid index into the input sizes)
		 *	@param[in] f_time is the measured time
		 */
		inline void operator ()(std::string &r_s_dest,
			size_t n_configuration, double f_time) const
		{
			const double f_bytes = double(m_r_input_sizes[n_configuration]) * double(m_n_elem_size);
			// multiply in double rather than in size_t so that the product cannot wrap around
			// (not scaled by 1e-9; PRIvalueMP adds the G suffix, also using only 1e-9 rather than 9.313e-10)

			stl_ut::Format(r_s_dest, PRIvalueMP, PRIvalueMPparamsExt(f_bytes / f_time, 3, 3, false));
			// ignores errors
		}
	};

	/**
	 *	@brief an array-like object which returns the index itself, formatted as a string
	 */
	class CReturnIndex {
	protected:
		std::string m_s_storage; /**< @brief storage for the most recently formatted index */

	public:
		/**
		 *	@brief formats an index
		 *	@param[in] n_index is the index to format
		 *	@return Returns pointer to the formatted index; it points to the internal
		 *		storage which is overwritten by the next call.
		 */
		const char *operator [](size_t n_index)
		{
			stl_ut::Format(m_s_storage, PRIsize, n_index); // the result is not checked
			const char *p_s_formatted = m_s_storage.c_str();
			return p_s_formatted;
		}
	};

protected:
	/**
	 *	@brief function object which writes inclusive running sums of the costs
	 *		and inclusive running counts of the costs equal to -1.0
	 *	@note The -1.0 costs are added to the running sum of costs as they are.
	 */
	class CCumsumIterator {
	protected:
		std::vector<double>::iterator m_p_cost_it; /**< @brief where to write the next running sum of costs */
		std::vector<size_t>::iterator m_p_fail_it; /**< @brief where to write the next running count of -1.0 costs */
		double m_f_running_cost; /**< @brief sum of all the costs seen so far */
		size_t m_n_fail_num; /**< @brief number of the -1.0 costs seen so far */

	public:
		/**
		 *	@brief constructor; starts with zero sums
		 *
		 *	@param[in] p_cost_it is iterator to the first output running sum of costs
		 *	@param[in] p_fail_it is iterator to the first output running count of -1.0 costs
		 */
		CCumsumIterator(std::vector<double>::iterator p_cost_it,
			std::vector<size_t>::iterator p_fail_it)
			:m_p_cost_it(p_cost_it), m_p_fail_it(p_fail_it),
			m_f_running_cost(0.0), m_n_fail_num(0)
		{}

		/**
		 *	@brief accumulates one cost, writes both of the inclusive
		 *		sums and advances both of the output iterators
		 *	@param[in] f_cost is the next cost in the sequence
		 */
		void operator ()(double f_cost)
		{
			if(f_cost == -1.0)
				++ m_n_fail_num;
			m_f_running_cost += f_cost;
			// update the sums first so that the written values are inclusive

			*m_p_cost_it ++ = m_f_running_cost;
			*m_p_fail_it ++ = m_n_fail_num;
		}
	};

protected:
	const size_t m_n_max_cut_num; /**< @brief maximum number of cuts */
	const size_t m_n_tuning_num; /**< @brief the dimension that is fixed between the cuts */
	const size_t m_n_config_num; /**< @brief the dimension to optimize the cuts along */

	std::vector<double> m_cost_matrix; /**< @brief matrix of (positive) costs or -1.0 for prohibited configuration */
	std::vector<double> m_cost_cumsum_matrix; /**< @brief matrix of configuration-wise cumulative sums of costs (interpreting the prohibited costs as zero costs) */
	std::vector<size_t> m_fail_cumsum_matrix; /**< @brief matrix of configuration-wise cumulative sums of prohibited configurations */
	// all the matrices are stored so that the individual configurations
	// for a specific tuning are contiguous (so "configuration-major" order)

	std::vector<TTuningSpan> m_tuning; /**< @brief the spans (separated by the calculated cuts) and the associated lowest-cost tunings */
	// calculated by Find_Cuts(), has up to m_n_max_cut_num + 1 elements (n cuts make n + 1 spans)

public:
	/**
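	 *	@brief constructor; allocates the input arrays
	 *
	 *	@param[in] n_max_cut_num is the maximum number of cuts of the configuration space
	 *	@param[in] n_tuning_num is the number of tunings
	 *	@param[in] n_configuration_num is the number of configurations (must not be zero)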
	 *
	 *	@note This might fail to allocate the required memory if there are many
	 *		different configurations and tunings; use \ref b_Status() to see if
	 *		this succeeded.
	 */
	CAutotuneCuts_1D(size_t n_max_cut_num, size_t n_tuning_num, size_t n_configuration_num)
		:m_n_max_cut_num(n_max_cut_num), m_n_tuning_num(n_tuning_num),
		m_n_config_num(n_configuration_num)
	{
		_ASSERTE(n_configuration_num > 0 && n_tuning_num <= SIZE_MAX / n_configuration_num);
		// the number of matrix elements must not overflow

		const size_t n_element_num = n_tuning_num * n_configuration_num;
		// all three matrices have one element per tuning / configuration combination

		stl_ut::Resize_To_N(m_cost_matrix, n_element_num, -1.0); // everything is forbidden at first
		stl_ut::Resize_To_N(m_cost_cumsum_matrix, n_element_num);
		stl_ut::Resize_To_N(m_fail_cumsum_matrix, n_element_num);
		// the results are not checked here, b_Status() compares the sizes later
	}

	bool b_Status() const
	{
		const size_t n_expected_size = m_n_tuning_num * m_n_config_num;
		// size that each of the matrices should have been allocated to in the constructor

		if(m_cost_matrix.size() != n_expected_size)
			return false;
		if(m_cost_cumsum_matrix.size() != n_expected_size)
			return false;
		return m_fail_cumsum_matrix.size() == n_expected_size;
	}

	/**
	 *	@brief sets all the configuration and tuning combinations as forbidden
	 *	@note Initially, all the configuration and tuning combinations are forbidden.
	 */
	void Clear_Costs()
	{
		// -1.0 is the marker of a forbidden combination
		for(size_t i = 0, n = m_cost_matrix.size(); i < n; ++ i)
			m_cost_matrix[i] = -1.0;
	}

	/**
	 *	@brief marks a specific configuration / tuning combination as forbidden
	 *	@note Use \ref Clear_Costs() to set all the configuration and tuning
	 *		combinations as forbidden, use \ref Set_Cost() to set a configuration /
	 *		tuning combination as permitted and assign cost.
	 */
	void Set_Forbidden(size_t n_configuration, size_t n_tuning)
	{
		_ASSERTE(b_Status());
		_ASSERTE(n_configuration < m_n_config_num);
		_ASSERTE(n_tuning < m_n_tuning_num);

		const size_t n_element = m_n_config_num * n_tuning + n_configuration;
		// the configurations of a single tuning are stored contiguously

		m_cost_matrix[n_element] = -1.0; // the marker of a forbidden combination
	}

	/**
	 *	@brief marks a specific configuration / tuning combination as permitted and assigns cost
	 */
	void Set_Cost(size_t n_configuration, size_t n_tuning, double f_cost)
	{
		_ASSERTE(b_Status());
		_ASSERTE(f_cost >= 0); // costs must be nonnegative; negative values mark failure
		_ASSERTE(n_configuration < m_n_config_num);
		_ASSERTE(n_tuning < m_n_tuning_num);

		const size_t n_element = m_n_config_num * n_tuning + n_configuration;
		// the configurations of a single tuning are stored contiguously

		m_cost_matrix[n_element] = f_cost;
	}

	/**
	 *	@brief gets the number of configuration space spans
	 *	@return Returns the number of spans the configuration space is cut into.
	 */
	size_t n_Span_Num() const
	{
		const size_t n_span_num = m_tuning.size(); // one element per span
		return n_span_num;
	}

	/**
	 *	@brief gets a configuration space span and the associated tuning
	 *	@param[in] n_index is zero-based index of configuration space span
	 *	@return Returns the selected configuration space span.
	 */
	const TTuningSpan &t_Span(size_t n_index) const
	{
		_ASSERTE(n_index < m_tuning.size());
		// the index must be below n_Span_Num(); checked in debug the same
		// way as the indices in Set_Cost() and Set_Forbidden()

		return m_tuning[n_index];
	}

	/**
	 *	@brief replaces the costs by their ranks, separately for each input configuration
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note In case the range of costs is high for different input configurations,
	 *		this hides the difference, possibly biasing the tuning towards the smaller
	 *		costs where the absolute differences are much lower than in the large costs.
	 *	@note An alternative to ranking is normalizing the costs, see \ref Normalize_Costs().
	 */
	bool Rank_Costs()
	{
		_ASSERTE(b_Status());
		try {
			typedef std::pair<double, size_t> TCostTuning; // cost and the index of the tuning it belongs to

			std::vector<TCostTuning> ranking;
			ranking.reserve(m_n_tuning_num);
			// allocate once, there can be at most one entry per tuning

			for(size_t n_config = 0; n_config < m_n_config_num; ++ n_config) {
				ranking.clear();
				for(size_t n_tuning = 0; n_tuning < m_n_tuning_num; ++ n_tuning) {
					const double f_cost = m_cost_matrix[n_tuning * m_n_config_num + n_config];
					if(f_cost != -1.0)
						ranking.push_back(TCostTuning(f_cost, n_tuning));
				}
				// collect the permitted costs of this configuration, remember where they came from

				std::sort(ranking.begin(), ranking.end());
				// order by cost; in case of equal costs, the lower tuning index goes first

				for(size_t n_rank = 0, n_ranked_num = ranking.size(); n_rank < n_ranked_num; ++ n_rank)
					m_cost_matrix[ranking[n_rank].second * m_n_config_num + n_config] = double(n_rank);
				// replace the costs by their zero-based ranks (the -1.0 elements are left untouched)
			}
		} catch(std::bad_alloc&) {
			return false;
		}
		return true;
	}

	/**
	 *	@brief normalizes costs, separately for each input configuration
	 *
	 *	@param[in] b_divide_by_config_min chooses normalization algorithm
	 *		(if set, the costs are divided by a minimum which leads to a slowdown matrix,
	 *		if cleared, the costs are normalized to fit into the [0, 1] interval)
	 *	@param[in] b_use_log converts the costs to logarithmic scale
	 *	@param[in] b_use_exp converts the costs to exponential scale
	 *
	 *	@note In case the range of costs is high for different input configurations,
	 *		only calculating the slowdown matrix makes the costs less sensitive to the
	 *		changes in the small costs and more sensitive to the changes in the large costs.
	 *	@note An alternative way of normalizing the costs is ranking, see \ref Rank_Costs().
	 */
	void Normalize_Costs(bool b_divide_by_config_min = true,
		bool b_use_log = false, bool b_use_exp = false)
	{
		_ASSERTE(!b_use_exp || !b_use_log); // mutually exclusive
		_ASSERTE(b_Status());

		const bool b_log_scale = b_use_log && !b_use_exp;
		// if both flags are set (asserted against above), the exponential scale wins

		for(size_t i = 0; i < m_n_config_num; ++ i) {
			bool b_have_cost = false;
			double f_min_time = 1, f_max_time = 1;
			for(size_t j = 0; j < m_n_tuning_num; ++ j) {
				const double f_cost = m_cost_matrix[j * m_n_config_num + i];
				if(f_cost == -1.0)
					continue;
				if(!b_have_cost) {
					f_min_time = f_max_time = f_cost;
					b_have_cost = true;
				} else {
					if(f_min_time > f_cost)
						f_min_time = f_cost;
					if(f_max_time < f_cost)
						f_max_time = f_cost;
				}
			}
			if(!b_have_cost)
				continue;
			// find the minimum and the maximum of the permitted costs of this
			// configuration; if all the tunings are forbidden, there is nothing to do

			if(b_log_scale) {
				f_min_time = log(f_min_time);
				f_max_time = log(f_max_time);
			}
			const double f_range = f_max_time - f_min_time;
			// zero if there is only a single permitted cost or if all of them are equal
			// NOTE(review): a permitted cost of exactly zero still yields infinite or NaN
			// values when dividing by the minimum or when taking logarithms; this is unchanged

			for(size_t j = 0; j < m_n_tuning_num; ++ j) {
				double &r_f_cost = m_cost_matrix[j * m_n_config_num + i];
				if(r_f_cost == -1.0)
					continue;
				if(b_log_scale) {
					r_f_cost = log(r_f_cost) - f_min_time; // log-slowdown
					if(!b_divide_by_config_min)
						r_f_cost = (f_range != 0)? r_f_cost / f_range : 0;
				} else {
					if(b_divide_by_config_min)
						r_f_cost /= f_min_time; // turns into a slowdown matrix
					else {
						r_f_cost -= f_min_time;
						r_f_cost = (f_range != 0)? r_f_cost / f_range : 0;
						// all the costs being equal means that all of them are at the minimum;
						// dividing by the zero range would make them NaN
					}
					if(b_use_exp)
						r_f_cost = exp(r_f_cost);
				}
			}
		}
	}

	/**
	 *	@brief calculates the cuts over the configuration space for autotuning
	 *	@return Returns true on success, false on failure.
	 */
	bool Find_Cuts()
	{
		_ASSERTE(b_Status());
		if(!b_Status())
			return false; // fail
		if(!m_n_config_num)
			return false; // no configurations, nothing to cut (the constructor asserts against this)

		Clear_FailedConfigurations();
		Calculate_Cumsums();

		const size_t n_cut_num = (m_n_max_cut_num < m_n_config_num)?
			m_n_max_cut_num : m_n_config_num - 1;
		// n cuts make n + 1 nonempty spans, so there can be at most m_n_config_num - 1 cuts;
		// without the clamp, the tables below would be written and read out of bounds

		_ASSERTE(n_cut_num <= SIZE_MAX - 2); // overflow
		const size_t k = n_cut_num + 2; // the cut at the beginning / end of the configuration space is implied
		_ASSERTE(m_n_config_num <= SIZE_MAX / k); // overflow

		std::vector<size_t> p_predecessor_table, p_tuning_table;
		std::vector<double> p_cost_pingpong;
		if(!stl_ut::Resize_To_N(p_predecessor_table, k * m_n_config_num, 0) ||
		   !stl_ut::Resize_To_N(p_tuning_table, k * m_n_config_num, 0) ||
		   !stl_ut::Resize_To_N(p_cost_pingpong, 2 * m_n_config_num, .0))
			return false;
		// all the tables are in "configuration major" order; row j of the predecessor and
		// tuning tables describes the best way to cover configurations [0, i] using j + 1
		// spans, the costs are only kept for the current and the previous row (ping-pong)

		for(size_t i = 0; i < m_n_config_num; ++ i) {
			std::pair<double, size_t> t_min = t_FindMinSpan(0, i);
			p_cost_pingpong[0 * m_n_config_num + i] = t_min.first;
			p_tuning_table[0 * m_n_config_num + i] = t_min.second;
		}
		// bootstrap with costs of staying in a single straight row forever

		// the below algorithm is n_size_num * k - (k * k - k) / 2 evaluations of t_MinStraightRow() which is O(n_tuning_num)
		// so all of it is less than O(n_size_num * n_tuning_num * k) which is essentially the number of benchmarks (linear time then)
		// using BFS would take O(num_edges) which would likely be about O(n_tuning_num + n_size_num * ((n_tuning_num^2 - n_tuning_num) / 2 + n_tuning_num)) which is about O(n_size_num * n_tuning_num^2)

		size_t n_cur = m_n_config_num, n_prev = 0;
		for(size_t j = 1; j < k; ++ j, std::swap(n_cur, n_prev)) {
			for(size_t i = 0; i < j; ++ i) {
				p_cost_pingpong[n_cur + i] = 1e300;
				p_tuning_table[j * m_n_config_num + i] = size_t(-1);
				p_predecessor_table[j * m_n_config_num + i] = size_t(-1);
			}
			// the first j configurations cannot be covered by j + 1 nonempty spans, mark as invalid
			// (j <= k - 1 <= m_n_config_num thanks to the clamp above, so this stays inside the row)

			for(size_t i = j; i < m_n_config_num; ++ i) {
				size_t n_pred = size_t(-1);
				std::pair<double, size_t> t_best(1e300, size_t(-1));
				for(size_t l = 0; l < i; ++ l) { // only look left
					std::pair<double, size_t> t_cost = t_FindMinSpan(l + 1, i); // cost of staying in one row in [l + 1, i]
					t_cost.first += p_cost_pingpong[n_prev + l]; // cost of the best cover of [0, l] with one span less
					if(t_cost.first < t_best.first) {
						t_best = t_cost;
						n_pred = l;
					}
				}
				_ASSERTE(n_pred != size_t(-1));
				p_predecessor_table[j * m_n_config_num + i] = n_pred;
				p_cost_pingpong[n_cur + i] = t_best.first;
				p_tuning_table[j * m_n_config_num + i] = t_best.second;
			}
		}

		try {
			m_tuning.clear();
			m_tuning.reserve(k - 1);
		} catch(std::bad_alloc&) {
			return false;
		}
		// discard the spans left from a previous call (otherwise the new spans would be
		// prepended to or merged with the old ones) and allocate up front so that
		// the insertions below cannot fail

		size_t n_best_cost = m_n_config_num - 1; // must end at m_n_config_num - 1
		for(size_t i = k - 1; i > 0;) {
			-- i;
			size_t n_to = n_best_cost, n_from = (i)? p_predecessor_table[i * m_n_config_num + n_best_cost] + 1 : 0;
			if(m_tuning.empty() || m_tuning.front().n_tuning != p_tuning_table[i * m_n_config_num + n_best_cost]) {
				m_tuning.insert(m_tuning.begin(), TTuningSpan(n_from,
					n_to, p_tuning_table[i * m_n_config_num + n_best_cost]));
				// add
			} else {
				_ASSERTE(m_tuning.front().n_first_configuration == n_to + 1);
				m_tuning.front().n_first_configuration = n_from;
				// extend (to the left!)
			}
			n_best_cost = p_predecessor_table[i * m_n_config_num + n_best_cost];
		}
		// backtrack from the last configuration in row k - 2, merging
		// the neighboring spans which ended up with the same tuning

		_ASSERTE(!m_tuning.empty());
		_ASSERTE(m_tuning.front().n_first_configuration == 0);
		_ASSERTE(m_tuning.back().n_last_configuration + 1 == m_n_config_num);
		// make sure this somehow spans all the configurations

#ifdef _DEBUG
		for(size_t i = 0, n = m_tuning.size(); i < n; ++ i) {
			_ASSERTE(m_tuning[i].n_tuning == t_FindMinSpan(m_tuning[i].n_first_configuration,
				m_tuning[i].n_last_configuration).second); // make sure these are indeed the minimal cost ones
		}
#endif // _DEBUG

		return true;
	}

	// initialize TKernelAutotuneInfo specialization for autotuning specification at runtime
	template <class CATSpec, class CAlgoSpec, class CSizeDataType, class CTuningDataType>
	bool Initialize_AutotuneInfo(CATSpec &r_dest, const char *p_s_device_id,
		CAlgoSpec t_algorithm_specialization, const CSizeDataType *p_configurations_list,
		size_t UNUSED(n_configuration_num), CSizeDataType t_max_configuration,
		const CTuningDataType *p_tuning_list, size_t UNUSED(n_tuning_num))
	{
		_ASSERTE(n_configuration_num == m_n_config_num);
		_ASSERTE(n_tuning_num == m_n_tuning_num);
		size_t k = m_n_max_cut_num + 1;
		if(CATSpec::n_max_specialization_num < k)
			return false; // not large enough
		// size checks

		r_dest.p_s_device_id = p_s_device_id;
		r_dest.t_algorithm_specialization = t_algorithm_specialization;
		// fill device name and data type size

		std::set<size_t> employed_tunings;
		try {
			for(size_t i = 0, n = m_tuning.size(); i < n; ++ i)
				employed_tunings.insert(m_tuning[i].n_tuning);
		} catch(std::bad_alloc&) {
			return false;
		}
		// collect unique tunings

		r_dest.n_tuning_num = employed_tunings.size();
		std::set<size_t>::const_iterator p_tuning_it = employed_tunings.begin();
		for(size_t i = 0, n = employed_tunings.size(); i < n; ++ i, ++ p_tuning_it)
			r_dest.p_tuning[i] = p_tuning_list[*p_tuning_it];
		size_t n_last_tuning = *(-- employed_tunings.end());
		for(size_t i = employed_tunings.size(); i < CATSpec::n_max_specialization_num; ++ i)
			r_dest.p_tuning[i] = p_tuning_list[n_last_tuning]; // repeat the last one
		// fill the tunings

		for(size_t i = 0, n = m_tuning.size(); i < n; ++ i) {
			r_dest.p_tuning_index[i] = size_t(std::distance(employed_tunings.begin(),
				employed_tunings.find(m_tuning[i].n_tuning)));
			if(i + 1 < CATSpec::n_max_specialization_num) { // the last one is not stored
				if(m_tuning[i].n_last_configuration + 1 == m_n_config_num)
					r_dest.p_configuration[i] = t_max_configuration;
				else
					r_dest.p_configuration[i] = p_configurations_list[m_tuning[i].n_last_configuration + 1];
			}
		}
		for(size_t i = m_tuning.size(); i < CATSpec::n_max_specialization_num; ++ i) {
			r_dest.p_tuning_index[i] = size_t(std::distance(employed_tunings.begin(),
				employed_tunings.find(m_tuning.back().n_tuning))); // repeat the last one
			if(i + 1 < CATSpec::n_max_specialization_num) // the last one is not stored
				r_dest.p_configuration[i] = t_max_configuration;
		}
		// fill the configurations

		return true;
	}

	/**
	 *	@brief chooses a threshold between two values, preferring "round" candidates
	 *
	 *	The candidates are n_Make_POT(n_left), n_Make_Lower_POT(n_mid), n_Make_POT(n_mid)
	 *	and n_Make_Lower_POT(n_right), where n_mid is the midpoint of the two values
	 *	(those helpers are defined elsewhere; presumably they round up / down to a power
	 *	of two - confirm). Only the candidates strictly between n_left and n_right are
	 *	considered and the one closest to the midpoint wins (the earlier candidate wins a tie).
	 *
	 *	@param[in] n_left is the lower bound
	 *	@param[in] n_right is the upper bound (must be greater than n_left)
	 *
	 *	@return Returns the chosen candidate, or the midpoint if no candidate lies strictly
	 *		between the bounds (note that the midpoint equals n_left if n_right == n_left + 1).
	 */
	static size_t n_POT_inBetween(size_t n_left, size_t n_right)
	{
		_ASSERTE(n_right > n_left); // so that the below line does not overflow
		const size_t n_mid = n_left + (n_right - n_left) / 2;

		const size_t p_candidate[4] = {
			n_Make_POT(n_left), n_Make_Lower_POT(n_mid),
			n_Make_POT(n_mid), n_Make_Lower_POT(n_right)
		};

		bool b_have_best = false;
		size_t n_best_pot = n_mid; // fallback if no candidate fits
		size_t n_best_distance = 0;
		for(size_t j = 0; j < 4; ++ j) {
			const size_t n_pot = p_candidate[j];
			if(n_pot <= n_left || n_pot >= n_right)
				continue;
			// must lie strictly in between

			const size_t n_distance = (n_pot > n_mid)? n_pot - n_mid : n_mid - n_pot;
			// absolute difference of unsigned values; std::abs() is not usable for
			// size_t and the plain subtraction would wrap around

			if(!b_have_best || n_best_distance > n_distance) {
				b_have_best = true;
				n_best_pot = n_pot;
				n_best_distance = n_distance;
			}
			// repeated candidates never win over the first occurrence (strict comparison)
		}
		return n_best_pot;
	}

	// this is somehow limited to the configurations being integers but provides
	// logic for selecting in-betweens (otherwis those would be strings and the
	// caller would have to do that).
	// p_s_trailine_TKernelAutotuneInfo_specializer_list could be something like
	// ", std::pair&lt;size_t, size_t&gt;, size_t, std::less&lt;size_t&gt;"
	// (needs to start with ", ").
	bool Generate_HTML_Report_InitializerCode(std::string &r_s_html,
		const char *p_s_device_id, size_t n_data_type_size,
		const std::vector<size_t> &r_configuration_thresh_list,
		const std::vector<std::string> &r_tuning_constructor_list,
		const char *p_s_trailing_TKernelAutotuneInfo_specializer_list,
		const char *p_s_kernel_autotune_info_data_type_name = 0, bool b_stick_to_POTs = true)
	{
		size_t k = m_n_max_cut_num + 1;
		if(p_s_kernel_autotune_info_data_type_name) {
			stl_ut::Format(r_s_html, "%s(\"%s\", " PRIsize ", // device and data type size\n    ",
				p_s_kernel_autotune_info_data_type_name,
				p_s_device_id, n_data_type_size);
		} else {
			stl_ut::Format(r_s_html, "TKernelAutotuneInfo&lt;" PRIsize
				"%s&gt;(\"%s\", " PRIsize ", // device and data type size\n    ", k,
				p_s_trailing_TKernelAutotuneInfo_specializer_list,
				p_s_device_id, n_data_type_size);
		}

		std::set<size_t> employed_tunings;
		for(size_t i = 0, n = m_tuning.size(); i < n; ++ i)
			employed_tunings.insert(m_tuning[i].n_tuning);
		// collect unique tunings

		_ASSERTE(m_tuning.size() <= k); // not more!
		std::string s_tuning;
		for(size_t i = 0, n = m_tuning.size(); i < n; ++ i) {
			size_t n_thresh_right;
			if(m_tuning[i].n_last_configuration + 1 == r_configuration_thresh_list.size())
				n_thresh_right = SIZE_MAX;
			else {
				size_t n_left = r_configuration_thresh_list[m_tuning[i].n_last_configuration];
				size_t n_right = r_configuration_thresh_list[m_tuning[i].n_last_configuration + 1];
				_ASSERTE(n_right >= n_left);
				n_thresh_right = n_left + (n_right - n_left) / 2; // average without overflowing
				if(b_stick_to_POTs && n_Make_POT(n_thresh_right) > n_left && n_Make_POT(n_thresh_right) < n_right)
					n_thresh_right = n_Make_POT(n_thresh_right);
				else if(b_stick_to_POTs && n_Make_POT(n_left) > n_left && n_Make_POT(n_left) < n_right)
					n_thresh_right = n_Make_POT(n_left);
			}
			if(n_thresh_right == SIZE_MAX) {
				stl_ut::Format(s_tuning, ", " "SIZE_MAX" ", " PRIsize + ((i)? 0 : 2),
					size_t(std::distance(employed_tunings.begin(), employed_tunings.find(m_tuning[i].n_tuning))));
			} else {
				stl_ut::Format(s_tuning, ", " PRIsize ", " PRIsize + ((i)? 0 : 2), n_thresh_right,
					size_t(std::distance(employed_tunings.begin(), employed_tunings.find(m_tuning[i].n_tuning))));
			}
			stl_ut::Append(r_s_html, s_tuning);
		}
		for(size_t i = m_tuning.size(); i < k; ++ i) {
			stl_ut::Format(s_tuning, ", " "SIZE_MAX" ", " PRIsize,
				size_t(std::distance(employed_tunings.begin(), employed_tunings.find(m_tuning.back().n_tuning))));
			stl_ut::Append(r_s_html, s_tuning);
			// repeat the last one
		}
		std::string s_count;
		stl_ut::Format(s_count, ", // configuration threshold / tuning index pairs\n    " PRIsize, employed_tunings.size());
		stl_ut::Append(r_s_html, s_count);
		std::set<size_t>::const_iterator p_tuning_it = employed_tunings.begin();
		for(size_t i = 0, n = employed_tunings.size(); i < n; ++ i, ++ p_tuning_it) {
			stl_ut::AppendCStr(r_s_html, ", ");
			stl_ut::Append(r_s_html, r_tuning_constructor_list[*p_tuning_it]);
		}
		size_t n_last_tuning = *(-- employed_tunings.end());
		for(size_t i = employed_tunings.size(); i < k; ++ i) {
			stl_ut::AppendCStr(r_s_html, ", ");
			stl_ut::Append(r_s_html, r_tuning_constructor_list[n_last_tuning]); // repeat the last one
		}
		stl_ut::AppendCStr(r_s_html, "); // number of used tunings and the list of tunings");
		return true;
	}

	/**
	 *	@brief generates rows of a HTML table with the costs of all the tunings
	 *
	 *	The first row contains the configuration names, each following row contains the
	 *	tuning name and one cell per configuration. The cells with cost -1.0 are left empty,
	 *	the other cells are colored by the cost relative to the other tunings in the same
	 *	configuration (the cheapest ones are also printed in bold). The cells that are part
	 *	of a chosen span get a solid border around the span.
	 *
	 *	@tparam CResultPresentationFormatter is functor type, called as
	 *		format(std::string&, configuration index, cost) to fill the cell text
	 *	@tparam CTuningNameFormatter is type indexable by tuning, yielding a string
	 *	@tparam CConfigurationNameFormatter is type indexable by configuration, yielding a string
	 *
	 *	@param[in,out] r_s_html is the string to append the rows to
	 *		(without the enclosing table element)
	 *	@param[in] format is cell text formatter
	 *	@param[in] configuration_names is column header source
	 *	@param[in] tuning_names is row header source
	 *
	 *	@return Returns true on success, false on failure (b_Status() not
	 *		set or not enough memory).
	 */
	template <class CResultPresentationFormatter,
	class CTuningNameFormatter, class CConfigurationNameFormatter>
	bool Generate_HTML_Report_Table(std::string &r_s_html,
		CResultPresentationFormatter format = CFormatTime(),
		CConfigurationNameFormatter configuration_names = CReturnIndex(),
		CTuningNameFormatter tuning_names = CReturnIndex())
	{
		_ASSERTE(b_Status());
		if(!b_Status())
			return false; // fail

		const float p_color_low[3] = {106 / 255.0f, 135 / 255.0f, 66 / 255.0f}; // http://www.colourlovers.com/color/6A8742/Natural_Reaction
		const float p_color_mid[3] = {249 / 255.0f, 209 / 255.0f, 1 / 255.0f}; // http://www.colourlovers.com/color/F9D101/Daiseys
		const float p_color_high[3] = {234 / 255.0f, 90 / 255.0f, 69 / 255.0f}; // http://www.colourlovers.com/color/EA5A45/ea5a45
		const float f_knee = .75f;
		// a three-point color ramp; the middle color sits at f_knee

		try {
			std::vector<float> relative_cost(m_n_tuning_num * m_n_config_num);
			for(size_t n_col = 0; n_col < m_n_config_num; ++ n_col) {
				double f_lowest = 1e300, f_highest = -1e300;
				for(size_t n_row = 0; n_row < m_n_tuning_num; ++ n_row) {
					const double f_cell_cost = m_cost_matrix[n_row * m_n_config_num + n_col];
					if(f_cell_cost == -1.0)
						continue; // ignore the forbidden cells
					if(f_lowest > f_cell_cost)
						f_lowest = f_cell_cost;
					if(f_highest < f_cell_cost)
						f_highest = f_cell_cost;
				}
				const double f_scale = (f_highest - f_lowest == 0)? 1 : 1 / (f_highest - f_lowest);
				for(size_t n_row = 0; n_row < m_n_tuning_num; ++ n_row) {
					const size_t n_cell = n_row * m_n_config_num + n_col;
					relative_cost[n_cell] = float((m_cost_matrix[n_cell] - f_lowest) * f_scale);
				}
			}
			// per-column costs rescaled by the column minimum and range, for coloring

			r_s_html += "<tr><th>input [elems]</th>";
			for(size_t n_col = 0; n_col < m_n_config_num; ++ n_col) {
				r_s_html += "<th>";
				r_s_html += p_s_GetStr(configuration_names[n_col]);
				r_s_html += "</th>";
			}
			r_s_html += "</tr>";
			// the header row

			for(size_t n_row = 0; n_row < m_n_tuning_num; ++ n_row) {
				r_s_html += "<tr><th>";
				r_s_html += p_s_GetStr(tuning_names[n_row]);
				r_s_html += "</th>";
				// the row header

				for(size_t n_col = 0; n_col < m_n_config_num; ++ n_col) {
					const size_t n_cell = n_row * m_n_config_num + n_col;
					if(m_cost_matrix[n_cell] == -1.0) {
						r_s_html += "<td style=\"background-color: white; border-style: none;\"></td>";
						continue;
					}
					// the forbidden cells are left blank

					std::string s_cell_text;
					format(s_cell_text, n_col, m_cost_matrix[n_cell]);
					// the text to be inside the cell (e.g. GB/sec)

					const float f_heat = 1 - relative_cost[n_cell];
					if(f_heat == 1)
						s_cell_text = "<strong>" + s_cell_text + "</strong>"; // the cheapest in the column
					// one for the cheapest tuning in this column, zero for the most expensive

					int p_rgb[3];
					for(int n_channel = 0; n_channel < 3; ++ n_channel) {
						float f_channel;
						if(f_heat < f_knee) {
							f_channel = p_color_low[n_channel] + (p_color_mid[n_channel] -
								p_color_low[n_channel]) * f_heat / f_knee;
						} else {
							f_channel = p_color_mid[n_channel] + (p_color_high[n_channel] -
								p_color_mid[n_channel]) * (f_heat - f_knee) / (1 - f_knee);
						}
						p_rgb[n_channel] = std::max(0, std::min(255, int(f_channel * 255)));
					}
					char p_s_color[32];
					stl_ut::Format(p_s_color, sizeof(p_s_color), "#%02x%02x%02x",
						p_rgb[0], p_rgb[1], p_rgb[2]);
					// interpolate the color on the ramp

					bool b_in_span = false, b_span_begin = false, b_span_end = false;
					for(size_t n_span = 0, n_span_num = m_tuning.size();
					   n_span < n_span_num && !b_in_span; ++ n_span) {
						if(m_tuning[n_span].n_tuning != n_row ||
						   n_col < m_tuning[n_span].n_first_configuration ||
						   n_col > m_tuning[n_span].n_last_configuration)
							continue;
						b_in_span = true; // only the first matching span is used
						b_span_begin = (n_col == m_tuning[n_span].n_first_configuration);
						b_span_end = (n_col == m_tuning[n_span].n_last_configuration);
					}
					// see if this cell is a part of a chosen span

					r_s_html += "<td style=\"background-color: ";
					r_s_html += p_s_color;
					r_s_html += "; border-style: "; // (top right bottom left)
					r_s_html += (b_in_span)? "solid " : "none ";
					r_s_html += (b_span_end)? "solid " : "none ";
					r_s_html += (b_in_span)? "solid " : "none ";
					r_s_html += (b_span_begin)? "solid;" : "none;";
					r_s_html += "\">";
					r_s_html += s_cell_text;
					r_s_html += "</td>";
					// the cell itself
				}
				r_s_html += "</tr>\n";
			}
		} catch(std::bad_alloc&) {
			return false;
		}
		return true;
	}

protected:
	/**
	 *	@brief gets a null-terminated string from a name given as a C string
	 *		(an overload for Generate_HTML_Report_Table(), where the names can be of either type)
	 *	@param[in] p_s_str is the name
	 *	@return Returns p_s_str unchanged.
	 */
	static const char *p_s_GetStr(const char *p_s_str)
	{
		return p_s_str;
	}

	/**
	 *	@brief gets a null-terminated string from a name given as std::string
	 *		(an overload for Generate_HTML_Report_Table(), where the names can be of either type)
	 *	@param[in] r_s_str is the name
	 *	@return Returns pointer to the internal buffer of r_s_str
	 *		(valid only while r_s_str exists and is not modified).
	 */
	static const char *p_s_GetStr(const std::string &r_s_str)
	{
		return r_s_str.c_str();
	}

	/**
	 *	@brief finds the cheapest tuning for a span of configurations
	 *
	 *	Only the tunings which did not fail anywhere in the span are considered;
	 *	if several have the same cost, the one with the lowest index is chosen.
	 *
	 *	@param[in] n_first_config is zero-based index of the first configuration
	 *	@param[in] n_last_config is zero-based index of the last configuration (inclusive)
	 *
	 *	@return Returns a pair of the span cost and the tuning index, or a pair
	 *		of 1e300 and size_t(-1) if every tuning fails somewhere in the span.
	 *
	 *	@note This runs in O(m_n_tuning_num).
	 */
	std::pair<double, size_t> t_FindMinSpan(size_t n_first_config,
		size_t n_last_config) const // O(m_n_tuning_num)
	{
		_ASSERTE(n_first_config <= n_last_config && n_last_config < m_n_config_num);

		const size_t n_no_tuning = size_t(-1);
		std::pair<double, size_t> t_best(1e300, n_no_tuning);
		for(size_t n_candidate = 0; n_candidate < m_n_tuning_num; ++ n_candidate) {
			if(n_Span_Fail_Num(n_candidate, n_first_config, n_last_config) != 0)
				continue;
			// skip the tunings that fail in this span

			const double f_candidate_cost = f_Span_Cost(n_candidate, n_first_config, n_last_config);
			if(t_best.second == n_no_tuning || t_best.first > f_candidate_cost) {
				t_best.first = f_candidate_cost;
				t_best.second = n_candidate;
			}
			// the first usable tuning is always taken, the next ones only if strictly cheaper
		}

		return t_best;
	}

	/**
	 *	@brief calculates the sum of costs of a tuning over a span of configurations
	 *
	 *	@param[in] n_tuning is zero-based index of the tuning
	 *	@param[in] n_first_config is zero-based index of the first configuration
	 *	@param[in] n_last_config is zero-based index of the last configuration (inclusive)
	 *
	 *	@return Returns the sum of the costs, taken from the cumulative sum matrix.
	 *
	 *	@note This runs in O(1) (debug builds also verify the result in linear time).
	 */
	inline double f_Span_Cost(size_t n_tuning,
		size_t n_first_config, size_t n_last_config) const // O(1)
	{
		_ASSERTE(n_tuning < m_n_tuning_num);
		_ASSERTE(n_first_config <= n_last_config && n_last_config < m_n_config_num);

		const size_t n_row_offset = n_tuning * m_n_config_num;
		const size_t n_span_begin = n_row_offset + n_first_config;
		const size_t n_span_last = n_row_offset + n_last_config;
		// the span in the row-major matrix indices

		double f_span_cost;
		if(n_first_config)
			f_span_cost = m_cost_cumsum_matrix[n_span_last] - m_cost_cumsum_matrix[n_span_begin - 1];
		else
			f_span_cost = m_cost_cumsum_matrix[n_span_last]; // nothing precedes the span in this row
		// a difference of two cumulative sums

#ifdef _DEBUG
		double f_reference = std::accumulate(m_cost_matrix.begin() + n_span_begin,
			m_cost_matrix.begin() + (n_span_last + 1), 0.0);
		_ASSERTE(fabs((f_span_cost - f_reference) / std::max(fabs(f_reference), 1.0)) < 1e-3);
#endif // _DEBUG
		return f_span_cost;
	}

	/**
	 *	@brief counts the failed runs of a tuning in a span of configurations
	 *
	 *	@param[in] n_tuning is zero-based index of the tuning
	 *	@param[in] n_first_config is zero-based index of the first configuration
	 *	@param[in] n_last_config is zero-based index of the last configuration (inclusive)
	 *
	 *	@return Returns the number of failures, taken from the cumulative failure count
	 *		matrix (the debug check counts the costs equal to -1.0).
	 *
	 *	@note This runs in O(1) (debug builds also verify the result in linear time).
	 */
	inline size_t n_Span_Fail_Num(size_t n_tuning,
		size_t n_first_config, size_t n_last_config) const // O(1)
	{
		_ASSERTE(n_tuning < m_n_tuning_num);
		_ASSERTE(n_first_config <= n_last_config && n_last_config < m_n_config_num);

		const size_t n_row_offset = n_tuning * m_n_config_num;
		const size_t n_span_begin = n_row_offset + n_first_config;
		const size_t n_span_last = n_row_offset + n_last_config;
		// the span in the row-major matrix indices

		size_t n_fail_num;
		if(n_first_config)
			n_fail_num = m_fail_cumsum_matrix[n_span_last] - m_fail_cumsum_matrix[n_span_begin - 1];
		else
			n_fail_num = m_fail_cumsum_matrix[n_span_last]; // nothing precedes the span in this row
		// a difference of two cumulative counts

#ifdef _DEBUG
		size_t n_reference = 0;
		for(size_t n_cell = n_span_begin; n_cell <= n_span_last; ++ n_cell) {
			if(m_cost_matrix[n_cell] == -1.0)
				++ n_reference;
		}
		_ASSERTE(n_fail_num == n_reference);
#endif // _DEBUG
		return n_fail_num;
	}

	/**
	 *	@brief zeroes the costs in the configurations where every tuning has failed
	 *
	 *	There is nothing to choose from in such a configuration: some tuning must be
	 *	used and there is no information about which one might work, so all of them
	 *	are treated equally (a failure is a cost of -1.0).
	 */
	void Clear_FailedConfigurations()
	{
		for(size_t n_config = 0; n_config < m_n_config_num; ++ n_config) {
			bool b_any_succeeded = false;
			for(size_t n_tuning = 0; n_tuning < m_n_tuning_num && !b_any_succeeded; ++ n_tuning)
				b_any_succeeded = !(m_cost_matrix[n_tuning * m_n_config_num + n_config] == -1.0);
			if(b_any_succeeded)
				continue;
			// at least one tuning works here, the autotuner has something to choose from

			for(size_t n_tuning = 0; n_tuning < m_n_tuning_num; ++ n_tuning)
				m_cost_matrix[n_tuning * m_n_config_num + n_config] = 0;
			// all the tunings failed; make them all cost the same (nothing)
		}
	}

	/**
	 *	@brief calculates cost and failure cumsum matrix from the cost matrix
	 *
	 *	Each row of the cost matrix (one tuning, m_n_config_num configurations) is
	 *	processed separately, writing to the same row of m_cost_cumsum_matrix and
	 *	m_fail_cumsum_matrix; the sums therefore restart with each tuning.
	 *
	 *	@note The accumulation itself is done by CCumsumIterator, which is defined elsewhere
	 *		(f_Span_Cost() and n_Span_Fail_Num() expect inclusive prefix sums of the costs
	 *		and of the numbers of costs equal to -1.0, respectively).
	 */
	void Calculate_Cumsums()
	{
		for(size_t i = 0; i < m_n_tuning_num; ++ i) {
			size_t b = i * m_n_config_num, e = (i + 1) * m_n_config_num; // the range of the i-th row
			std::for_each(m_cost_matrix.begin() + b, m_cost_matrix.begin() + e,
				CCumsumIterator(m_cost_cumsum_matrix.begin() + b,
				m_fail_cumsum_matrix.begin() + b)); // the outputs start at the same row
		}
	}
};

/**
 *	@brief a simple autotune HTML report generator
 */
class CAutotuneCuts_ReportGenerator {
public:
	/**
	 *	@brief the ways of reducing repeated measurements of one benchmark
	 *		to a single value (the n_result_type argument of Report())
	 */
	enum {
		result_Last = 0, /**< @brief the last stored measurement is used */
		result_Median, /**< @brief the median is used (the upper one if the number of measurements is even) */
		result_Mean /**< @brief the arithmetic mean is used */
	};

	/**
	 *	@brief cost normalization modes and flags (the n_normalization_type argument of Report())
	 *
	 *	The mode is in the bits outside of normalize_Flags, the flags can be or-ed to it.
	 *	Any other mode value leaves the costs without normalization.
	 */
	enum {
		normalize_Rank = 0, /**< @brief the costs are replaced using CAutotuneCuts_1D::Rank_Costs() */
		normalize_Slowdown, /**< @brief the costs are normalized using CAutotuneCuts_1D::Normalize_Costs() in the slowdown mode */
		normalize_ZeroOne, /**< @brief the costs are normalized using CAutotuneCuts_1D::Normalize_Costs() in the zero-one mode */
		normalize_Flags = 0xff00, /**< @brief mask of the bits reserved for flags */
		normalize_Logarithm_Flag = 0x8000, /**< @brief logarithmic scale flag (slowdown and zero-one modes only) */
		normalize_Exponential_Flag = 0x4000 /**< @brief exponential scale flag (slowdown and zero-one modes only) */
	};

	typedef CAutotuneResultParser::CKernel_Device_Tuning_ResultMap _TyResults; // just a shorthand

protected:
	// note that the functions are not static intentionally, there should
	// be some configuration of the report stored in the members
	// the values and algorithms are chosen via Report() parameters while the
	// styles and formatting should be chosen via the members

	std::string m_s_tuning_constructor; /**< @brief name of the function or type that precedes the parenthesized tuning parameters in the generated code */
	std::string m_s_kernel_autotune_info_trailing_specializer_list; /**< @brief trailing template arguments of TKernelAutotuneInfo in the generated code (used if m_s_kernel_autotune_info_data_type is empty) */
	std::string m_s_kernel_autotune_info_data_type; /**< @brief name of the type constructed in the generated code (if empty, TKernelAutotuneInfo with the specializer list is generated instead) */

public:
	/**
	 *	@brief default constructor; sets the tuning constructor to "std::make_pair"
	 *	@note The result of Set_TuningConstructor() is not checked here.
	 */
	CAutotuneCuts_ReportGenerator()
	{
		Set_TuningConstructor("std::make_pair");
	}

	/**
	 *	@brief sets the trailing template arguments of TKernelAutotuneInfo for the generated code
	 *
	 *	This also erases the type name set by Set_TKerneAutotuneInfo_TypeName(),
	 *	only one of the two is in effect at a time.
	 *
	 *	@param[in] p_s_TKerneAutotuneInfo_specializer_list is null-terminated string with
	 *		the template arguments that follow the first one
	 *
	 *	@return Returns the result of stl_ut::AssignCStr().
	 */
	bool Set_TKerneAutotuneInfo_TrailingSpecializerList(const char *p_s_TKerneAutotuneInfo_specializer_list)
	{
		std::string &r_s_specializer_list = m_s_kernel_autotune_info_trailing_specializer_list;

		m_s_kernel_autotune_info_data_type.erase();
		// the explicit type name is no longer in effect

		const bool b_assigned = stl_ut::AssignCStr(r_s_specializer_list,
			p_s_TKerneAutotuneInfo_specializer_list);
		return b_assigned;
	}

	/**
	 *	@brief sets the name of the type to be constructed in the generated code
	 *
	 *	This also erases the list set by Set_TKerneAutotuneInfo_TrailingSpecializerList(),
	 *	only one of the two is in effect at a time.
	 *
	 *	@param[in] p_s_specialized_TKerneAutotuneInfo_name is null-terminated string
	 *		with the type name
	 *
	 *	@return Returns the result of stl_ut::AssignCStr().
	 */
	bool Set_TKerneAutotuneInfo_TypeName(const char *p_s_specialized_TKerneAutotuneInfo_name)
	{
		std::string &r_s_type_name = m_s_kernel_autotune_info_data_type;

		m_s_kernel_autotune_info_trailing_specializer_list.erase();
		// the specializer list is no longer in effect

		const bool b_assigned = stl_ut::AssignCStr(r_s_type_name,
			p_s_specialized_TKerneAutotuneInfo_name);
		return b_assigned;
	}

	/**
	 *	@brief sets the text that precedes the parenthesized tuning parameters in the
	 *		generated code (e.g. "std::make_pair" yields "std::make_pair(a, b)")
	 *	@param[in] p_s_tuning_constructor is null-terminated string with the function or type name
	 *	@return Returns the result of stl_ut::AssignCStr().
	 */
	bool Set_TuningConstructor(const char *p_s_tuning_constructor)
	{
		return stl_ut::AssignCStr(m_s_tuning_constructor, p_s_tuning_constructor);
	}

	bool Report(const char *p_s_outfile, const char *p_s_results_infile,
		size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		CAutotuneResultParser::CKernel_Device_Tuning_ResultMap results_map;
		if(!CAutotuneResultParser::Parse(results_map, p_s_results_infile))
			return false;
		return Report(p_s_outfile, results_map, p_s_results_infile, n_cut_num,
			n_skip_first_config_num, n_result_type, n_normalization_type,
			n_elements_to_bytes_bandwidth_ratio);
	}
	
	/**
	 *	@brief writes the HTML report of already parsed results to a file;
	 *		the input file is reported as unknown
	 *
	 *	@param[in] p_s_outfile is name of the output file
	 *	@param[in] results_map is the parsed results
	 *	@param[in] n_cut_num is the number of cuts
	 *	@param[in] n_skip_first_config_num is the number of the first configurations to skip
	 *	@param[in] n_result_type is result aggregation (one of result_*)
	 *	@param[in] n_normalization_type is cost normalization (one of normalize_*, with flags)
	 *	@param[in] n_elements_to_bytes_bandwidth_ratio is the number of bytes
	 *		transferred per input element
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Report(const char *p_s_outfile, const _TyResults &results_map,
		size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		const char *p_s_unknown_infile = "&lt;unknown&gt;"; // already escaped for HTML
		const bool b_reported = Report(p_s_outfile, results_map, p_s_unknown_infile,
			n_cut_num, n_skip_first_config_num, n_result_type,
			n_normalization_type, n_elements_to_bytes_bandwidth_ratio);
		return b_reported;
	}

	/**
	 *	@brief writes the HTML report of already parsed results to a file
	 *
	 *	The report is generated to memory first; the file is not opened
	 *	(and a possibly existing one is not overwritten) if that fails.
	 *
	 *	@param[in] p_s_outfile is name of the output file (opened in text mode for writing)
	 *	@param[in] results_map is the parsed results
	 *	@param[in] p_s_results_infile is name of the results file (only printed in the report)
	 *	@param[in] n_cut_num is the number of cuts
	 *	@param[in] n_skip_first_config_num is the number of the first configurations to skip
	 *	@param[in] n_result_type is result aggregation (one of result_*)
	 *	@param[in] n_normalization_type is cost normalization (one of normalize_*, with flags)
	 *	@param[in] n_elements_to_bytes_bandwidth_ratio is the number of bytes
	 *		transferred per input element
	 *
	 *	@return Returns true on success, false on failure (the report could not be generated,
	 *		the file could not be opened or the data could not be written).
	 */
	bool Report(const char *p_s_outfile, const _TyResults &results_map, const char *p_s_results_infile,
		size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		std::string s_html_report;
		if(!Report(s_html_report, results_map, p_s_results_infile, n_cut_num,
		   n_skip_first_config_num, n_result_type, n_normalization_type,
		   n_elements_to_bytes_bandwidth_ratio))
			return false;
		// generate the report to memory

		FILE *p_fw;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
		if(fopen_s(&p_fw, p_s_outfile, "w")) // returns zero on success
			return false;
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		if(!(p_fw = fopen(p_s_outfile, "w")))
			return false;
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		// open the output file

		bool b_written = fprintf(p_fw, "%s\n", s_html_report.c_str()) >= 0;
		if(fclose(p_fw))
			b_written = false; // the buffered data could not be flushed
		// write the report; a short write (e.g. disk full) is a failure

		return b_written;
	}

	bool Report(std::string &s_html_report, const char *p_s_results_infile,
		size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		CAutotuneResultParser::CKernel_Device_Tuning_ResultMap results_map;
		if(!CAutotuneResultParser::Parse(results_map, p_s_results_infile))
			return false;
		return Report(s_html_report, results_map, p_s_results_infile, n_cut_num,
			n_skip_first_config_num, n_result_type, n_normalization_type,
			n_elements_to_bytes_bandwidth_ratio);
	}

	/**
	 *	@brief generates the HTML report of already parsed results to a string;
	 *		the input file is reported as unknown
	 *
	 *	@param[out] s_html_report is the generated report
	 *	@param[in] results_map is the parsed results
	 *	@param[in] n_cut_num is the number of cuts
	 *	@param[in] n_skip_first_config_num is the number of the first configurations to skip
	 *	@param[in] n_result_type is result aggregation (one of result_*)
	 *	@param[in] n_normalization_type is cost normalization (one of normalize_*, with flags)
	 *	@param[in] n_elements_to_bytes_bandwidth_ratio is the number of bytes
	 *		transferred per input element
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Report(std::string &s_html_report, const _TyResults &results_map,
		size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		const char *p_s_unknown_infile = "&lt;unknown&gt;"; // already escaped for HTML
		const bool b_reported = Report(s_html_report, results_map, p_s_unknown_infile,
			n_cut_num, n_skip_first_config_num, n_result_type,
			n_normalization_type, n_elements_to_bytes_bandwidth_ratio);
		return b_reported;
	}

	bool Report(std::string &s_html_report, const _TyResults &results_map,
		const char *p_s_results_infile, size_t n_cut_num = 2, size_t n_skip_first_config_num = 0,
		int n_result_type = result_Median, int n_normalization_type = normalize_Slowdown,
		int n_elements_to_bytes_bandwidth_ratio = 4 * 2)
	{
		const char *p_s_html_header =
			"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" "
			"\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n"
			"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
			"<head>\n"
			"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\"/>\n"
			"<title>Autotune report</title>\n"
			"<style type=\"text/css\">\n"
			"table {\n"
			"	border: solid 1px black;\n"
			"	padding: 0px;\n"
			"	border-collapse: collapse;\n"
			"	font-family: tahoma, sans-serif;\n"
			"	font-size: 70%;\n"
			"}\n"
			"table tr {\n"
			"	padding: 0px;\n"
			"	margin: 0px;\n"
			"}\n"
			"table tr td {\n"
			"	border: none 3px black;\n"
			"	text-align: right;\n"
			"	width: 150px;\n"
			"	margin: 0px;\n"
			"}\n"
			"table tr th {\n"
			"	font-weight: normal;\n"
			"	border: solid 1px black;\n"
			"	width: 100px;\n"
			"	margin: 0px;\n"
			"}\n"
			"</style>\n"
			"</head>\n"
			"<body>\n"
			"<div>\n";
		try {
			s_html_report = p_s_html_header;
			s_html_report += "<h2>Autotune report</h2>\n";
			s_html_report += "input file: \"<span style=\"font-family: monospace;\">";
			s_html_report += p_s_results_infile;
			s_html_report += "\"</span><br/>\n";
			s_html_report += "result normalization: ";
			if((n_normalization_type & ~normalize_Flags) == normalize_Rank)
				s_html_report += "ranking";
			else if((n_normalization_type & ~normalize_Flags) == normalize_Slowdown ||
			   (n_normalization_type & ~normalize_Flags) == normalize_ZeroOne) {
				s_html_report += ((n_normalization_type & ~normalize_Flags) == normalize_Slowdown)?
					"slowdown" : "zero-one";
				if((n_normalization_type & normalize_Logarithm_Flag) != 0)
					s_html_report += " in log scale";
				else if((n_normalization_type & normalize_Exponential_Flag) != 0)
					s_html_report += " in exp scale";
			} else
				s_html_report += "none";
			s_html_report += "<br/>\n";
			s_html_report += "result aggregation: ";
			s_html_report += (n_result_type == result_Median)? "median" :
				(n_result_type == result_Mean)? "mean" : /*(n_result_type == result_Last)?*/ "last";
			s_html_report += "<br/>\n";
			std::string s_temp;
			if(!stl_ut::Format(s_temp, "cuts: " PRIsize "<br/>\n"
			   "skipped first configurations: " PRIsize "<br/>\n", n_cut_num, n_skip_first_config_num))
				return false;
			s_html_report += s_temp;
			if(!stl_ut::Format(s_temp, "element to bandwidth ratio: %d B (used in the"
			   " table below only)<br/>\n", n_elements_to_bytes_bandwidth_ratio))
				return false;
			s_html_report += s_temp;

			{
				size_t n_result_fields = 0, n_total_results = 0, n_max_results = 0, n_failed_fields = 0;
				for(CAutotuneResultParser::CKernel_Device_Tuning_ResultMap::const_iterator p_kernel_it = results_map.begin(),
				   p_kernel_end_it = results_map.end(); p_kernel_it != p_kernel_end_it; ++ p_kernel_it) {
					const CAutotuneResultParser::CDevice_Tuning_ResultMap &r_dtm = (*p_kernel_it).second;
					for(CAutotuneResultParser::CDevice_Tuning_ResultMap::const_iterator p_device_it = r_dtm.begin(),
					   p_device_end_it = r_dtm.end(); p_device_it != p_device_end_it; ++ p_device_it) {
						const CAutotuneResultParser::CTuning_ResultMap &r_trm = (*p_device_it).second;
						for(CAutotuneResultParser::CTuning_ResultMap::const_iterator p_tuning_it = r_trm.begin(),
						   p_tuning_end_it = r_trm.end(); p_tuning_it != p_tuning_end_it; ++ p_tuning_it) {
							const CAutotuneResultParser::CResultMap &r_rm = (*p_tuning_it).second;
							for(CAutotuneResultParser::CResultMap::const_iterator p_result_it = r_rm.begin(),
							   p_result_end_it = r_rm.end(); p_result_it != p_result_end_it; ++ p_result_it) {
								const std::vector<double> &r_results = (*p_result_it).second;
								if(r_results.empty()) {
									++ n_failed_fields;
									continue; // a fail
								}
								++ n_result_fields;
								n_total_results += r_results.size();
								if(n_max_results < r_results.size())
									n_max_results = r_results.size();
							}
						}
					}
				}
				if(!stl_ut::Format(s_temp, "total samples: " PRIsize " (in addition to " PRIsize " fails)<br/>\n"
				   "maximum aggregated samples: " PRIsize "<br/>\n"
				   "average aggregated samples: %.2f<br/>\n", n_total_results, n_failed_fields,
				   n_max_results, double(n_total_results) / n_result_fields))
					return false;
				s_html_report += s_temp;
			}

			size_t n_kernel_num = results_map.size();
			for(CAutotuneResultParser::CKernel_Device_Tuning_ResultMap::const_iterator p_kernel_it = results_map.begin(),
			   p_kernel_end_it = results_map.end(); p_kernel_it != p_kernel_end_it; ++ p_kernel_it) {
				const std::string &s_kernel = (*p_kernel_it).first;
				const CAutotuneResultParser::CDevice_Tuning_ResultMap &r_dtm = (*p_kernel_it).second;
				size_t n_device_num = r_dtm.size();
				for(CAutotuneResultParser::CDevice_Tuning_ResultMap::const_iterator p_device_it = r_dtm.begin(),
				   p_device_end_it = r_dtm.end(); p_device_it != p_device_end_it; ++ p_device_it) {
					const std::string &s_device = (*p_device_it).first;
					const CAutotuneResultParser::CTuning_ResultMap &r_trm = (*p_device_it).second;

					std::set<size_t> configuration_space;
					size_t n_tuning_num = r_trm.size();
					size_t n_failed_fields = 0;
					size_t n_result_fields = 0;
					size_t n_total_results = 0;
					size_t n_max_results = 0;
					for(CAutotuneResultParser::CTuning_ResultMap::const_iterator p_tuning_it = r_trm.begin(),
					   p_tuning_end_it = r_trm.end(); p_tuning_it != p_tuning_end_it; ++ p_tuning_it) {
						const CAutotuneResultParser::CResultMap &r_rm = (*p_tuning_it).second;
						for(CAutotuneResultParser::CResultMap::const_iterator p_result_it = r_rm.begin(),
						   p_result_end_it = r_rm.end(); p_result_it != p_result_end_it; ++ p_result_it) {
							//const std::vector<double> &r_results = (*p_result_it).second;
							const std::string &r_s_config = (*p_result_it).first;
							configuration_space.insert(atol(r_s_config.c_str())); // assumed to be 1D
							const std::vector<double> &r_results = (*p_result_it).second;
							if(r_results.empty()) {
								++ n_failed_fields;
								continue; // a fail
							}
							++ n_result_fields;
							n_total_results += r_results.size();
							if(n_max_results < r_results.size())
								n_max_results = r_results.size();
						}
					}
					// look for all the configurations, collect some stats while at it

					/*if(n_tuning_num < n_cut_num + 1) { // this should not actually matter
						fprintf(stderr, "error: not enough tuning types for %s / %s\n",
							s_kernel.c_str(), s_device.c_str());
						continue;
					}*/
					size_t n_config_num = std::max(configuration_space.size(),
						n_skip_first_config_num) - n_skip_first_config_num;
					if(n_config_num < n_cut_num + 1) {
						fprintf(stderr, "warning: not enough configurations for %s / %s\n",
							s_kernel.c_str(), s_device.c_str());
						if(!n_config_num)
							continue; // really nothing to go on
					}

					s_html_report += "<h2>" + s_device + " / " + s_kernel + "</h2>\n";

					if(!stl_ut::Format(s_temp, "total samples: " PRIsize " (in addition to " PRIsize " fails)<br/>\n"
					   "maximum aggregated samples: " PRIsize "<br/>\n"
					   "average aggregated samples: %.2f<br/>\n", n_total_results, n_failed_fields,
					   n_max_results, double(n_total_results) / n_result_fields))
						return false;
					s_html_report += s_temp;

					CAutotuneCuts_1D tuning(min(n_cut_num, max(n_config_num, size_t(1)) - 1), n_tuning_num, n_config_num);
					std::vector<double> med_temp;
					size_t n_tuning = 0;
					for(CAutotuneResultParser::CTuning_ResultMap::const_iterator p_tuning_it = r_trm.begin(),
					   p_tuning_end_it = r_trm.end(); p_tuning_it != p_tuning_end_it; ++ p_tuning_it, ++ n_tuning) {
						const CAutotuneResultParser::CResultMap &r_rm = (*p_tuning_it).second;
						for(CAutotuneResultParser::CResultMap::const_iterator p_result_it = r_rm.begin(),
						   p_result_end_it = r_rm.end(); p_result_it != p_result_end_it; ++ p_result_it) {
							const std::vector<double> &r_results = (*p_result_it).second;
							size_t n_config = std::distance(configuration_space.begin(),
								configuration_space.find(atol((*p_result_it).first.c_str())));
							if(r_results.empty())
								continue; // a fail
							n_total_results += r_results.size();
							if(n_max_results < r_results.size())
								n_max_results = r_results.size();
							double f_result;
							if(n_result_type == 0)
								f_result = r_results.back(); // the last one
							else if(n_result_type == 1) {
								med_temp.resize(r_results.size() / 2 + 1);
								std::partial_sort_copy(r_results.begin(), r_results.end(),
									med_temp.begin(), med_temp.end());
								f_result = med_temp.back(); // median
							} else /*if(n_result_type == 2)*/ {
								f_result = std::accumulate(r_results.begin(),
									r_results.end(), .0) / r_results.size(); // mean
							}
							if(n_config >= n_skip_first_config_num)
								tuning.Set_Cost(n_config - n_skip_first_config_num, n_tuning, f_result); // use the last measurement
						}
					}

					if((n_normalization_type & ~normalize_Flags) == normalize_Rank)
						tuning.Rank_Costs();
					else if((n_normalization_type & ~normalize_Flags) == normalize_Slowdown ||
					   (n_normalization_type & ~normalize_Flags) == normalize_ZeroOne) {
						tuning.Normalize_Costs((n_normalization_type & ~normalize_Flags) == normalize_Slowdown,
							(n_normalization_type & normalize_Logarithm_Flag) != 0,
							(n_normalization_type & normalize_Exponential_Flag) != 0);
					}

					tuning.Find_Cuts();

					std::vector<std::string> tuning_names, tuning_ctors;
					std::vector<std::pair<size_t, size_t> > tuning_values; // only needed for a test of CAutotuneCuts_1D::Initialize_AutotuneInfo()
					{
						n_tuning = 0;
						for(CAutotuneResultParser::CTuning_ResultMap::const_iterator p_tuning_it = r_trm.begin(),
						   p_tuning_end_it = r_trm.end(); p_tuning_it != p_tuning_end_it; ++ p_tuning_it, ++ n_tuning) {
							const CAutotuneResultParser::CResultMap &r_rm = (*p_tuning_it).second;
							for(CAutotuneResultParser::CResultMap::const_iterator p_result_it = r_rm.begin(),
							   p_result_end_it = r_rm.end(); p_result_it != p_result_end_it; ++ p_result_it) {
								const std::vector<double> &r_results = (*p_result_it).second;
								if(r_results.empty())
									continue; // a fail
								size_t n_config = std::distance(configuration_space.begin(),
									configuration_space.find(atol((*p_result_it).first.c_str())));
								double f_result;
								if(n_result_type == 0)
									f_result = r_results.back(); // the last one
								else if(n_result_type == 1) {
									med_temp.resize(r_results.size() / 2 + 1);
									std::partial_sort_copy(r_results.begin(), r_results.end(),
										med_temp.begin(), med_temp.end());
									f_result = med_temp.back(); // median
								} else /*if(n_result_type == 2)*/ {
									f_result = std::accumulate(r_results.begin(),
										r_results.end(), .0) / r_results.size(); // mean
								}
								if(n_config >= n_skip_first_config_num)
									tuning.Set_Cost(n_config - n_skip_first_config_num, n_tuning, f_result); // use the last measurement
							}
							tuning_names.push_back("tuning[" + (*p_tuning_it).first + "]");
							tuning_ctors.push_back(m_s_tuning_constructor + "(" + (*p_tuning_it).first + ")");
							std::string &r_ctor = tuning_ctors.back();
							for(size_t n_comma_pos = r_ctor.find(','); n_comma_pos != std::string::npos;
							   n_comma_pos = r_ctor.find(',', n_comma_pos + 1))
								r_ctor.insert(n_comma_pos + 1, 1, ' ');
							{
								/*std::vector<std::string> params;
								stl_ut::Split(params, (*p_tuning_it).first, ",", 0);
								size_t a = (params.size() >= 1)? atol(params[0].c_str()) : 0,
									b = (params.size() >= 2)? atol(params[1].c_str()) : 0;*/
								CAutotuneResultParser::CIntParser p((*p_tuning_it).first);
								size_t a = 0, b = 0;
								p.ReadInt(a);
								p.ReadInt(b);
								tuning_values.push_back(std::make_pair(a, b));
								// this is only for debugging and doesn't get displayed anywhere, ignore errors here
							}
						}
					}
					// reset the costs to time

					std::vector<size_t> input_sizes(configuration_space.begin(), configuration_space.end());
					if(n_skip_first_config_num) {
						input_sizes.erase(input_sizes.begin(), input_sizes.begin() +
							std::min(input_sizes.size(), n_skip_first_config_num));
					}
					// grab input sizes to indexable vector

					std::vector<std::string> config_names(input_sizes.size());
					for(size_t i = 0, n = input_sizes.size(); i < n; ++ i) {
						if(!stl_ut::Format(config_names[i], PRIvalueMP,
						   PRIvalueMPparamsExt(input_sizes[i], 0, 0, false)))
							return false;
					}
					// format config names (this is supposedly 1D)

					std::string s_html, s_code;
					if(!tuning.Generate_HTML_Report_Table(s_html, CAutotuneCuts_1D::CFormatGBpS(input_sizes, n_elements_to_bytes_bandwidth_ratio/*4 * 2*/), // * 2 because each elem is read and written
					   config_names, tuning_names) || !tuning.Generate_HTML_Report_InitializerCode(s_code,
					   s_device.c_str(), 4, input_sizes, tuning_ctors,
					   m_s_kernel_autotune_info_trailing_specializer_list.c_str(),
					   (m_s_kernel_autotune_info_data_type.empty())? 0 :
					   m_s_kernel_autotune_info_data_type.c_str(), true))
						return false;

					std::vector<size_t> input_cuts(input_sizes.size());
					for(size_t i = 1, n = input_sizes.size(); i < n; ++ i) {
						input_cuts[i - 1] =
							CAutotuneCuts_1D::n_POT_inBetween(input_sizes[i - 1], input_sizes[i]);
					}
					input_cuts.back() = SIZE_MAX;

					if(tuning.n_Span_Num() <= 4) {
						TKernelAutotuneInfo<4> t_autotune;
						tuning.Initialize_AutotuneInfo(t_autotune, s_device.c_str(), 4,
							(input_cuts.empty())? 0 : &input_cuts.front(), input_cuts.size(), SIZE_MAX,
							(tuning_values.empty())? 0 : &tuning_values.front(), tuning_values.size());
						// just to test this
					}

					s_html_report += "<pre>" + s_code + "</pre>\n";
					s_html_report += "<div style=\"text-align: center;\">global memory bandwidth [B/s]<table>" +
						s_html + "</table></div><br/>\n";
				}
			}
			s_html_report += "</div>\n</body>\n</html>\n";
		} catch(std::bad_alloc&) {
			return false;
		}

		return true;
	}
};

// OpenCL-related classes below

#include "../UberLame_src/gpgpu/ClUtils.h"

/**
 *	@brief a simple interface for OpenCL autotuning
 *
 *	Adds the OpenCL-specific steps (preparing the benchmark for a given tuning and
 *	running it) to the API-agnostic CAutotuneInterface. CCLSimpleAutotuneDriver calls
 *	these; a false return value or a std::exception thrown from either function is
 *	recorded by the driver as a failed test.
 */
class CCLAutotuneInterface : public virtual CAutotuneInterface {
public:
	/**
	 *	@brief prepares the benchmark for the given tuning
	 *		(presumably builds the tuned kernel; the implementation is not visible here)
	 *
	 *	@param[in] h_context is OpenCL context handle
	 *	@param[in] h_device is OpenCL device id to prepare the benchmark for
	 *	@param[in] h_cmd_queue is OpenCL command queue handle
	 *	@param[in] r_tuning is the tuning vector, as enumerated by
	 *		Init_TuningVector() / Next_TuningVector()
	 *
	 *	@return Returns true on success, false on failure.
	 */
	virtual bool Prepare_Benchmark(cl_context h_context, cl_device_id h_device,
		cl_command_queue h_cmd_queue, const std::vector<size_t> &r_tuning) = 0;

	/**
	 *	@brief runs the benchmark once
	 *
	 *	@param[in] h_context is OpenCL context handle
	 *	@param[in] h_cmd_queue is OpenCL command queue handle
	 *	@param[out] r_f_time is the measured run time (the driver averages these and compares
	 *		their sum to 10 in its stopping rule, so presumably in seconds - TODO confirm)
	 *	@param[in] b_verify is result verification flag (the driver always sets it for
	 *		the first run after preparing a test)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	virtual bool Run_Benchmark(cl_context h_context, cl_command_queue h_cmd_queue, double &r_f_time, bool b_verify) = 0;
};

/**
 *	@brief a simple driving algorithm for running autotuning benchmarks and saving the results to a file
 *
 *	The driver enumerates all the combinations of input configurations and tuning vectors
 *	offered by the kernel wrapper in two nested loops (the order of which can be transposed),
 *	times each combination and appends one line per test to the results file.
 */
class CCLSimpleAutotuneDriver {
protected:
	CCLAutotuneInterface *m_p_kernel; /**< @brief benchmarked kernel wrapper (not owned); null marks failed initialization */
	std::string m_s_results_file; /**< @brief name of the file the results are appended to */
	cl_device_id m_h_device; /**< @brief OpenCL device the benchmarks are prepared for */
	std::string m_s_dev_name; /**< @brief device name, written as a trailing comment on each result line */
	std::string m_s_autotune_string; /**< @brief device class id, written first on each result line */
	std::string m_s_benchmark; /**< @brief benchmark id (must not contain whitespace) */
	bool m_b_verify_always; /**< @brief if set, the timed runs verify their results as well */
	size_t m_n_max_repeated_becnhmark_result_num; /**< @brief maximum number of results per input configuration and tuning (0 means no limit) */
	bool m_b_outer_loop_over_tunings; /**< @brief transposed loop order flag (tunings in the outer loop) */
	int m_n_verbose; /**< @brief verbosity (0 is silent, 1 is progress bar, 2 echoes all the results) */

	CAutotuneResultParser::CKernel_Device_Tuning_ResultMap m_results_so_far; /**< @brief results parsed from the results file in the constructor */
	// note that this is not updated with the new results; we could do that

public:
	/**
	 *	@brief a simple autotune test driver
	 *
	 *	@param[in] p_kernel is pointer to the autotune kernel wrapper (only referenced; must stay allocated)
	 *	@param[in] p_s_results_file is a null-terminated string containing output file name
	 *		(if it already exists, the new results are appended)
	 *	@param[in] h_device is OpenCL device id for autotuning
	 *	@param[in] b_verify_always is result verification flag (if set, the results are verified
	 *		after every run, if cleared, the results are verified only once after the kernel is
	 *		compiled and then the rest of the timing tests runs without verification)
	 *	@param[in] n_max_repeated_becnhmark_result_num is the maximum number of results per
	 *		input configuration and tuning (if the results file already exists and contains
	 *		enough results for some particular test, it will be skipped; 0 for no limit (default))
	 *	@param[in] b_transpose_order is transpose order flag (the default order is to change input
	 *		configurations in the outer loop and the tuning parameters in the inner)
	 *	@param[in] n_verbose is verbosity (0 means silent, 1 means only progress bar and 2 means
	 *		echo all the results to the console)
	 *
	 *	@note Errors are not reported here; if the initialization fails (including p_kernel
	 *		being null), the driver is marked as failed and Run() returns false.
	 */
	CCLSimpleAutotuneDriver(CCLAutotuneInterface *p_kernel, const char *p_s_results_file,
		cl_device_id h_device, bool b_verify_always = false,
		size_t n_max_repeated_becnhmark_result_num = 0,
		bool b_transpose_order = false, int n_verbose = 1)
		:m_p_kernel(p_kernel), m_h_device(h_device), m_b_verify_always(b_verify_always),
		m_n_max_repeated_becnhmark_result_num(n_max_repeated_becnhmark_result_num),
		m_b_outer_loop_over_tunings(b_transpose_order), m_n_verbose(n_verbose)
	{
		_ASSERTE(n_verbose >= 0 && n_verbose < 3); // does not really do anything if out of range

		CAutotuneResultParser::Parse(m_results_so_far, p_s_results_file);
		// ignore errors here

		CCLDeviceParams dev(h_device);
		if(!dev.Get_SafeName(m_s_dev_name, false, '_') ||
		   !CCLDeviceClassId::Get(m_s_autotune_string, dev))
			m_p_kernel = 0; // to mark error
		// get generic id for the results

		if(!stl_ut::AssignCStr(m_s_results_file, p_s_results_file) ||
		   !p_kernel || // do not dereference a null kernel, just mark the error
		   !stl_ut::AssignCStr(m_s_benchmark, p_kernel->p_s_Benchmark_Id()))
			m_p_kernel = 0; // to mark error
		_ASSERTE(std::find_if(m_s_benchmark.begin(),
			m_s_benchmark.end(), b_IsSpace) == m_s_benchmark.end()); // benchmark id must not contain spaces
	}

	/**
	 *	@brief enables or disables benchmark loop transposition
	 *
	 *	@param[in] b_transpose_order is transpose order flag (the default order is to change input
	 *		configurations in the outer loop and the tuning parameters in the inner)
	 */
	void Set_TransposeLoopOrder(bool b_transpose_order)
	{
		m_b_outer_loop_over_tunings = b_transpose_order;
	}

	/**
	 *	@brief sets the desired maximum number of results for each input configuration and tuning
	 *
	 *	@param[in] n_max_repeated_becnhmark_result_num is the maximum number of results per
	 *		input configuration and tuning (if the results file already exists and contains
	 *		enough results for some particular test, it will be skipped; 0 for no limit)
	 */
	void Set_MaxRepeatedResult_Num(size_t n_max_repeated_becnhmark_result_num)
	{
		m_n_max_repeated_becnhmark_result_num = n_max_repeated_becnhmark_result_num;
	}

	/**
	 *	@brief sets verbosity level
	 *
	 *	@param[in] n_verbose is verbosity (0 means silent, 1 means only progress bar
	 *		and 2 means echo all the results to the console)
	 */
	void Set_Verbosity(int n_verbose)
	{
		m_n_verbose = n_verbose;
	}

	/**
	 *	@brief sets the results verification to run after each and every test
	 */
	void Verify_Always()
	{
		m_b_verify_always = true;
	}

	/**
	 *	@brief sets the results verification to run once for each input configuration and tuning
	 */
	void Verify_Once()
	{
		m_b_verify_always = false;
	}

	/**
	 *	@brief runs the autotune benchmarks
	 *
	 *	@param[in] opencl is reference to the OpenCL instance (only the first device
	 *		is used; must be the same as passed to the constructor)
	 *
	 *	@return Returns true on success, false on failure (failure being all of the tests
	 *		failing; if at least a single test runs successfully then that's a success).
	 */
	bool Run(CCLUniqueInstance &opencl)
	{
		return Run(opencl.h_Context(), opencl.h_Command_Queue(0));
	}

	/**
	 *	@brief runs the autotune benchmarks
	 *
	 *	@param[in] h_context is OpenCL context handle (for memory allocation)
	 *	@param[in] h_cmd_queue is OpenCL command queue for the same device
	 *		as passed to the constructor
	 *
	 *	@return Returns true on success, false on failure (failure being all of the tests
	 *		failing; if at least a single test runs successfully then that's a success).
	 *
	 *	@note This also returns false if the constructor failed or if there is not enough
	 *		memory to count the configurations for the progress indicator.
	 */
	bool Run(cl_context h_context, cl_command_queue h_cmd_queue)
	{
		if(!m_p_kernel)
			return false;
		// the constructor failed

		std::vector<size_t> input_cfg, input_hidden, tuning_cfg, tuning_hidden;

		CTextProgressIndicator progress("autotuning", 1, 10);

		size_t n_config_num = 0, n_config = 0;
		std::vector<size_t> inner_loop_sizes; // stays empty unless the progress bar is enabled
		if(m_n_verbose == 1) {
			if(!Count_Configurations(n_config_num, inner_loop_sizes,
			   input_cfg, input_hidden, tuning_cfg, tuning_hidden))
				return false;
			// count all the configurations for verbose

			progress.Show(n_config, n_config_num);
		}
		// verbose

		size_t n_skip_num = 0, n_fail_num = 0, n_result_num = 0;

		std::string s_input_vec, s_tuning_vec;

		m_p_kernel->Init_InputVector(input_cfg, input_hidden);
		m_p_kernel->Init_TuningVector(tuning_cfg, tuning_hidden);
		size_t n_outer_iter = size_t(-1); // incremented to zero in the first iteration
		do {
			++ n_outer_iter;

			if(m_n_max_repeated_becnhmark_result_num) { // if limit enabled
				bool b_all_skipped = true;
				size_t n_inner_skip_num = 0;
				if(m_b_outer_loop_over_tunings)
					m_p_kernel->Init_InputVector(input_cfg, input_hidden);
				else
					m_p_kernel->Init_TuningVector(tuning_cfg, tuning_hidden);
				do {
					if(!b_Enough_Results(CAutotuneResultParser::n_Result_Num(m_results_so_far,
					   m_s_benchmark, m_s_autotune_string, input_cfg, tuning_cfg))) {
						b_all_skipped = false;
						break;
					}
					++ n_inner_skip_num;
				} while((m_b_outer_loop_over_tunings)? m_p_kernel->Next_InputVector(input_cfg, input_hidden) :
					m_p_kernel->Next_TuningVector(tuning_cfg, tuning_hidden));
				// go through all the inner loop states and see whether they would skip

				if(b_all_skipped) {
					if(m_n_verbose == 1)
						progress.Show(n_config += n_inner_skip_num, n_config_num);
					if(m_n_verbose >= 2) {
						printf("debug: %s %s %s %s : already have enough results from the previous run(s)\n",
							m_s_autotune_string.c_str(), m_s_benchmark.c_str(),
							(m_b_outer_loop_over_tunings)? "*" : CAutotuneResultParser::p_s_FormatVector(s_input_vec, "input", input_cfg),
							(m_b_outer_loop_over_tunings)? CAutotuneResultParser::p_s_FormatVector(s_tuning_vec, "tuning", tuning_cfg) : "*");
					}
					n_skip_num += n_inner_skip_num;
					continue; // in a do-while loop this goes on to evaluate the loop condition
				}
			}
			// skip the (possibly costly) outer loop preparation if the whole inner loop would skip

			if(m_b_outer_loop_over_tunings)
				CAutotuneResultParser::p_s_FormatVector(s_tuning_vec, "tuning", tuning_cfg);
			else
				CAutotuneResultParser::p_s_FormatVector(s_input_vec, "input", input_cfg);
			// format the outer loop vector for the reports

			if(!Prepare_OuterLoop(h_context, h_cmd_queue, input_cfg, tuning_cfg)) {
				Report_Fail(s_input_vec, s_tuning_vec);
				// NOTE(review): the string of the inner loop vector is stale here (left from
				// the previous inner iteration, or empty), so this line does not describe any
				// particular inner configuration - confirm that the result parser is fine with it

				const size_t n_failed_num = (inner_loop_sizes.size() > n_outer_iter)?
					inner_loop_sizes[n_outer_iter] : 1; // the inner loop size is only known with the progress bar
				n_fail_num += n_failed_num;
				if(m_n_verbose == 1)
					progress.Show(n_config += n_failed_num, n_config_num); // otherwise the bar never reaches the total
				continue;
			}
			// prepare tuned code (transposed order) or data (default order) for the benchmark

			if(m_b_outer_loop_over_tunings)
				m_p_kernel->Init_InputVector(input_cfg, input_hidden);
			else
				m_p_kernel->Init_TuningVector(tuning_cfg, tuning_hidden);
			// restart the inner loop

			do {
				if(m_n_verbose == 1)
					progress.Show(++ n_config, n_config_num);

				if(m_n_max_repeated_becnhmark_result_num && // if limit enabled
				   b_Enough_Results(CAutotuneResultParser::n_Result_Num(m_results_so_far,
				   m_s_benchmark, m_s_autotune_string, input_cfg, tuning_cfg))) {
					if(m_n_verbose >= 2) {
						printf("debug: %s %s %s %s : already have enough results from the previous run(s)\n",
							m_s_autotune_string.c_str(), m_s_benchmark.c_str(),
							CAutotuneResultParser::p_s_FormatVector(s_input_vec, "input", input_cfg),
							CAutotuneResultParser::p_s_FormatVector(s_tuning_vec, "tuning", tuning_cfg));
					}
					++ n_skip_num;
					continue;
				}
				// skip before doing something

				if(m_b_outer_loop_over_tunings)
					CAutotuneResultParser::p_s_FormatVector(s_input_vec, "input", input_cfg);
				else
					CAutotuneResultParser::p_s_FormatVector(s_tuning_vec, "tuning", tuning_cfg);
				// format the inner loop vector for the reports

				double f_time = 0;
				if(!Prepare_InnerLoop(h_context, h_cmd_queue, input_cfg, tuning_cfg) ||
				   !Time_Benchmark(h_context, h_cmd_queue, f_time)) {
					Report_Fail(s_input_vec, s_tuning_vec);
					++ n_fail_num;
					continue;
				}
				// prepare the other half of the test, verify the results once and time the thing

				Report_Result(s_input_vec, s_tuning_vec, f_time);
				++ n_result_num;
				// show the results
			} while((m_b_outer_loop_over_tunings)? m_p_kernel->Next_InputVector(input_cfg, input_hidden) :
				m_p_kernel->Next_TuningVector(tuning_cfg, tuning_hidden));
		} while((m_b_outer_loop_over_tunings)? m_p_kernel->Next_TuningVector(tuning_cfg, tuning_hidden) :
			m_p_kernel->Next_InputVector(input_cfg, input_hidden));

		if(m_n_verbose == 1) {
			if(n_skip_num || n_fail_num) {
				std::string s_report;
				if(stl_ut::Format(s_report, "skipped " PRIsize " tests, "
				   PRIsize " failed", n_skip_num, n_fail_num))
					progress.Done(s_report.c_str());
				else
					progress.Done("some tests were skipped or failed"); // could not format the numbers
			} else
				progress.Done("all tests succeeded");
		}

		return !n_fail_num || n_result_num != 0;
	}

protected:
	/**
	 *	@brief whitespace predicate for characters of std::string
	 *	@param[in] n_char is the character to test
	 *	@return Returns true if n_char is a whitespace character, otherwise returns false.
	 *	@note The cast to unsigned char is required, isspace() is not defined for negative values.
	 */
	static bool b_IsSpace(char n_char)
	{
		return isspace(static_cast<unsigned char>(n_char)) != 0;
	}

	/**
	 *	@brief decides whether a test already has enough results from the previous runs
	 *	@param[in] n_prev_result_num is the value returned by CAutotuneResultParser::n_Result_Num()
	 *	@return Returns true if the test should be skipped, otherwise returns false.
	 *	@note This is only meaningful if the limit is enabled (nonzero).
	 */
	bool b_Enough_Results(size_t n_prev_result_num) const
	{
		return n_prev_result_num != size_t(-1) && // size_t(-1) means no results at all
			n_prev_result_num >= m_n_max_repeated_becnhmark_result_num;
	}

	/**
	 *	@brief counts the tests in each pass of the outer loop (for the progress indicator)
	 *
	 *	@param[out] r_n_config_num is the total number of tests
	 *	@param[out] r_inner_loop_sizes is filled with inner loop sizes, one per outer loop pass
	 *	@param[in,out] r_input_cfg is input vector (working storage for the enumeration)
	 *	@param[in,out] r_input_hidden is hidden state of the input vector
	 *	@param[in,out] r_tuning_cfg is tuning vector (working storage for the enumeration)
	 *	@param[in,out] r_tuning_hidden is hidden state of the tuning vector
	 *
	 *	@return Returns true on success, false on failure (not enough memory).
	 */
	bool Count_Configurations(size_t &r_n_config_num, std::vector<size_t> &r_inner_loop_sizes,
		std::vector<size_t> &r_input_cfg, std::vector<size_t> &r_input_hidden,
		std::vector<size_t> &r_tuning_cfg, std::vector<size_t> &r_tuning_hidden) const
	{
		r_n_config_num = 0;
		m_p_kernel->Init_InputVector(r_input_cfg, r_input_hidden);
		m_p_kernel->Init_TuningVector(r_tuning_cfg, r_tuning_hidden);
		do {
			if(m_b_outer_loop_over_tunings)
				m_p_kernel->Init_InputVector(r_input_cfg, r_input_hidden);
			else
				m_p_kernel->Init_TuningVector(r_tuning_cfg, r_tuning_hidden);
			size_t n_inner_size = 0;
			do {
				++ n_inner_size;
			} while((m_b_outer_loop_over_tunings)? m_p_kernel->Next_InputVector(r_input_cfg, r_input_hidden) :
				m_p_kernel->Next_TuningVector(r_tuning_cfg, r_tuning_hidden));
			r_n_config_num += n_inner_size;
			if(!stl_ut::Resize_Add_1More(r_inner_loop_sizes, n_inner_size))
				return false;
		} while((m_b_outer_loop_over_tunings)? m_p_kernel->Next_TuningVector(r_tuning_cfg, r_tuning_hidden) :
			m_p_kernel->Next_InputVector(r_input_cfg, r_input_hidden));
		return true;
	}

	/**
	 *	@brief prepares the part of the test that is fixed in the outer loop (the tuned
	 *		code in the transposed order, the input data in the default order)
	 *
	 *	@param[in] h_context is OpenCL context handle
	 *	@param[in] h_cmd_queue is OpenCL command queue handle
	 *	@param[in] r_input_cfg is the current input vector
	 *	@param[in] r_tuning_cfg is the current tuning vector
	 *
	 *	@return Returns true on success, false on failure (including std::exception being thrown).
	 */
	bool Prepare_OuterLoop(cl_context h_context, cl_command_queue h_cmd_queue,
		const std::vector<size_t> &r_input_cfg, const std::vector<size_t> &r_tuning_cfg)
	{
		try {
			if(m_b_outer_loop_over_tunings)
				return m_p_kernel->Prepare_Benchmark(h_context, m_h_device, h_cmd_queue, r_tuning_cfg);
			return m_p_kernel->Prepare_Input(r_input_cfg);
		} catch(const std::exception&) {
			return false;
		}
	}

	/**
	 *	@brief prepares the part of the test that changes in the inner loop (the input
	 *		data in the transposed order, the tuned code in the default order) and
	 *		runs the benchmark once with the verification enabled
	 *
	 *	@param[in] h_context is OpenCL context handle
	 *	@param[in] h_cmd_queue is OpenCL command queue handle
	 *	@param[in] r_input_cfg is the current input vector
	 *	@param[in] r_tuning_cfg is the current tuning vector
	 *
	 *	@return Returns true on success, false on failure (including std::exception being thrown).
	 */
	bool Prepare_InnerLoop(cl_context h_context, cl_command_queue h_cmd_queue,
		const std::vector<size_t> &r_input_cfg, const std::vector<size_t> &r_tuning_cfg)
	{
		try {
			const bool b_prepared = (m_b_outer_loop_over_tunings)? m_p_kernel->Prepare_Input(r_input_cfg) :
				m_p_kernel->Prepare_Benchmark(h_context, m_h_device, h_cmd_queue, r_tuning_cfg);
			if(!b_prepared)
				return false;

			double f_run_time; // not used, this run is not a part of the timing
			return m_p_kernel->Run_Benchmark(h_context, h_cmd_queue, f_run_time, true); // verify results at least once
		} catch(const std::exception&) {
			return false;
		}
	}

	/**
	 *	@brief times the prepared benchmark by running it repeatedly
	 *
	 *	The benchmark is repeated until at least 25 passes accumulated more than 10 units
	 *	of time, or until 100 passes were made, whichever comes first.
	 *
	 *	@param[in] h_context is OpenCL context handle
	 *	@param[in] h_cmd_queue is OpenCL command queue handle
	 *	@param[out] r_f_time is filled with the average time of a single pass
	 *		(only written on success)
	 *
	 *	@return Returns true on success, false on failure (a single failed pass
	 *		fails the test; including std::exception being thrown).
	 */
	bool Time_Benchmark(cl_context h_context, cl_command_queue h_cmd_queue, double &r_f_time)
	{
		try {
			double f_total_time = 0;
			int n_pass_num = 0;
			do {
				double f_run_time;
				if(!m_p_kernel->Run_Benchmark(h_context, h_cmd_queue, f_run_time, m_b_verify_always))
					return false;

				f_total_time += f_run_time;
				++ n_pass_num;
			} while(!((f_total_time > 10 && n_pass_num >= 25) || n_pass_num >= 100));
			// make sure the timing is stable, but don't take too long at the same time

			r_f_time = f_total_time / n_pass_num; // n_pass_num is at least 1 here
			// use the mean (a median would require storing the individual times)

			return true;
		} catch(const std::exception&) {
			return false;
		}
	}

	/**
	 *	@brief opens the results file for appending
	 *	@return Returns pointer to the opened file (the caller must close it) or null on failure.
	 */
	FILE *p_Open_ResultsFile() const
	{
		FILE *p_fw;
#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER >= 1400
		if(fopen_s(&p_fw, m_s_results_file.c_str(), "a"))
			return 0;
#else //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		p_fw = fopen(m_s_results_file.c_str(), "a");
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER >= 1400
		return p_fw;
	}

	/**
	 *	@brief appends a line with a successful test result to the results file
	 *		(and echoes it to stdout if the verbosity is 2 or more)
	 *
	 *	@param[in] s_input_vec is the formatted input vector
	 *	@param[in] s_tuning_vec is the formatted tuning vector
	 *	@param[in] f_time is the measured time
	 *
	 *	@note Failure to open the results file is silently ignored.
	 */
	void Report_Result(const std::string &s_input_vec,
		const std::string &s_tuning_vec, double f_time) const
	{
		if(m_n_verbose >= 2) {
			printf("%s %s %s %s %.15f // %s\n", m_s_autotune_string.c_str(),
				m_s_benchmark.c_str(), s_input_vec.c_str(), s_tuning_vec.c_str(), f_time, m_s_dev_name.c_str());
		}
		FILE *p_fw = p_Open_ResultsFile();
		if(p_fw) {
			fprintf(p_fw, "%s %s %s %s %.15f // %s\n", m_s_autotune_string.c_str(),
				m_s_benchmark.c_str(), s_input_vec.c_str(), s_tuning_vec.c_str(), f_time, m_s_dev_name.c_str());
			fflush(p_fw);
			fclose(p_fw);
		}
	}

	/**
	 *	@brief appends a line with a failed test to the results file
	 *		(and echoes it to stdout if the verbosity is 2 or more)
	 *
	 *	@param[in] s_input_vec is the formatted input vector
	 *	@param[in] s_tuning_vec is the formatted tuning vector
	 *
	 *	@note Failure to open the results file is silently ignored.
	 */
	void Report_Fail(const std::string &s_input_vec,
		const std::string &s_tuning_vec) const
	{
		if(m_n_verbose >= 2) {
			printf("%s %s %s %s fail // %s\n", m_s_autotune_string.c_str(),
				m_s_benchmark.c_str(), s_input_vec.c_str(), s_tuning_vec.c_str(), m_s_dev_name.c_str());
		}
		FILE *p_fw = p_Open_ResultsFile();
		if(p_fw) {
			fprintf(p_fw, "%s %s %s %s fail // %s\n", m_s_autotune_string.c_str(),
				m_s_benchmark.c_str(), s_input_vec.c_str(), s_tuning_vec.c_str(), m_s_dev_name.c_str());
			fflush(p_fw);
			fclose(p_fw);
		}
	}
};

#endif // !__OPENCL_AUTOTUNING_HELPERS_INCLUDED
