
/*
 *								=== CCLUtils::TDevice_GFLOPS_Lazy::TObjective ===
 */

/**
 *	@brief constructor; remembers the device and marks its score as not evaluated yet
 *	@param[in] _h_device is handle of the device to be scored later
 */
inline CCLUtils::TDevice_GFLOPS_Lazy::TObjective::TObjective(cl_device_id _h_device)
	:f_gflops(-1), h_device(_h_device)
{
	// the negative f_gflops is the "not evaluated" marker tested by f_LazyEval()
}

/**
 *	@brief gets the device score, evaluating it on first use
 *	@return Returns the cached value of f_gflops, after filling it
 *		in using TDevice_GFLOPS if it was still negative.
 *	@note This writes f_gflops from a const function (presumably
 *		the member is declared mutable - confirm in the class declaration).
 */
inline double CCLUtils::TDevice_GFLOPS_Lazy::TObjective::f_LazyEval() const
{
	if(!(f_gflops < 0))
		return f_gflops;
	// already evaluated, reuse the cached value

	f_gflops = TDevice_GFLOPS()(h_device);
	// a negative value means "not evaluated yet"; evaluate and cache

	return f_gflops;
}

/**
 *	@brief equality comparison of two lazily evaluated scores
 *	@param[in] r_right is the objective on the right side
 *	@return Returns true if both lazily evaluated values compare equal.
 */
inline bool CCLUtils::TDevice_GFLOPS_Lazy::TObjective::operator ==(const TObjective &r_right) const
{
	const double f_left_score = f_LazyEval();
	const double f_right_score = r_right.f_LazyEval();
	// both sides get evaluated (and cached) here if needed

	return f_left_score == f_right_score;
}

/**
 *	@brief less-than comparison of two lazily evaluated scores
 *	@param[in] r_right is the objective on the right side
 *	@return Returns true if the value of this objective is lower than that of r_right.
 */
inline bool CCLUtils::TDevice_GFLOPS_Lazy::TObjective::operator <(const TObjective &r_right) const
{
	const double f_left_score = f_LazyEval();
	const double f_right_score = r_right.f_LazyEval();
	// both sides get evaluated (and cached) here if needed

	return f_left_score < f_right_score;
}

/*
 *								=== ~CCLUtils::TDevice_GFLOPS_Lazy::TObjective ===
 */

/*
 *								=== CCLUtils::TDevice_GFLOPS_Lazy ===
 */

/**
 *	@brief wraps a device handle in a lazily evaluated objective
 *	@param[in] h_device is handle of the device to be scored
 *	@return Returns an objective for the given device; nothing is evaluated here,
 *		the evaluation is left to TObjective::f_LazyEval().
 */
inline CCLUtils::TDevice_GFLOPS_Lazy::TObjective CCLUtils::TDevice_GFLOPS_Lazy::operator ()(cl_device_id h_device) const
{
	TObjective t_lazy_score(h_device);
	return t_lazy_score;
}

/*
 *								=== ~CCLUtils::TDevice_GFLOPS_Lazy ===
 */

/*
 *								=== CCLUtils::TDeviceSelect_ChainObjectives ===
 */

/**
 *	@brief constructor; stores copies of the two chained objectives
 *	@param[in] objective is the primary objective
 *	@param[in] objective2 is the secondary objective
 */
template <class CPrimaryObjective, class CSecondaryObjective>
inline CCLUtils::TDeviceSelect_ChainObjectives<CPrimaryObjective,
	CSecondaryObjective>::TDeviceSelect_ChainObjectives(CPrimaryObjective objective
	/*= CPrimaryObjective()*/, CSecondaryObjective objective2 /*= CSecondaryObjective()*/)
	:primary_objective(objective), secondary_objective(objective2)
{
	// nothing else to do, both objectives are kept by value
}

/**
 *	@brief evaluates both chained objectives for a device
 *	@param[in] h_device is handle of the device to be scored
 *	@return Returns TObjective built from the primary and the secondary
 *		objective value, in this order.
 */
template <class CPrimaryObjective, class CSecondaryObjective>
inline typename CCLUtils::TDeviceSelect_ChainObjectives<CPrimaryObjective,
	CSecondaryObjective>::TObjective CCLUtils::TDeviceSelect_ChainObjectives<CPrimaryObjective,
	CSecondaryObjective>::operator ()(cl_device_id h_device) const
{
	// pairs compare lexicographically, so the primary objective (first)
	// takes precedence over the secondary one (second)
	return TObjective(
		primary_objective(h_device),
		secondary_objective(h_device));
}

/*
 *								=== ~CCLUtils::TDeviceSelect_ChainObjectives ===
 */

/*
 *								=== CCLUtils ===
 */

/*template <class CScoringObjective>
static size_t CCLUtils::n_Get_Best_DeviceId(cl_device_id *p_device_id,
	cl_context h_context, CScoringObjective objective)
{
	std::vector<cl_device_id> device_list;
	if(n_GetDeviceList(h_context, device_list) != CL_SUCCESS || device_list.empty())
		return size_t(-1);
	// get all the devices

	size_t n_best_device = 0;
	if(device_list.size() > 0) {
		typename CScoringObjective::TObjective t_best_objective = objective(device_list.front());
		for(size_t i = 1, n = device_list.size(); i < n; ++ i) {
			typename CScoringObjective::TObjective t_objective = objective(device_list[i]);
			if(t_best_objective < t_objective) {
				t_best_objective = t_objective;
				n_best_device = i;
			}
		}
	}
	// go through the devices and score them

	*p_device_id = device_list[n_best_device];
	// write handle to the best device

	return n_best_device;
}*/

/*
 *								=== ~CCLUtils ===
 */

/*
 *								=== CCLDeviceParams ===
 */

inline bool CCLDeviceParams::b_Status() const
{
	return m_h_device != 0;
}

/**
 *	@brief gets the device handle
 *	@return Returns the stored device handle.
 */
inline cl_device_id CCLDeviceParams::h_Device() const
{
	return this->m_h_device;
}

/**
 *	@brief queries the platform the device belongs to
 *	@param[out] r_h_platform is filled with the platform handle
 *		(set to null handle if the query fails)
 *	@return Returns the result of clGetDeviceInfo() for CL_DEVICE_PLATFORM.
 */
inline CLresult CCLDeviceParams::n_GetPlatform(cl_platform_id &r_h_platform) const
{
	cl_platform_id h_platform = 0;
	// initialize; clGetDeviceInfo() does not need to write the output on failure
	// and copying an uninitialized handle to the caller would be undefined

	CLresult n_result = (CLresult)clGetDeviceInfo(m_h_device, CL_DEVICE_PLATFORM,
		sizeof(cl_platform_id), &h_platform, 0);
	r_h_platform = h_platform;
	return n_result;
}

/**
 *	@brief NVIDIA device predicate
 *	@return Returns the result of b_IsExtensionSupported() for
 *		the "cl_nv_device_attribute_query" extension.
 */
inline bool CCLDeviceParams::b_Is_NVIDIA() const
{
	const bool b_has_nv_attribute_query = b_IsExtensionSupported("cl_nv_device_attribute_query");
	return b_has_nv_attribute_query;
}

inline unsigned int CCLDeviceParams::n_NV_ComputeCap_Major() const
{
	return m_p_device_caps[0];
}

inline unsigned int CCLDeviceParams::n_NV_ComputeCap_Minor() const
{
	return m_p_device_caps[1];
}

/**
 *	@brief gets the major revision number
 *	@return Returns the same value as n_NV_ComputeCap_Major().
 */
inline unsigned int CCLDeviceParams::n_Revision_Major() const
{
	const unsigned int n_major = n_NV_ComputeCap_Major();
	return n_major;
}

/**
 *	@brief gets the minor revision number
 *	@return Returns the same value as n_NV_ComputeCap_Minor().
 */
inline unsigned int CCLDeviceParams::n_Revision_Minor() const
{
	const unsigned int n_minor = n_NV_ComputeCap_Minor();
	return n_minor;
}

/**
 *	@brief gets the number of multiprocessors
 *	@return Returns the stored multiprocessor count.
 */
inline size_t CCLDeviceParams::n_Multiprocessor_Num() const
{
	return this->m_n_multiprocessor_num;
}

/**
 *	@brief gets the device memory size
 *	@return Returns the stored memory size (the same quantity that
 *		n_Memory_Size_MB() divides by 1048576 to get megabytes).
 */
inline uint64_t CCLDeviceParams::n_Memory_Size() const
{
	return this->m_n_memory_size;
}

/**
 *	@brief gets the device memory size in megabytes
 *	@return Returns the stored memory size divided by 1048576, rounded up.
 */
inline size_t CCLDeviceParams::n_Memory_Size_MB() const
{
	const uint64_t n_size_MB = (m_n_memory_size + 1048575) / 1048576;
	// round up to whole megabytes

	_ASSERTE(n_size_MB <= SIZE_MAX); // should fit
	return size_t(n_size_MB);
}

inline bool CCLDeviceParams::b_Has_KernelExecTimeout() const
{
	return m_b_kernel_exec_timeout;
}

/**
 *	@brief gets the device properties structure
 *	@return Returns const reference to the stored device properties.
 */
inline const CCLDeviceParams::CLdevprop &CCLDeviceParams::t_Properties() const
{
	return this->m_t_devprop;
}

inline const std::string &CCLDeviceParams::s_Name() const
{
	return m_s_name;
}

inline const char *CCLDeviceParams::p_s_Name() const
{
	return m_s_name.c_str();
}

/**
 *	@brief decides whether a problem of given dimensions can be processed in a single pass
 *
 *	Each dimension is divided by the corresponding maximal grid size (rounding up)
 *	and the results are checked against the per-dimension and the total thread limits.
 *
 *	@param[in] n_width is width of the problem
 *	@param[in] n_height is height of the problem
 *	@param[in] n_depth is depth of the problem
 *
 *	@return Returns true if the problem fits, otherwise returns false.
 */
bool CCLDeviceParams::b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const
{
	int p_block_dim[3] = {n_width, n_height, n_depth};
	for(int i = 0; i < 3; ++ i) {
		p_block_dim[i] = (p_block_dim[i] + m_t_devprop.maxGridSize[i] - 1) /
			m_t_devprop.maxGridSize[i];
	}
	// calculate dimensions, relative to maximal grid size (round up)

	for(int i = 0; i < 3; ++ i) {
		if(p_block_dim[i] > m_t_devprop.maxThreadsDim[i])
			return false;
	}
	// those dimensions must not exceed block size

	return !(p_block_dim[0] * p_block_dim[1] * p_block_dim[2] > m_t_devprop.maxThreadsPerBlock);
	// number of threads in the block must be below limit
}

/**
 *	@brief calculates block and grid sizes for a problem of given dimensions
 *
 *	@param[out] p_block_size is array of three integers, filled with the block dimensions
 *	@param[out] p_grid_size is array of three integers, filled with the grid dimensions
 *	@param[in] n_width is width of the problem
 *	@param[in] n_height is height of the problem
 *	@param[in] n_depth is depth of the problem
 *
 *	@return Returns true on success. Returns false if the problem does not fit at once
 *		(see b_ProblemFitsAtOnce()) or if any of the dimensions is empty or negative;
 *		the output arrays are not written in that case.
 */
bool CCLDeviceParams::CalculateGridParams(int *p_block_size, int *p_grid_size,
	int n_width, int n_height, int n_depth) const
{
	if(!b_ProblemFitsAtOnce(n_width, n_height, n_depth))
		return false;
	// @todo - handle subdivided problems too

	int n_blk_width = (n_width + m_t_devprop.maxGridSize[0] - 1) / m_t_devprop.maxGridSize[0];
	int n_blk_height = (n_height + m_t_devprop.maxGridSize[1] - 1) / m_t_devprop.maxGridSize[1];
	int n_blk_depth = (n_depth + m_t_devprop.maxGridSize[2] - 1) / m_t_devprop.maxGridSize[2];
	// calculate block dimensions (lower bound)

	if(n_blk_width < 1 || n_blk_height < 1 || n_blk_depth < 1)
		return false;
	// a zero (or negative) problem dimension yields a block dimension below one,
	// which would be used as a divisor below; reject instead of dividing by zero

	// @todo - optimize block dimensions to approach m_t_devprop.maxThreadsPerBlock as closely as possible (that is good thing to do, right?)

	p_block_size[0] = n_blk_width;
	p_block_size[1] = n_blk_height;
	p_block_size[2] = n_blk_depth;
	p_grid_size[0] = (n_width + n_blk_width - 1) / n_blk_width;
	p_grid_size[1] = (n_height + n_blk_height - 1) / n_blk_height;
	p_grid_size[2] = (n_depth + n_blk_depth - 1) / n_blk_depth;
	// store block sizes, and grid size

	return true;
}

/*
 *								=== ~CCLDeviceParams ===
 */

/*
 *								=== CCLNonmanagedWrapper ===
 */

/**
 *	@brief constructor; stores the handle without taking any action on it
 *	@param[in] h_handle is the handle to be wrapped
 */
template <class CCLHandle, class CInterface>
inline CCLNonmanagedWrapper<CCLHandle, CInterface>::CCLNonmanagedWrapper(CCLHandle h_handle /*= 0*/)
{
	this->m_h_handle = h_handle;
}

/**
 *	@brief copy-constructor; both wrappers refer to the same handle afterwards
 *	@param[in] r_other is the wrapper to copy the handle from
 */
template <class CCLHandle, class CInterface>
inline CCLNonmanagedWrapper<CCLHandle, CInterface>::CCLNonmanagedWrapper(const CCLNonmanagedWrapper &r_other)
{
	this->m_h_handle = r_other.m_h_handle;
}

/**
 *	@brief copy operator; both wrappers refer to the same handle afterwards
 *	@param[in] r_other is the wrapper to copy the handle from
 *	@return Returns reference to this.
 */
template <class CCLHandle, class CInterface>
inline CCLNonmanagedWrapper<CCLHandle, CInterface> &CCLNonmanagedWrapper<CCLHandle,
	CInterface>::operator =(const CCLNonmanagedWrapper &r_other)
{
	this->m_h_handle = r_other.m_h_handle;
	// the previous handle is simply overwritten, this wrapper does not own it

	return *this;
}

/**
 *	@brief handle assignment operator
 *	@param[in] h_handle is the new handle to be wrapped
 *	@return Returns reference to this.
 */
template <class CCLHandle, class CInterface>
inline CCLNonmanagedWrapper<CCLHandle, CInterface> &CCLNonmanagedWrapper<CCLHandle,
	CInterface>::operator =(CCLHandle h_handle)
{
	this->m_h_handle = h_handle;
	// the previous handle is simply overwritten, this wrapper does not own it

	return *this;
}

/**
 *	@brief conversion to the wrapped handle type
 *	@return Returns the wrapped handle.
 */
template <class CCLHandle, class CInterface>
inline CCLNonmanagedWrapper<CCLHandle, CInterface>::operator CCLHandle() const
{
	return this->m_h_handle;
}

/**
 *	@brief address-of operator
 *	@return Returns const pointer to the wrapped handle (not to the wrapper object).
 */
template <class CCLHandle, class CInterface>
inline const CCLHandle *CCLNonmanagedWrapper<CCLHandle, CInterface>::operator &() const
{
	return &this->m_h_handle;
}

/*
 *								=== ~CCLNonmanagedWrapper ===
 */

/*
 *								=== CCLUniqueWrapper ===
 */

/**
 *	@brief constructor; stores the handle, which is passed to
 *		CInterface::Destroy() by the destructor
 *	@param[in] h_handle is the handle to be wrapped
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapper<CCLHandle, CInterface>::CCLUniqueWrapper(CCLHandle h_handle /*= 0*/)
{
	this->m_h_handle = h_handle;
}

/**
 *	@brief ownership-transferring copy-constructor
 *	@param[in,out] r_other is the wrapper to take the handle from
 *		(its handle is set to zero by h_YieldOwnership())
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapper<CCLHandle, CInterface>::CCLUniqueWrapper(/*const*/ CCLUniqueWrapper &r_other)
{
	CCLHandle h_taken = r_other.h_YieldOwnership();
	this->m_h_handle = h_taken;
}

/**
 *	@brief destructor; passes the wrapped handle to CInterface::Destroy()
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapper<CCLHandle, CInterface>::~CCLUniqueWrapper()
{
	CInterface::Destroy(this->m_h_handle);
}

/**
 *	@brief handle assignment operator; destroys the currently owned handle
 *		and takes ownership of the new one
 *	@param[in] h_handle is the new handle to be owned
 *	@return Returns reference to this.
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapper<CCLHandle, CInterface>
	&CCLUniqueWrapper<CCLHandle, CInterface>::operator =(CCLHandle h_handle)
{
	CInterface::Destroy(CInterface::m_h_handle); // destroy the owned handle, if any
	// do not call the destructor explicitly here: that would end the lifetime
	// of this object and every later use of it would be undefined behavior

	CInterface::m_h_handle = h_handle;
	return *this;
}

/**
 *	@brief ownership-transferring copy operator; destroys the currently
 *		owned handle and takes over the handle of the other wrapper
 *	@param[in,out] r_other is the wrapper to take the handle from
 *		(its handle is set to zero by h_YieldOwnership())
 *	@return Returns reference to this.
 *	@note Self-assignment is safe and leaves the wrapper unchanged.
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapper<CCLHandle, CInterface>
	&CCLUniqueWrapper<CCLHandle, CInterface>::operator =(/*const*/ CCLUniqueWrapper &r_other)
{
	CCLHandle h_taken = /*const_cast<CCLUniqueWrapper&>*/(r_other).h_YieldOwnership();
	// take the handle first; in case of self-assignment this also zeroes the handle
	// of this, so that the handle is not destroyed below and then stored again

	CInterface::Destroy(CInterface::m_h_handle); // destroy the owned handle, if any
	CInterface::m_h_handle = h_taken;
	return *this;
}

/**
 *	@brief destroys the owned handle (via CInterface::Destroy())
 *		and sets the stored handle to zero
 */
template <class CCLHandle, class CInterface>
inline void CCLUniqueWrapper<CCLHandle, CInterface>::Destroy()
{
	CInterface::Destroy(this->m_h_handle); // destroy the owned handle, if any
	this->m_h_handle = 0; // nothing is owned anymore
}

/**
 *	@brief gives up the ownership of the handle
 *	@return Returns the handle that was stored so far; the stored handle
 *		is set to zero, so this wrapper will not destroy the returned one.
 */
template <class CCLHandle, class CInterface>
inline CCLHandle CCLUniqueWrapper<CCLHandle, CInterface>::h_YieldOwnership()
{
	CCLHandle h_released = this->m_h_handle;
	this->m_h_handle = 0;
	return h_released;
}

/**
 *	@brief swaps the handles of two wrappers
 *	@param[in,out] r_other is the wrapper to swap with
 */
template <class CCLHandle, class CInterface>
inline void CCLUniqueWrapper<CCLHandle, CInterface>::Swap(CCLUniqueWrapper &r_other)
{
	std::swap(this->m_h_handle, r_other.m_h_handle);
}

/**
 *	@brief gets the wrapped handle
 *	@return Returns the stored handle; the ownership is not affected.
 */
template <class CCLHandle, class CInterface>
inline CCLHandle CCLUniqueWrapper<CCLHandle, CInterface>::h_Get/*_Handle*/() const
{
	return this->m_h_handle;
}

/*
 *								=== ~CCLUniqueWrapper ===
 */

/*
 *								=== CCLUniqueWrapperEx ===
 */

/**
 *	@brief constructor; forwards the handle to the CCLUniqueWrapper constructor
 *	@param[in] h_handle is the handle to be wrapped
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapperEx<CCLHandle, CInterface>::CCLUniqueWrapperEx(CCLHandle h_handle /*= 0*/)
	:CCLUniqueWrapper<CCLHandle, CInterface>(h_handle)
{
	// all the work is done by the base class
}

/**
 *	@brief ownership-transferring copy-constructor; forwards
 *		to the CCLUniqueWrapper copy-constructor
 *	@param[in,out] r_other is the wrapper to take the handle from
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapperEx<CCLHandle, CInterface>::CCLUniqueWrapperEx(/*const*/ CCLUniqueWrapperEx &r_other)
	:CCLUniqueWrapper<CCLHandle, CInterface>(r_other)
{
	// all the work is done by the base class
}

/**
 *	@brief conversion to the wrapped handle type
 *	@return Returns the stored handle; the ownership is not affected.
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapperEx<CCLHandle, CInterface>::operator CCLHandle() const
{
	return CInterface::m_h_handle;
}

/**
 *	@brief address-of operator
 *	@return Returns const pointer to the stored handle (not to the wrapper object).
 */
template <class CCLHandle, class CInterface>
inline const CCLHandle *CCLUniqueWrapperEx<CCLHandle, CInterface>::operator &() const
{
	return &CInterface::m_h_handle;
}

/*template <class CCLHandle, class CInterface>
inline CCLHandle *CCLUniqueWrapperEx<CCLHandle, CInterface>::operator &()
{
	return &this->m_h_handle;
}*/

/**
 *	@brief ownership-transferring copy operator; delegates
 *		to the copy operator of CCLUniqueWrapper
 *	@param[in,out] r_other is the wrapper to take the handle from
 *	@return Returns reference to this.
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapperEx<CCLHandle, CInterface>
	&CCLUniqueWrapperEx<CCLHandle, CInterface>::operator =(/*const*/ CCLUniqueWrapperEx &r_other)
{
	typedef CCLUniqueWrapper<CCLHandle, CInterface> _TyBase;

	_TyBase &r_base_this = static_cast<_TyBase&>(*this);
	/*const*/ _TyBase &r_base_other = static_cast</*const*/ _TyBase&>(r_other);
	r_base_this = r_base_other;
	// let the base class handle the transfer

	return *this;
}

/**
 *	@brief handle assignment operator; destroys the currently owned handle
 *		and takes ownership of the new one
 *	@param[in] h_handle is the new handle to be owned
 *	@return Returns reference to this.
 */
template <class CCLHandle, class CInterface>
inline CCLUniqueWrapperEx<CCLHandle, CInterface>
	&CCLUniqueWrapperEx<CCLHandle, CInterface>::operator =(CCLHandle h_handle)
{
	CInterface::Destroy(this->m_h_handle); // destroy the owned handle, if any
	this->m_h_handle = h_handle; // own the new one
	return *this;
}

/**
 *	@brief swaps the handles of two wrappers
 *	@param[in,out] r_other is the wrapper to swap with
 */
template <class CCLHandle, class CInterface>
inline void CCLUniqueWrapperEx<CCLHandle, CInterface>::Swap(CCLUniqueWrapperEx &r_other)
{
	std::swap(CInterface::m_h_handle, r_other.m_h_handle);
}

/*
 *								=== ~CCLUniqueWrapperEx ===
 */

/*
 *								=== CCLEventInterface ===
 */

/**
 *	@brief releases an event using clReleaseEvent()
 *	@param[in] h_event is the event to be released (null handle is ignored)
 */
inline void CCLEventInterface::Destroy(cl_event h_event)
{
	if(!h_event)
		return;
	// nothing to release

	clReleaseEvent(h_event);
}

/*
 *								=== ~CCLEventInterface ===
 */

/*
 *								=== CCLKernelDestructor ===
 */

/**
 *	@brief releases a kernel using clReleaseKernel()
 *	@param[in] h_kernel is the kernel to be released (null handle is ignored)
 */
inline void CCLKernelDestructor::Destroy(cl_kernel h_kernel)
{
	if(!h_kernel)
		return;
	// nothing to release

	clReleaseKernel(h_kernel);
}

/*
 *								=== ~CCLKernelDestructor ===
 */

/*
 *								=== CCLMemObjectDestructor ===
 */

/**
 *	@brief releases a memory object using clReleaseMemObject()
 *	@param[in] h_mem_object is the memory object to be released (null handle is ignored)
 */
inline void CCLMemObjectDestructor::Destroy(cl_mem h_mem_object)
{
	if(!h_mem_object)
		return;
	// nothing to release

	clReleaseMemObject(h_mem_object);
}

/*
 *								=== ~CCLMemObjectDestructor ===
 */

/*
 *								=== CCLMemObjectInterface ===
 */

/**
 *	@brief queries CL_MEM_SIZE of the memory object
 *	@param[out] r_n_size is the destination for the queried value
 *	@return Returns the result of clGetMemObjectInfo().
 */
inline CLresult CCLMemObjectInterface::n_Query_Size(size_t &r_n_size) const
{
	const cl_int n_result = clGetMemObjectInfo(m_h_handle,
		CL_MEM_SIZE, sizeof(size_t), &r_n_size, 0);
	return (CLresult)n_result;
}

/**
 *	@brief queries CL_MEM_FLAGS of the memory object
 *	@param[out] r_n_flags is the destination for the queried value
 *	@return Returns the result of clGetMemObjectInfo().
 */
inline CLresult CCLMemObjectInterface::n_Query_Flags(cl_mem_flags &r_n_flags) const
{
	return (CLresult)clGetMemObjectInfo(m_h_handle, CL_MEM_FLAGS, sizeof(cl_mem_flags), &r_n_flags, 0);
	// the size must be that of the destination type; cl_mem_flags is a 64-bit bitfield,
	// sizeof(size_t) is too small for it in 32-bit builds and the query then fails
}

/**
 *	@brief queries CL_MEM_TYPE of the memory object
 *	@param[out] r_n_type is the destination for the queried value
 *	@return Returns the result of clGetMemObjectInfo().
 */
inline CLresult CCLMemObjectInterface::n_Query_Type(cl_mem_object_type &r_n_type) const
{
	return (CLresult)clGetMemObjectInfo(m_h_handle, CL_MEM_TYPE, sizeof(cl_mem_object_type), &r_n_type, 0);
	// the size must be that of the destination type; sizeof(size_t) would claim
	// more storage than cl_mem_object_type actually has in 64-bit builds
}

/**
 *	@brief queries CL_MEM_ASSOCIATED_MEMOBJECT of the memory object
 *	@param[out] r_h_parent is the destination for the queried handle
 *	@return Returns the result of clGetMemObjectInfo().
 */
inline CLresult CCLMemObjectInterface::n_Query_Parent(cl_mem &r_h_parent) const
{
	const cl_int n_result = clGetMemObjectInfo(m_h_handle,
		CL_MEM_ASSOCIATED_MEMOBJECT, sizeof(cl_mem), &r_h_parent, 0);
	return (CLresult)n_result;
}

/**
 *	@brief queries CL_MEM_OFFSET of the memory object
 *	@param[out] r_n_parent_offset is the destination for the queried value
 *	@return Returns the result of clGetMemObjectInfo().
 */
inline CLresult CCLMemObjectInterface::n_Query_ParentOffset(size_t &r_n_parent_offset) const
{
	const cl_int n_result = clGetMemObjectInfo(m_h_handle,
		CL_MEM_OFFSET, sizeof(size_t), &r_n_parent_offset, 0);
	return (CLresult)n_result;
}

/**
 *	@brief creates a sub-buffer of this memory object
 *
 *	@param[in] n_offset is offset of the sub-buffer region
 *	@param[in] n_size is size of the sub-buffer region
 *	@param[in] n_flags is memory flags for the sub-buffer
 *
 *	@return Returns handle of the new sub-buffer on success, null handle on failure
 *		(the error code is lost; use n_Create_SubBuffer() to get it).
 */
inline cl_mem CCLMemObjectInterface::h_Create_SubBuffer(size_t n_offset,
	size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/)
{
	cl_mem h_sub_buffer;
	if(n_Create_SubBuffer(h_sub_buffer, n_offset, n_size, n_flags) == CL_SUCCESS)
		return h_sub_buffer;
	return cl_mem(0);
}

/**
 *	@brief creates a sub-buffer of this memory object using clCreateSubBuffer()
 *
 *	@param[out] r_h_sub_buffer is filled with the value returned by clCreateSubBuffer()
 *	@param[in] n_offset is origin of the sub-buffer region
 *	@param[in] n_size is size of the sub-buffer region
 *	@param[in] n_flags is memory flags for the sub-buffer
 *
 *	@return Returns the error code reported by clCreateSubBuffer().
 */
inline CLresult CCLMemObjectInterface::n_Create_SubBuffer(cl_mem &r_h_sub_buffer,
	size_t n_offset, size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/) const
{
	cl_buffer_region t_sub_region;
	t_sub_region.origin = n_offset;
	t_sub_region.size = n_size;
	// describe the region of the parent buffer

	cl_int n_error_code;
	r_h_sub_buffer = clCreateSubBuffer(m_h_handle, n_flags,
		CL_BUFFER_CREATE_TYPE_REGION, &t_sub_region, &n_error_code);
	return (CLresult)n_error_code;
}

/**
 *	@brief creates a sub-buffer of this memory object and stores it in a managed wrapper
 *
 *	@param[out] r_sub_buffer is assigned the handle of the new sub-buffer
 *		(the assignment takes place regardless of the result)
 *	@param[in] n_offset is offset of the sub-buffer region
 *	@param[in] n_size is size of the sub-buffer region
 *	@param[in] n_flags is memory flags for the sub-buffer
 *
 *	@return Returns the result of the handle-based n_Create_SubBuffer() overload.
 */
inline CLresult CCLMemObjectInterface::n_Create_SubBuffer(CCLUniqueMem &r_sub_buffer,
	size_t n_offset, size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/) const
{
	cl_mem h_new_sub_buffer;
	const CLresult n_create_result = n_Create_SubBuffer(h_new_sub_buffer, n_offset, n_size, n_flags);
	// create using the handle-based overload

	r_sub_buffer = h_new_sub_buffer;
	// hand the handle over to the wrapper

	return n_create_result;
}

/*
 *								=== ~CCLMemObjectInterface ===
 */

/*
 *								=== CCLCommandQueueInterface ===
 */

/**
 *	@brief releases a command queue using clReleaseCommandQueue()
 *	@param[in] h_cmd_queue is the command queue to be released (null handle is ignored)
 */
inline void CCLCommandQueueInterface::Destroy(cl_command_queue h_cmd_queue)
{
	if(!h_cmd_queue)
		return;
	// nothing to release

	clReleaseCommandQueue(h_cmd_queue);
}

#if 0 // deprecated in OpenCL 1.1

/**
 *	@brief sets command queue properties (disabled code, kept for reference only)
 *
 *	@param[in] n_properties is the properties to be changed
 *	@param[in] b_enable is the new state of the properties
 *	@param[out] p_old_properties is optional destination for the previous properties
 *
 *	@return Returns the result of clSetCommandQueueProperty().
 *
 *	@note NOTE(review): the call below does not pass the queue handle (m_h_handle)
 *		and the result is not cast to CLresult as everywhere else in this file;
 *		this presumably needs fixing before the code could be enabled again.
 */
inline CLresult CCLCommandQueueInterface::n_SetProperty(cl_command_queue_properties n_properties, bool b_enable,
	cl_command_queue_properties *p_old_properties /*= 0*/)
{
	return clSetCommandQueueProperty(n_properties, b_enable, p_old_properties);
}

/**
 *	@brief enables or disables profiling (disabled code, kept for reference only)
 *	@param[in] b_enable is the new state of CL_QUEUE_PROFILING_ENABLE
 *	@return Returns the result of n_SetProperty().
 */
inline CLresult CCLCommandQueueInterface::n_Enable_Profiling(bool b_enable /*= true*/)
{
	return n_SetProperty(CL_QUEUE_PROFILING_ENABLE, b_enable);
}

#endif // 0

/**
 *	@brief enqueues a buffer read using clEnqueueReadBuffer()
 *
 *	@param[in] h_device_src is the source buffer
 *	@param[in] b_blocking is the blocking read flag
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to read
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of clEnqueueReadBuffer().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_ReadBuffer(cl_mem h_device_src, bool b_blocking, size_t n_offset,
	size_t n_size, void *p_host_dest, cl_uint n_wait_for_event_num /*= 0*/,
	const cl_event *p_wait_for_event /*= 0*/, cl_event *p_finished_event /*= 0*/) const
{
	const cl_int n_result = clEnqueueReadBuffer(m_h_handle, h_device_src, b_blocking,
		n_offset, n_size, p_host_dest, n_wait_for_event_num,
		p_wait_for_event, p_finished_event);
	return (CLresult)n_result;
}

/**
 *	@brief enqueues a buffer write using clEnqueueWriteBuffer()
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] b_blocking is the blocking write flag
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] n_size is size of the data to write
 *	@param[in] p_host_src is the source host memory
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of clEnqueueWriteBuffer().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_WriteBuffer(cl_mem h_device_dest, bool b_blocking, size_t n_offset,
	size_t n_size, const void *p_host_src, cl_uint n_wait_for_event_num /*= 0*/,
	const cl_event *p_wait_for_event /*= 0*/, cl_event *p_finished_event /*= 0*/) const
{
	const cl_int n_result = clEnqueueWriteBuffer(m_h_handle, h_device_dest, b_blocking,
		n_offset, n_size, p_host_src, n_wait_for_event_num,
		p_wait_for_event, p_finished_event);
	return (CLresult)n_result;
}

/**
 *	@brief enqueues a buffer to buffer copy using clEnqueueCopyBuffer()
 *
 *	@param[in] h_device_src is the source buffer
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_src_offset is offset in the source buffer
 *	@param[in] n_dest_offset is offset in the destination buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of clEnqueueCopyBuffer().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_CopyBuffer(cl_mem h_device_src, cl_mem h_device_dest,
	size_t n_src_offset, size_t n_dest_offset, size_t n_size,
	cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const cl_int n_result = clEnqueueCopyBuffer(m_h_handle, h_device_src, h_device_dest,
		n_src_offset, n_dest_offset, n_size, n_wait_for_event_num,
		p_wait_for_event, p_finished_event);
	return (CLresult)n_result;
}

/**
 *	@brief enqueues a blocking device to host copy
 *
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of n_Enqueue_ReadBuffer() with the blocking flag set.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoH(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
	size_t n_size, cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const bool b_blocking = true;
	return n_Enqueue_ReadBuffer(h_device_src, b_blocking, n_offset, n_size,
		p_host_dest, n_wait_for_event_num, p_wait_for_event, p_finished_event);
}

/**
 *	@brief enqueues a non-blocking device to host copy
 *
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of n_Enqueue_ReadBuffer() with the blocking flag cleared.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoH_Async(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
	size_t n_size, cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const bool b_blocking = false;
	return n_Enqueue_ReadBuffer(h_device_src, b_blocking, n_offset, n_size,
		p_host_dest, n_wait_for_event_num, p_wait_for_event, p_finished_event);
}

/**
 *	@brief enqueues a blocking host to device copy
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] p_host_src is the source host memory
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of n_Enqueue_WriteBuffer() with the blocking flag set.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_HtoD(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
	size_t n_size, cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const bool b_blocking = true;
	return n_Enqueue_WriteBuffer(h_device_dest, b_blocking, n_offset, n_size,
		p_host_src, n_wait_for_event_num, p_wait_for_event, p_finished_event);
}

/**
 *	@brief enqueues a non-blocking host to device copy
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] p_host_src is the source host memory (as the write is not blocking,
 *		the memory needs to stay valid until the operation finishes)
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of n_Enqueue_WriteBuffer() with the blocking flag cleared.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_HtoD_Async(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
	size_t n_size, cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	return (CLresult)n_Enqueue_WriteBuffer(h_device_dest, false, n_offset, n_size, p_host_src,
		n_wait_for_event_num, p_wait_for_event, p_finished_event);
	// the blocking flag must be false here (as in n_Enqueue_Memcpy_DtoH_Async()),
	// otherwise this would behave exactly as the synchronous n_Enqueue_Memcpy_HtoD()
}

/**
 *	@brief enqueues a device to device copy
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_dest_offset is offset in the destination buffer
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_src_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of n_Enqueue_CopyBuffer().
 *
 *	@note The argument order is destination first here,
 *		while n_Enqueue_CopyBuffer() takes the source first.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoD(cl_mem h_device_dest, size_t n_dest_offset,
	cl_mem h_device_src, size_t n_src_offset, size_t n_size,
	cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const CLresult n_result = n_Enqueue_CopyBuffer(h_device_src, h_device_dest,
		n_src_offset, n_dest_offset, n_size, n_wait_for_event_num,
		p_wait_for_event, p_finished_event);
	return n_result;
}

/**
 *	@brief enqueues mapping of a buffer using clEnqueueMapBuffer()
 *
 *	@param[out] r_p_buffer is filled with the pointer returned by clEnqueueMapBuffer()
 *	@param[in] h_buffer is the buffer to be mapped
 *	@param[in] b_blocking_map is the blocking map flag
 *	@param[in] n_map_flags is the map flags
 *	@param[in] n_offset is offset of the mapped region
 *	@param[in] n_size is size of the mapped region
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the error code reported by clEnqueueMapBuffer().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_MapBuffer(void *&r_p_buffer, cl_mem h_buffer, bool b_blocking_map, 
	cl_map_flags n_map_flags, size_t n_offset, size_t n_size,
	cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	cl_int n_error_code;
	void *p_mapped = clEnqueueMapBuffer(m_h_handle, h_buffer, b_blocking_map, n_map_flags,
		n_offset, n_size, n_wait_for_event_num, p_wait_for_event, p_finished_event, &n_error_code);
	r_p_buffer = p_mapped;
	return (CLresult)n_error_code;
}

/**
 *	@brief enqueues unmapping of a memory object using clEnqueueUnmapMemObject()
 *
 *	@param[in] p_buffer is the mapped pointer
 *	@param[in] h_buffer is the memory object
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] p_finished_event is optional destination for the completion event
 *
 *	@return Returns the result of clEnqueueUnmapMemObject().
 *
 *	@note The argument order is the pointer first here,
 *		while clEnqueueUnmapMemObject() takes the memory object first.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_UnmapMemObject(void *p_buffer, cl_mem h_buffer,
	cl_uint n_wait_for_event_num /*= 0*/, const cl_event *p_wait_for_event /*= 0*/,
	cl_event *p_finished_event /*= 0*/) const
{
	const cl_int n_result = clEnqueueUnmapMemObject(m_h_handle, h_buffer,
		p_buffer, n_wait_for_event_num, p_wait_for_event, p_finished_event);
	return (CLresult)n_result;
}

/**
 *	@brief enqueues a marker using clEnqueueMarker()
 *	@param[out] p_event is destination for the marker event (must not be null)
 *	@return Returns the result of clEnqueueMarker().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Marker(cl_event *p_event) const
{
	_ASSERTE(p_event);
	const cl_int n_result = clEnqueueMarker(m_h_handle, p_event);
	return (CLresult)n_result;
}

// ---

/**
 *	@brief enqueues a buffer read; version with managed completion event
 *
 *	@param[in] h_device_src is the source buffer
 *	@param[in] b_blocking is the blocking read flag
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to read
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_ReadBuffer(cl_mem h_device_src, bool b_blocking, size_t n_offset,
	size_t n_size, void *p_host_dest, cl_uint n_wait_for_event_num,
	const cl_event *p_wait_for_event, CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_ReadBuffer(h_device_src, b_blocking,
		n_offset, n_size, p_host_dest, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a buffer write; version with managed completion event
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] b_blocking is the blocking write flag
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] n_size is size of the data to write
 *	@param[in] p_host_src is the source host memory
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_WriteBuffer(cl_mem h_device_dest, bool b_blocking, size_t n_offset,
	size_t n_size, const void *p_host_src, cl_uint n_wait_for_event_num,
	const cl_event *p_wait_for_event, CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_WriteBuffer(h_device_dest, b_blocking,
		n_offset, n_size, p_host_src, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a buffer to buffer copy; version with managed completion event
 *
 *	@param[in] h_device_src is the source buffer
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_src_offset is offset in the source buffer
 *	@param[in] n_dest_offset is offset in the destination buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_CopyBuffer(cl_mem h_device_src, cl_mem h_device_dest,
	size_t n_src_offset, size_t n_dest_offset, size_t n_size,
	cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_CopyBuffer(h_device_src, h_device_dest, n_src_offset,
		n_dest_offset, n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a blocking device to host copy; version with managed completion event
 *
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoH(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
	size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Memcpy_DtoH(p_host_dest, h_device_src, n_offset,
		n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a non-blocking device to host copy; version with managed completion event
 *
 *	@param[out] p_host_dest is the destination host memory
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoH_Async(void *p_host_dest, cl_mem h_device_src, size_t n_offset,
	size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Memcpy_DtoH_Async(p_host_dest, h_device_src, n_offset,
		n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a blocking host to device copy; version with managed completion event
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] p_host_src is the source host memory
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_HtoD(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
	size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Memcpy_HtoD(h_device_dest, n_offset, p_host_src,
		n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a host to device copy using the cl_event based
 *		n_Enqueue_Memcpy_HtoD_Async(); version with managed completion event
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset is offset in the destination buffer
 *	@param[in] p_host_src is the source host memory
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_HtoD_Async(cl_mem h_device_dest, size_t n_offset, const void *p_host_src,
	size_t n_size, cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Memcpy_HtoD_Async(h_device_dest, n_offset, p_host_src,
		n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a device to device copy; version with managed completion event
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_dest_offset is offset in the destination buffer
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_src_offset is offset in the source buffer
 *	@param[in] n_size is size of the data to copy
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Memcpy_DtoD(cl_mem h_device_dest, size_t n_dest_offset,
	cl_mem h_device_src, size_t n_src_offset, size_t n_size,
	cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Memcpy_DtoD(h_device_dest, n_dest_offset, h_device_src,
		n_src_offset, n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues mapping of a buffer; version with managed completion event
 *
 *	@param[out] r_p_buffer is filled with the mapped pointer
 *	@param[in] h_buffer is the buffer to be mapped
 *	@param[in] b_blocking_map is the blocking map flag
 *	@param[in] n_map_flags is the map flags
 *	@param[in] n_offset is offset of the mapped region
 *	@param[in] n_size is size of the mapped region
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_MapBuffer(void *&r_p_buffer, cl_mem h_buffer, bool b_blocking_map, 
	cl_map_flags n_map_flags, size_t n_offset, size_t n_size,
	cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_MapBuffer(r_p_buffer, h_buffer, b_blocking_map,
		n_map_flags, n_offset, n_size, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues unmapping of a memory object; version with managed completion event
 *
 *	@param[in] p_buffer is the mapped pointer
 *	@param[in] h_buffer is the memory object
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@param[out] r_finished_event is assigned the completion event
 *		(or a null handle if no event was returned)
 *
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_UnmapMemObject(void *p_buffer, cl_mem h_buffer,
	cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event,
	CCLUniqueEvent &r_finished_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_UnmapMemObject(p_buffer, h_buffer, n_wait_for_event_num, p_wait_for_event, &h_event);
	r_finished_event = h_event; // properly deletes the previously owned event
	return n_result;
}

/**
 *	@brief enqueues a marker; version with managed event
 *	@param[out] r_event is assigned the marker event
 *		(or a null handle if no event was returned)
 *	@return Returns the result of the cl_event based overload.
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Marker(CCLUniqueEvent &r_event) const
{
	cl_event h_event = 0;
	// initialize; if the enqueue fails, the handle may not be written and the wrapper
	// below would take ownership of (and later release) an uninitialized handle

	CLresult n_result = n_Enqueue_Marker(&h_event);
	r_event = h_event; // properly deletes the previously owned event
	return n_result;
}

// ---

/**
 *	@brief enqueues a wait for events using clEnqueueWaitForEvents()
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of events to wait for
 *	@return Returns the result of clEnqueueWaitForEvents().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_WaitForEvents(cl_uint n_wait_for_event_num,
	const cl_event *p_wait_for_event) const
{
	const cl_int n_result = clEnqueueWaitForEvents(m_h_handle,
		n_wait_for_event_num, p_wait_for_event);
	return (CLresult)n_result;
}

/**
 *	@brief enqueues a wait for events; version with managed events
 *
 *	If CCLUniqueEvent has the same size as cl_event, the array is passed
 *	to clEnqueueWaitForEvents() directly. Otherwise the events are waited
 *	for one by one, stopping at the first nonzero result.
 *
 *	@param[in] n_wait_for_event_num is number of events to wait for
 *	@param[in] p_wait_for_event is list of managed events to wait for
 *
 *	@return Returns the result of clEnqueueWaitForEvents() (in the one by one
 *		case the first nonzero result, or cl_Success if there was none).
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_WaitForEvents(cl_uint n_wait_for_event_num,
	const CCLUniqueEvent *p_wait_for_event) const
{
	if(sizeof(CCLUniqueEvent) != sizeof(cl_event)) { // compiletime constant
		for(cl_uint n_event = 0; n_event < n_wait_for_event_num; ++ n_event) {
			cl_event h_single_event = p_wait_for_event[n_event].h_Get();
			CLresult n_wait_result = (CLresult)clEnqueueWaitForEvents(m_h_handle, 1, &h_single_event);
			if(n_wait_result)
				return n_wait_result;
		}
		// if for some reason the event handle gets padded, we need to wait
		// for them one by one (could also collect the handles to an array)
		// this will be less efficient but shouldn't introduce any deadlock

		return cl_Success;
	}

	return (CLresult)clEnqueueWaitForEvents(m_h_handle,
		n_wait_for_event_num, (const cl_event*)p_wait_for_event);
	// CCLUniqueEvent is a thin wrapper, can pass directly
}

/**
 *	@brief enqueues a barrier using clEnqueueBarrier()
 *	@return Returns the result of clEnqueueBarrier().
 */
inline CLresult CCLCommandQueueInterface::n_Enqueue_Barrier() const
{
	const cl_int n_result = clEnqueueBarrier(m_h_handle);
	return (CLresult)n_result;
}

/**
 *	@brief calls clFinish() on the command queue
 *	@return Returns the result of clFinish().
 */
inline CLresult CCLCommandQueueInterface::n_Finish() const
{
	const cl_int n_result = clFinish(m_h_handle);
	return (CLresult)n_result;
}

/**
 *	@brief gets the command queue handle
 *	@return Returns the stored command queue handle.
 */
inline cl_command_queue CCLCommandQueueInterface::h_Handle() const
{
	return this->m_h_handle;
}

/*
 *								=== ~CCLCommandQueueInterface ===
 */

/*
 *								=== CCLContextInterface ===
 */

/**
 *	@brief releases a context using clReleaseContext()
 *	@param[in] h_context is the context to be released (null handle is ignored)
 */
inline void CCLContextInterface::Destroy(cl_context h_context)
{
	if(!h_context)
		return;
	// nothing to release

	clReleaseContext(h_context);
}

/**
 *	@brief creates a buffer without a host pointer using clCreateBuffer()
 *
 *	@param[out] r_h_mem is filled with the value returned by clCreateBuffer()
 *	@param[in] n_size is size of the buffer
 *	@param[in] n_flags is memory flags
 *
 *	@return Returns the error code reported by clCreateBuffer().
 */
inline CLresult CCLContextInterface::n_CreateBuffer(cl_mem &r_h_mem,
	size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/) const
{
	cl_int n_error_code;
	cl_mem h_new_mem = clCreateBuffer(m_h_handle, n_flags, n_size, 0, &n_error_code);
	r_h_mem = h_new_mem;
	return (CLresult)n_error_code;
}

/**
 *	@brief creates a buffer with a host pointer using clCreateBuffer()
 *
 *	@param[out] r_h_mem is filled with the value returned by clCreateBuffer()
 *	@param[in] n_size is size of the buffer
 *	@param[in] p_host_pointer is the host pointer
 *	@param[in] n_flags is memory flags (the combination is checked
 *		by debug assertions only)
 *
 *	@return Returns the error code reported by clCreateBuffer().
 */
inline CLresult CCLContextInterface::n_CreateBuffer(cl_mem &r_h_mem,
	size_t n_size, void *p_host_pointer, cl_mem_flags n_flags) const
{
	_ASSERTE(n_flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR));
	// make sure the host pointer is going to be used

	_ASSERTE(!(n_flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) || p_host_pointer);
	// CL_MEM_USE_HOST_PTR is valid only if host_ptr is not NULL
	// CL_MEM_COPY_HOST_PTR is valid only if host_ptr is not NULL

	_ASSERTE(!((n_flags & CL_MEM_ALLOC_HOST_PTR) && (n_flags & CL_MEM_USE_HOST_PTR)));
	// CL_MEM_ALLOC_HOST_PTR and CL_MEM_USE_HOST_PTR are mutually exclusive

	_ASSERTE(!((n_flags & CL_MEM_COPY_HOST_PTR) && (n_flags & CL_MEM_USE_HOST_PTR)));
	// CL_MEM_COPY_HOST_PTR and CL_MEM_USE_HOST_PTR are mutually exclusive.

	cl_int n_error_code;
	cl_mem h_new_mem = clCreateBuffer(m_h_handle, n_flags, n_size, p_host_pointer, &n_error_code);
	r_h_mem = h_new_mem;
	return (CLresult)n_error_code;
}

/**
 *	@brief creates a buffer without a host pointer and stores it in a managed wrapper
 *
 *	@param[in,out] r_mem is the wrapper; its previous contents are released
 *		before the allocation, the new buffer is assigned only on success
 *	@param[in] n_size is size of the buffer
 *	@param[in] n_flags is memory flags
 *
 *	@return Returns the result of the handle-based n_CreateBuffer() overload.
 */
inline CLresult CCLContextInterface::n_CreateBuffer(CCLUniqueMem &r_mem,
	size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/) const
{
	{
		CCLUniqueMem previous_mem;
		r_mem.Swap(previous_mem);
		// previous_mem goes out of scope here and takes the old contents with it
	}
	// if there is memory there, it deletes it first to increase the chances of the allocation succeeding

	cl_mem h_new_mem;
	const CLresult n_create_result = n_CreateBuffer(h_new_mem, n_size, n_flags);
	if(n_create_result == cl_Success)
		r_mem = h_new_mem;
	// on failure the wrapper stays empty

	return n_create_result;
}

/**
 *	@brief creates a buffer with a host pointer and stores it in a managed wrapper
 *
 *	@param[in,out] r_mem is the wrapper; its previous contents are released
 *		before the allocation, the new buffer is assigned only on success
 *	@param[in] n_size is size of the buffer
 *	@param[in] p_host_pointer is the host pointer
 *	@param[in] n_flags is memory flags
 *
 *	@return Returns the result of the handle-based n_CreateBuffer() overload.
 */
inline CLresult CCLContextInterface::n_CreateBuffer(CCLUniqueMem &r_mem,
	size_t n_size, void *p_host_pointer, cl_mem_flags n_flags) const
{
	{
		CCLUniqueMem previous_mem;
		r_mem.Swap(previous_mem);
		// previous_mem goes out of scope here and takes the old contents with it
	}
	// if there is memory there, it deletes it first to increase the chances of the allocation succeeding

	cl_mem h_new_mem;
	const CLresult n_create_result = n_CreateBuffer(h_new_mem, n_size, p_host_pointer, n_flags);
	if(n_create_result == cl_Success)
		r_mem = h_new_mem;
	// on failure the wrapper stays empty

	return n_create_result;
}

/**
 *	@brief creates a buffer without a host pointer
 *
 *	@param[in] n_size is size of the buffer
 *	@param[in] n_flags is memory flags
 *
 *	@return Returns handle of the new buffer on success, null handle on failure
 *		(the error code is lost; use n_CreateBuffer() to get it).
 */
inline cl_mem CCLContextInterface::h_CreateBuffer(size_t n_size, cl_mem_flags n_flags /*= CL_MEM_READ_WRITE*/) const
{
	cl_mem h_new_mem;
	if(n_CreateBuffer(h_new_mem, n_size, n_flags) == CL_SUCCESS)
		return h_new_mem;
	return 0;
}

/**
 *	@brief creates a buffer with a host pointer
 *
 *	@param[in] n_size is size of the buffer
 *	@param[in] p_host_pointer is the host pointer
 *	@param[in] n_flags is memory flags
 *
 *	@return Returns handle of the new buffer on success, null handle on failure
 *		(the error code is lost; use n_CreateBuffer() to get it).
 */
inline cl_mem CCLContextInterface::h_CreateBuffer(size_t n_size, void *p_host_pointer, cl_mem_flags n_flags) const
{
	cl_mem h_new_mem;
	if(n_CreateBuffer(h_new_mem, n_size, p_host_pointer, n_flags) == CL_SUCCESS)
		return h_new_mem;
	return 0;
}

/*
 *								=== ~CCLContextInterface ===
 */

/*
 *								=== CCLUniqueInstance ===
 */

/**
 *	@brief default constructor; creates an empty instance
 *		(null device handle, status cl_Success)
 */
inline CCLUniqueInstance::CCLUniqueInstance()
	:m_n_last_error(cl_Success)
{
	this->m_p_device[0] = 0; // no device yet
}

/**
 *	@brief ownership-transferring copy-constructor
 *	@param[in,out] r_other is the instance to take the contents from
 *		(it is modified despite being passed as const and is left empty)
 */
inline CCLUniqueInstance::CCLUniqueInstance(const CCLUniqueInstance &r_other)
	:m_n_last_error(cl_Success)
{
	this->m_p_device[0] = 0;
	// start as an empty instance

	CCLUniqueInstance &r_source = const_cast<CCLUniqueInstance&>(r_other);
	Swap(r_source); // makes r_other empty
}

/**
 *	@brief ownership-transferring copy operator
 *	@param[in,out] r_other is the instance to take the contents from
 *		(it is modified despite being passed as const and is left empty)
 *	@return Returns reference to this.
 *	@note Self-assignment is safe and leaves the instance unchanged.
 */
inline CCLUniqueInstance &CCLUniqueInstance::operator =(const CCLUniqueInstance &r_other)
{
	CCLUniqueInstance taken;
	taken.Swap(const_cast<CCLUniqueInstance&>(r_other));
	// take the contents of r_other first (makes r_other empty); in case of
	// self-assignment this moves the contents of this to safety, rather than
	// destroying them before they are taken

	Swap(taken);
	// this now holds the contents of r_other, the former contents
	// of this are destroyed along with taken

	return *this;
}

/**
 *	@brief constructor; initializes the instance using n_Init()
 *
 *	@param[in] device_selector is the device selector, passed to n_Init()
 *	@param[in] n_device_type is the device type, passed to n_Init()
 *	@param[in] b_implementation_profile_selection is the implementation
 *		profile selection flag, passed to n_Init()
 *	@param[in] b_stderr_output is the stderr output flag, passed to n_Init()
 *	@param[in] n_queue_options is the queue options, passed to n_Init()
 *
 *	@note The result of n_Init() is stored and can be read using b_Status() or n_Status().
 */
template <class CDeviceSelector>
CCLUniqueInstance::CCLUniqueInstance(CDeviceSelector device_selector, int n_device_type /*= CL_DEVICE_TYPE_GPU*/,
	bool b_implementation_profile_selection /*= false*/, bool b_stderr_output /*= true*/,
	int n_queue_options /*= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE*/)
{
	this->m_p_device[0] = 0;
	// no device yet

	this->m_n_last_error = n_Init(device_selector, n_device_type,
		n_queue_options, b_implementation_profile_selection, b_stderr_output);
	// note that n_Init() takes the queue options before the two flags
}

inline bool CCLUniqueInstance::b_Status() const
{
	return m_n_last_error == CL_SUCCESS;
}

/**
 *	@brief gets the stored error code
 *	@return Returns the stored error code.
 */
inline CLresult CCLUniqueInstance::n_Status() const
{
	return this->m_n_last_error;
}

/**
 *	@brief gets the number of devices
 *	@return Always returns 1.
 */
inline size_t CCLUniqueInstance::n_Device_Num() const
{
	const size_t n_device_num = 1; // so far there is only a single one
	return n_device_num;
}

/**
 *	@brief gets the list of devices
 *	@return Returns const pointer to the stored device handles
 *		(n_Device_Num() reports a single one).
 */
inline const cl_device_id *CCLUniqueInstance::p_Device() const
{
	return this->m_p_device;
}

/**
 *	@brief gets the context handle
 *	@return Returns the value of CCLUniqueContext::h_Get().
 */
inline cl_context CCLUniqueInstance::h_Context() const
{
	const cl_context h_context = CCLUniqueContext::h_Get();
	return h_context;
}

/**
 *	@brief gets a device handle
 *	@param[in] n_index is zero-based index of the device (must be zero)
 *	@return Returns the first stored device handle.
 */
inline cl_device_id CCLUniqueInstance::h_Device(size_t UNUSED(n_index)) const
{
	_ASSERTE(!n_index); // so far there is only a single one
	return this->m_p_device[0];
}

/**
 *	@brief gets a command queue handle
 *	@param[in] n_index is zero-based index of the command queue (must be zero)
 *	@return Returns the first stored command queue, converted to cl_command_queue.
 */
inline cl_command_queue CCLUniqueInstance::h_Command_Queue(size_t UNUSED(n_index)) const
{
	_ASSERTE(!n_index); // so far there is only a single one
	return this->m_p_cmd_queue[0];
}

/**
 *	@brief gets a command queue interface
 *	@param[in] n_index is zero-based index of the command queue (must be zero)
 *	@return Returns const reference to the first stored command queue.
 */
inline const CCLCommandQueueInterface &CCLUniqueInstance::operator [](size_t UNUSED(n_index)) const
{
	_ASSERTE(!n_index); // so far there is only a single one
	return this->m_p_cmd_queue[0];
}

#if 0
/**
 *	@brief initializes the instance (disabled code, kept for reference only)
 *
 *	Creates the context, selects a device using the selector
 *	and creates a command queue for it.
 *
 *	@param[in] device_selector is the device selector
 *	@param[in] n_device_type is the device type
 *	@param[in] b_implementation_profile_selection is passed to CCLUtils::n_OpenCL_Init()
 *	@param[in] b_stderr_output enables printing of error messages to stderr
 *
 *	@return Returns CL_SUCCESS on success or an error code on failure.
 *
 *	@note NOTE(review): this refers to m_h_context, m_h_device and m_h_cmd_queue,
 *		while the enabled code in this file uses m_p_device[0] and m_p_cmd_queue[0];
 *		this presumably predates the current members.
 */
template <class CDeviceSelector>
int CCLUniqueInstance::n_Init(CDeviceSelector device_selector, int n_device_type /*= CL_DEVICE_TYPE_GPU*/,
	bool b_implementation_profile_selection /*= false*/, bool b_stderr_output /*= true*/)
{
	int n_result;
	if((n_result = CCLUtils::n_OpenCL_Init(&m_h_context, n_device_type,
	   b_implementation_profile_selection)) != CL_SUCCESS) {
		if(b_stderr_output)
			fprintf(stderr, "error: failed to initialize OpenCL (%d)\n", n_result);
		return n_result;
	}
	// create OpenCL context

	// NOTE(review): if n_Get_Best_DeviceId() returns size_t (as the commented-out
	// definition in this file does, with size_t(-1) on failure), the comparison
	// below is never true - confirm before enabling this code
	if(CCLUtils::n_Get_Best_DeviceId(&m_h_device, m_h_context, device_selector) < 0) {
		n_result = CL_DEVICE_NOT_FOUND; // might cause confusion
		if(b_stderr_output)
			fprintf(stderr, "error: failed to get handle of an OpenCL device\n");
		return n_result;
	}
	// get fastest device

	m_h_cmd_queue = clCreateCommandQueue(m_h_context, m_h_device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &n_result);
	if(n_result != CL_SUCCESS) {
		if(b_stderr_output)
			fprintf(stderr, "error: failed to create OpenCL command queue (%d)\n", n_result);
		return n_result;
	}
	// create command queue

	return CL_SUCCESS;
}
#endif // 0

/**
 *	@brief swaps the contents of two instances
 *	@param[in,out] r_other is the instance to swap with
 */
inline void CCLUniqueInstance::Swap(CCLUniqueInstance &r_other)
{
	CCLUniqueContext::Swap(r_other);
	// swap the base class part

	m_p_cmd_queue[0].Swap(r_other.m_p_cmd_queue[0]);
	// swap the command queue

	std::swap(m_p_device[0], r_other.m_p_device[0]);
	std::swap(m_n_last_error, r_other.m_n_last_error);
	// swap the device handle and the stored result code
}

namespace std {

/**
 *	@brief exchanges the contents of two managed OpenCL instances
 *
 *	@param[in,out] r_first is the first managed OpenCL instance
 *	@param[in,out] r_second is the second managed OpenCL instance
 *
 *	@note This forwards to CCLUniqueInstance::Swap().
 */
inline void swap(CCLUniqueInstance &r_first, CCLUniqueInstance &r_second)
{
	CCLUniqueInstance &r_a = r_first, &r_b = r_second;
	r_a.Swap(r_b);
}

} // ~std

/*
 *								=== ~CCLUniqueInstance ===
 */

/*
 *								=== CCLUniqueProgram ===
 */

/**
 *	@brief default constructor; creates an object with null program handle,
 *		success status and zero compile flags
 */
inline CCLUniqueProgram::CCLUniqueProgram()
	:m_h_program(0),
	m_n_last_result(cl_Success),
	m_n_compile_flags(0)
{
	// nothing else to do
}

/**
 *	@brief copy-constructor; takes over the state of the other program
 *	@param[in,out] r_other is the program to take the state from (although it is
 *		passed as const, it is left empty)
 */
inline CCLUniqueProgram::CCLUniqueProgram(const CCLUniqueProgram &r_other)
	:m_h_program(0),
	m_n_last_result(cl_Success),
	m_n_compile_flags(0)
{
	CCLUniqueProgram &r_source = const_cast<CCLUniqueProgram&>(r_other);
	Swap(r_source);
	// this starts empty, so after the swap r_other is the empty one
}

/**
 *	@brief constructor; builds the program from source code given as a non-const string
 *
 *	@param[in] h_context is OpenCL context handle
 *	@param[in] p_s_source_code is source code (passed on as const char*)
 *	@param[in] t_tag is tag for overload selection (unused)
 *	@param[in] p_s_compiler_options is passed on to the const char* constructor
 *	@param[in] p_s_cache_file is passed on to the const char* constructor
 *	@param[in] n_max_cache_size is passed on to the const char* constructor
 *
 *	@note This only delegates to the constructor taking const char* source code.
 */
inline CCLUniqueProgram::CCLUniqueProgram(cl_context h_context, char *p_s_source_code,
	TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0),
	m_n_last_result(cl_Success),
	m_n_compile_flags(0)
{
	const char *p_s_const_source_code = p_s_source_code;
	// the const pointer selects the constructor that takes const char*

	CCLUniqueProgram built_program(h_context, p_s_const_source_code,
		from_source_code, p_s_compiler_options, p_s_cache_file, n_max_cache_size);
	Swap(built_program);
	// build in a temporary and take over its state
}

/**
 *	@brief constructor; builds the program from source code given as a non-const string
 *
 *	@param[in] r_instance is OpenCL instance
 *	@param[in] p_s_source_code is source code (passed on as const char*)
 *	@param[in] t_tag is tag for overload selection (unused)
 *	@param[in] p_s_compiler_options is passed on to the const char* constructor
 *	@param[in] p_s_cache_file is passed on to the const char* constructor
 *	@param[in] n_max_cache_size is passed on to the const char* constructor
 *
 *	@note This only delegates to the constructor taking const char* source code.
 */
inline CCLUniqueProgram::CCLUniqueProgram(CCLUniqueInstance &r_instance, char *p_s_source_code,
	TBuildFromSource_Tag UNUSED(t_tag), const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0),
	m_n_last_result(cl_Success),
	m_n_compile_flags(0)
{
	const char *p_s_const_source_code = p_s_source_code;
	// the const pointer selects the constructor that takes const char*

	CCLUniqueProgram built_program(r_instance, p_s_const_source_code,
		from_source_code, p_s_compiler_options, p_s_cache_file, n_max_cache_size);
	Swap(built_program);
	// build in a temporary and take over its state
}

#if 0
// disabled constructors that build a program from compressed source code; never compiled

// gets the source code from r_source_code.p_Data() and builds it in the given context;
// if p_Data() returns null, the result code is set to CL_OUT_OF_HOST_MEMORY
template <class CCompressedSourceCode>
inline CCLUniqueProgram::CCLUniqueProgram(cl_context h_context,
	const CCompressedSourceCode &r_source_code, TBuildFromCompressedSource_Tag UNUSED(t_tag),
	const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_last_result(CL_SUCCESS), m_n_compile_flags(0)
{
	const char *p_s_source_code;
	if((p_s_source_code = r_source_code.p_Data())) { // decompress
		CCLUniqueProgram program(h_context, p_s_source_code,
			from_source_code, p_s_compiler_options, p_s_cache_file, n_max_cache_size);
		Swap(program);
		if(r_source_code.b_Dynamic())
			delete[] const_cast<char*>(p_s_source_code); // free memory
	} else
		m_n_last_result = CL_OUT_OF_HOST_MEMORY; // well, ...
}

// the same as above, only taking CCLUniqueInstance instead of a context handle
template <class CCompressedSourceCode>
inline CCLUniqueProgram::CCLUniqueProgram(CCLUniqueInstance &r_instance,
	const CCompressedSourceCode &r_source_code, TBuildFromCompressedSource_Tag UNUSED(t_tag),
	const char *p_s_compiler_options /*= ""*/,
	const char *p_s_cache_file /*= "%copykernelname%"*/, int n_max_cache_size /*= 32*/)
	:m_h_program(0), m_n_last_result(CL_SUCCESS), m_n_compile_flags(0)
{
	const char *p_s_source_code;
	if((p_s_source_code = r_source_code.p_Data())) { // decompress
		CCLUniqueProgram program(r_instance, p_s_source_code,
			from_source_code, p_s_compiler_options, p_s_cache_file, n_max_cache_size);
		Swap(program);
		if(r_source_code.b_Dynamic())
			delete[] const_cast<char*>(p_s_source_code); // free memory
	} else
		m_n_last_result = CL_OUT_OF_HOST_MEMORY; // well, ...
}
#endif // 0

/**
 *	@brief assignment operator; takes over the state of the other program
 *	@param[in,out] r_other is the program to take the state from (although it is
 *		passed as const, it is left empty after the assignment)
 *	@return Returns reference to this.
 *	@note Assigning a program to itself has no effect.
 */
inline CCLUniqueProgram &CCLUniqueProgram::operator =(const CCLUniqueProgram &r_other)
{
	if(this == &r_other)
		return *this;
	// self-assignment must not reach the swaps below; the first one would hand the
	// state of this object to a temporary and the state would be destroyed with it

	{
		CCLUniqueProgram empty;
		Swap(empty);
	}
	// takes care of destruction and initializes this to empty

	Swap(const_cast<CCLUniqueProgram&>(r_other));
	// makes r_other empty

	return *this;
}

/**
 *	@brief gets the status of this program
 *	@return Returns true if the stored result code equals CL_SUCCESS, otherwise returns false.
 */
inline bool CCLUniqueProgram::b_Status() const
{
	const bool b_success = (m_n_last_result == CL_SUCCESS);
	return b_success;
}

/**
 *	@brief gets the stored result code of this program
 *	@return Returns the value of the last result member.
 */
inline CLresult CCLUniqueProgram::n_Status() const
{
	const CLresult n_stored_result = m_n_last_result;
	return n_stored_result;
}

/**
 *	@brief gets the compile flags of this program
 *	@return Returns the value of the compile flags member.
 */
inline int CCLUniqueProgram::n_StatusWord() const
{
	const int n_status_word = m_n_compile_flags;
	return n_status_word;
}

inline void CCLUniqueProgram::Dump_StatusWord() const
{
	CCLProgramCompiler::Dump_StatusWord(m_n_compile_flags);
}

/**
 *	@brief swaps the contents of two programs
 *	@param[in,out] r_other is the program to swap with
 */
inline void CCLUniqueProgram::Swap(CCLUniqueProgram &r_other)
{
	std::swap(m_h_program, r_other.m_h_program);
	// swap the program handle

	std::swap(m_n_last_result, r_other.m_n_last_result);
	std::swap(m_n_compile_flags, r_other.m_n_compile_flags);
	// swap the stored result code and the compile flags
}

namespace std {

/**
 *	@brief exchanges the contents of two managed OpenCL programs
 *
 *	@param[in,out] r_first is the first managed OpenCL program
 *	@param[in,out] r_second is the second managed OpenCL program
 *
 *	@note This forwards to CCLUniqueProgram::Swap().
 */
inline void swap(CCLUniqueProgram &r_first, CCLUniqueProgram &r_second)
{
	CCLUniqueProgram &r_a = r_first, &r_b = r_second;
	r_a.Swap(r_b);
}

} // ~std

/*
 *								=== ~CCLUniqueProgram ===
 */

/*
 *								=== CCLLocalMem ===
 */

/**
 *	@brief constructor; stores the size
 *	@param[in] n_size is the size to store (returned by n_Size())
 */
inline CCLLocalMem::CCLLocalMem(size_t n_size)
	:m_n_size(n_size)
{
	// nothing else to do
}

/**
 *	@brief gets the stored size
 *	@return Returns the size passed to the constructor.
 */
inline size_t CCLLocalMem::n_Size() const
{
	const size_t n_stored_size = m_n_size;
	return n_stored_size;
}

/*
 *								=== ~CCLLocalMem ===
 */

/*
 *								=== CCLArgLoader ===
 */

/**
 *	@brief constructor; stores the kernel handle and the result so far
 *
 *	@tparam n_index is zero-based index of the next argument to be set
 *
 *	@param[in] h_func is kernel handle
 *	@param[in] n_result is the result of setting the preceding arguments
 */
template <const int n_index>
inline CCLArgLoader<n_index>::CCLArgLoader(cl_kernel h_func, CLresult n_result /*= cl_Success*/)
	:m_h_func(h_func),
	m_n_result(n_result)
{
	// nothing else to do
}

/**
 *	@brief gets the result of setting the arguments
 *	@tparam n_index is zero-based index of the next argument to be set
 *	@return Returns the stored result code.
 */
template <const int n_index>
inline CLresult CCLArgLoader<n_index>::n_Result() const
{
	const CLresult n_stored_result = m_n_result;
	return n_stored_result;
}

/**
 *	@brief gets the argument count
 *	@tparam n_index is zero-based index of the next argument to be set
 *	@return Returns n_index + 1.
 */
template <const int n_index>
inline int CCLArgLoader<n_index>::n_Count() const
{
	const int n_count = n_index + 1;
	return n_count;
}

/**
 *	@brief sets an integer kernel argument at index n_index
 *
 *	@tparam n_index is zero-based index of the argument being set
 *
 *	@param[in] n_value is value of the argument
 *
 *	@return Returns loader for the next argument, carrying the result so far.
 *
 *	@note The argument is only set if the result so far equals CL_SUCCESS.
 */
template <const int n_index>
inline CCLArgLoader<n_index + 1> CCLArgLoader<n_index>::operator ,(int n_value)
{
	typedef CCLArgLoader<n_index + 1> TNextLoader;

	if(m_n_result == CL_SUCCESS) {
		const void *p_arg_data = &n_value;
		m_n_result = (CLresult)clSetKernelArg(m_h_func, n_index, sizeof(int), p_arg_data);
	}
	// an earlier failure is kept and handed on unchanged

	return TNextLoader(m_h_func, m_n_result);
}

/**
 *	@brief sets a float kernel argument at index n_index
 *
 *	@tparam n_index is zero-based index of the argument being set
 *
 *	@param[in] f_value is value of the argument
 *
 *	@return Returns loader for the next argument, carrying the result so far.
 *
 *	@note The argument is only set if the result so far equals CL_SUCCESS.
 */
template <const int n_index>
inline CCLArgLoader<n_index + 1> CCLArgLoader<n_index>::operator ,(float f_value)
{
	typedef CCLArgLoader<n_index + 1> TNextLoader;

	if(m_n_result == CL_SUCCESS) {
		const void *p_arg_data = &f_value;
		m_n_result = (CLresult)clSetKernelArg(m_h_func, n_index, sizeof(float), p_arg_data);
	}
	// an earlier failure is kept and handed on unchanged

	return TNextLoader(m_h_func, m_n_result);
}

/**
 *	@brief sets a memory object kernel argument at index n_index
 *
 *	@tparam n_index is zero-based index of the argument being set
 *
 *	@param[in] p_value is memory object handle to pass as the argument
 *
 *	@return Returns loader for the next argument, carrying the result so far.
 *
 *	@note The argument is only set if the result so far equals CL_SUCCESS.
 */
template <const int n_index>
inline CCLArgLoader<n_index + 1> CCLArgLoader<n_index>::operator ,(cl_mem p_value)
{
	typedef CCLArgLoader<n_index + 1> TNextLoader;

	if(m_n_result == CL_SUCCESS) {
		const void *p_arg_data = &p_value; // address of the handle, not the handle itself
		m_n_result = (CLresult)clSetKernelArg(m_h_func, n_index, sizeof(cl_mem), p_arg_data);
	}
	// an earlier failure is kept and handed on unchanged

	return TNextLoader(m_h_func, m_n_result);
}

/**
 *	@brief sets a kernel argument at index n_index using only a size and null data pointer
 *
 *	@tparam n_index is zero-based index of the argument being set
 *
 *	@param[in] t_local_mem_cfg is local memory configuration (its n_Size() is used
 *		as the argument size)
 *
 *	@return Returns loader for the next argument, carrying the result so far.
 *
 *	@note The argument is only set if the result so far equals CL_SUCCESS.
 */
template <const int n_index>
inline CCLArgLoader<n_index + 1> CCLArgLoader<n_index>::operator ,(CCLLocalMem t_local_mem_cfg)
{
	typedef CCLArgLoader<n_index + 1> TNextLoader;

	if(m_n_result == CL_SUCCESS) {
		const size_t n_local_size = t_local_mem_cfg.n_Size();
		m_n_result = (CLresult)clSetKernelArg(m_h_func, n_index, n_local_size, NULL);
	}
	// an earlier failure is kept and handed on unchanged

	return TNextLoader(m_h_func, m_n_result);
}

/*
 *								=== ~CCLArgLoader ===
 */

/*
 *								=== CCLKernelCall ===
 */

/**
 *	@brief constructor; stores parameters of a 1D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] h_cmd_queue is command queue handle
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	cl_command_queue h_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_block_size_x)
	:m_n_result(n_loader_result),
	m_h_cmd_queue(h_cmd_queue),
	m_h_kernel(h_kernel),
	m_n_dimension(1)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_local_work_size[0] = n_block_size_x;
	m_p_global_work_size[0] = n_work_size_x;
	// only the first element of each array is set for a 1D call
}

/**
 *	@brief constructor; stores parameters of a 2D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] h_cmd_queue is command queue handle
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 *	@param[in] n_block_size_y is local work size in the y dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	cl_command_queue h_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_work_size_y,
	size_t n_block_size_x, size_t n_block_size_y)
	:m_n_result(n_loader_result),
	m_h_cmd_queue(h_cmd_queue),
	m_h_kernel(h_kernel),
	m_n_dimension(2)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_local_work_size[0] = n_block_size_x;
	m_p_local_work_size[1] = n_block_size_y;
	// local (block) size

	m_p_global_work_size[0] = n_work_size_x;
	m_p_global_work_size[1] = n_work_size_y;
	// global work size
}

/**
 *	@brief constructor; stores parameters of a 3D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] h_cmd_queue is command queue handle
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 *	@param[in] n_block_size_y is local work size in the y dimension
 *	@param[in] n_block_size_z is local work size in the z dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	cl_command_queue h_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_work_size_y, size_t n_work_size_z,
	size_t n_block_size_x, size_t n_block_size_y, size_t n_block_size_z)
	:m_n_result(n_loader_result), m_h_cmd_queue(h_cmd_queue), m_h_kernel(h_kernel),
	m_n_dimension(3) // must match the three work sizes stored below (was 2, so z was never used)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_global_work_size[0] = n_work_size_x;
	m_p_global_work_size[1] = n_work_size_y;
	m_p_global_work_size[2] = n_work_size_z;
	// global work size

	m_p_local_work_size[0] = n_block_size_x;
	m_p_local_work_size[1] = n_block_size_y;
	m_p_local_work_size[2] = n_block_size_z;
	// local (block) size
}

/**
 *	@brief constructor; stores parameters of a 1D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] r_cmd_queue is command queue object (its h_Handle() is stored)
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_block_size_x)
	:m_n_result(n_loader_result),
	m_h_cmd_queue(r_cmd_queue.h_Handle()),
	m_h_kernel(h_kernel),
	m_n_dimension(1)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_local_work_size[0] = n_block_size_x;
	m_p_global_work_size[0] = n_work_size_x;
	// only the first element of each array is set for a 1D call
}

/**
 *	@brief constructor; stores parameters of a 2D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] r_cmd_queue is command queue object (its h_Handle() is stored)
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 *	@param[in] n_block_size_y is local work size in the y dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_work_size_y,
	size_t n_block_size_x, size_t n_block_size_y)
	:m_n_result(n_loader_result),
	m_h_cmd_queue(r_cmd_queue.h_Handle()),
	m_h_kernel(h_kernel),
	m_n_dimension(2)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_local_work_size[0] = n_block_size_x;
	m_p_local_work_size[1] = n_block_size_y;
	// local (block) size

	m_p_global_work_size[0] = n_work_size_x;
	m_p_global_work_size[1] = n_work_size_y;
	// global work size
}

/**
 *	@brief constructor; stores parameters of a 3D kernel call
 *
 *	@param[in] n_loader_result is the result of setting the kernel arguments
 *	@param[in] r_cmd_queue is command queue object (its h_Handle() is stored)
 *	@param[in] h_kernel is kernel handle
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is local work size in the x dimension
 *	@param[in] n_block_size_y is local work size in the y dimension
 *	@param[in] n_block_size_z is local work size in the z dimension
 */
inline CCLKernelCall::CCLKernelCall(CLresult n_loader_result,
	const CCLCommandQueueInterface &r_cmd_queue, cl_kernel h_kernel,
	size_t n_work_size_x, size_t n_work_size_y, size_t n_work_size_z,
	size_t n_block_size_x, size_t n_block_size_y, size_t n_block_size_z)
	:m_n_result(n_loader_result), m_h_cmd_queue(r_cmd_queue.h_Handle()), m_h_kernel(h_kernel),
	m_n_dimension(3) // must match the three work sizes stored below (was 2, so z was never used)
#ifdef _DEBUG
	, m_b_called(false) // !!
#endif // _DEBUG
{
	m_p_global_work_size[0] = n_work_size_x;
	m_p_global_work_size[1] = n_work_size_y;
	m_p_global_work_size[2] = n_work_size_z;
	// global work size

	m_p_local_work_size[0] = n_block_size_x;
	m_p_local_work_size[1] = n_block_size_y;
	m_p_local_work_size[2] = n_block_size_z;
	// local (block) size
}

/**
 *	@brief enqueues the kernel, with no wait list and no completion event
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CCLKernelCall::operator CLresult()
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG

	if(m_n_result == CL_SUCCESS) {
		const size_t *p_no_offset = 0;
		const cl_event *p_no_wait_list = 0;
		cl_event *p_no_event = 0;
		// no global offset, nothing to wait for, no event requested

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue, m_h_kernel,
			m_n_dimension, p_no_offset, m_p_global_work_size, m_p_local_work_size,
			0, p_no_wait_list, p_no_event);
	}

	return m_n_result;
}

/**
 *	@brief enqueues the kernel with a wait list and no completion event
 *
 *	@param[in] n_wait_for_event_num is number of events in the wait list
 *	@param[in] p_wait_for_event is the wait list
 *
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CLresult CCLKernelCall::After(cl_uint n_wait_for_event_num, const cl_event *p_wait_for_event)
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG

	if(m_n_result == CL_SUCCESS) {
		const size_t *p_no_offset = 0;
		cl_event *p_no_event = 0;
		// no global offset, no event requested

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue, m_h_kernel,
			m_n_dimension, p_no_offset, m_p_global_work_size, m_p_local_work_size,
			n_wait_for_event_num, p_wait_for_event, p_no_event);
	}

	return m_n_result;
}

/**
 *	@brief enqueues the kernel with no wait list and requests a completion event
 *
 *	@param[out] p_finished_event is pointer passed to clEnqueueNDRangeKernel()
 *		as the event argument
 *
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CLresult CCLKernelCall::GetEvent(cl_event *p_finished_event)
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG

	if(m_n_result == CL_SUCCESS) {
		const size_t *p_no_offset = 0;
		const cl_event *p_no_wait_list = 0;
		// no global offset, nothing to wait for

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue, m_h_kernel,
			m_n_dimension, p_no_offset, m_p_global_work_size, m_p_local_work_size,
			0, p_no_wait_list, p_finished_event);
	}

	return m_n_result;
}

/**
 *	@brief enqueues the kernel with no wait list and stores the completion event
 *
 *	@param[out] r_finished_event is assigned the event handle (it is assigned a null
 *		handle if the enqueue did not write one); not modified if the earlier
 *		result was not CL_SUCCESS
 *
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CLresult CCLKernelCall::GetEvent(CCLUniqueEvent &r_finished_event)
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG
	if(m_n_result == CL_SUCCESS) {
		cl_event h_finished_event = 0;
		// start with null so that a failed enqueue cannot leave an indeterminate
		// handle in r_finished_event

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue,
			m_h_kernel, m_n_dimension, 0, m_p_global_work_size, m_p_local_work_size,
			0, 0, &h_finished_event);
		r_finished_event = h_finished_event; // destroys the previously owned event correctly
	}
	return m_n_result;
}

/**
 *	@brief enqueues the kernel with a wait list and requests a completion event
 *
 *	@param[in] n_wait_for_event_num is number of events in the wait list
 *	@param[in] p_wait_for_event is the wait list
 *	@param[out] p_finished_event is pointer passed to clEnqueueNDRangeKernel()
 *		as the event argument
 *
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CLresult CCLKernelCall::WithEvents(cl_uint n_wait_for_event_num /*= 0*/,
	const cl_event *p_wait_for_event /*= 0*/, cl_event *p_finished_event /*= 0*/)
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG

	if(m_n_result == CL_SUCCESS) {
		const size_t *p_no_offset = 0;
		// no global offset

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue, m_h_kernel,
			m_n_dimension, p_no_offset, m_p_global_work_size, m_p_local_work_size,
			n_wait_for_event_num, p_wait_for_event, p_finished_event);
	}

	return m_n_result;
}

/**
 *	@brief enqueues the kernel with a wait list and stores the completion event
 *
 *	@param[in] n_wait_for_event_num is number of events in the wait list
 *	@param[in] p_wait_for_event is the wait list
 *	@param[out] r_finished_event is assigned the event handle (it is assigned a null
 *		handle if the enqueue did not write one); not modified if the earlier
 *		result was not CL_SUCCESS
 *
 *	@return Returns the result of the enqueue, or the earlier result
 *		if it was not CL_SUCCESS (the kernel is not enqueued in that case).
 */
inline CLresult CCLKernelCall::WithEvents(cl_uint n_wait_for_event_num /*= 0*/,
	const cl_event *p_wait_for_event /*= 0*/, CCLUniqueEvent &r_finished_event)
{
#ifdef _DEBUG
	m_b_called = true;
#endif // _DEBUG
	if(m_n_result == CL_SUCCESS) {
		cl_event h_finished_event = 0;
		// start with null so that a failed enqueue cannot leave an indeterminate
		// handle in r_finished_event

		m_n_result = (CLresult)clEnqueueNDRangeKernel(m_h_cmd_queue,
			m_h_kernel, m_n_dimension, 0, m_p_global_work_size, m_p_local_work_size,
			n_wait_for_event_num, p_wait_for_event, &h_finished_event);
		r_finished_event = h_finished_event; // destroys the previously owned event correctly
	}
	return m_n_result;
}

/*
 *								=== ~CCLKernelCall ===
 */

#if 0
// disabled implementation of CUniqueCLEvent; it is never compiled
// NOTE(review): the compiled code in this file uses CCLUniqueEvent instead - presumably
// this class was superseded by it; confirm before removing

/*
 *								=== CUniqueCLEvent ===
 */

// stores the given event handle (can be null)
inline CUniqueCLEvent::CUniqueCLEvent(cl_event h_event /*= 0*/)
	:m_h_event(h_event)
{}

// takes the handle of the other object and sets the other handle to null
// NOTE(review): this writes to a member of a const reference, which only compiles
// if m_h_event is declared mutable - confirm in the class declaration
inline CUniqueCLEvent::CUniqueCLEvent(const CUniqueCLEvent &r_other)
	:m_h_event(r_other.m_h_event)
{
	r_other.m_h_event = 0;
}

// releases the event, if the handle is not null
inline CUniqueCLEvent::~CUniqueCLEvent()
{
	if(m_h_event)
		clReleaseEvent(m_h_event);
}

// releases the currently held event and takes the handle of the other object
// NOTE(review): there is no check for self-assignment here
inline CUniqueCLEvent &CUniqueCLEvent::operator =(const CUniqueCLEvent &r_other)
{
	CUniqueCLEvent::~CUniqueCLEvent(); // destroy this first
	m_h_event = r_other.m_h_event; // grab the other event
	r_other.m_h_event = 0;
	return *this;
}

// gets pointer to the stored handle (allows writing to it from the outside)
inline CUniqueCLEvent::operator cl_event *()
{
	return &m_h_event;
}

/*
 *								=== ~CUniqueCLEvent ===
 */
#endif // 0

#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief expands to CL_SUCCESS as there are no parameters to pass to the kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel; unused)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs0(h_func) CL_SUCCESS

/**
 *	@brief calls all OpenCL functions required to pass 1 parameter to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs1(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 2 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs2(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 3 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs3(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 4 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs4(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 5 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs5(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 6 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs6(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 7 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs7(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 8 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs8(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 9 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs9(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 10 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs10(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 11 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs11(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 12 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs12(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 13 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs13(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 14 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs14(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 15 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs15(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 16 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of function arguments (int, float or cl_mem)
 *	@note This forwards to clSetKernelArgs(); see its documentation for more details.
 */
#define clSetKernelArgs16(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@def clCall1D0
 *	@brief calls a 1D kernel with no arguments
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 */
#define clCall1D0(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x) \
	(CLresult)CCLKernelCall(CL_SUCCESS, (h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D1
 *	@brief sets arguments of a 1D kernel with 1 argument and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D1(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D2
 *	@brief sets arguments of a 1D kernel with 2 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D2(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D3
 *	@brief sets arguments of a 1D kernel with 3 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D3(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D4
 *	@brief sets arguments of a 1D kernel with 4 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D4(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D5
 *	@brief sets arguments of a 1D kernel with 5 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D5(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D6
 *	@brief sets arguments of a 1D kernel with 6 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D6(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D7
 *	@brief sets arguments of a 1D kernel with 7 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D7(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D8
 *	@brief sets arguments of a 1D kernel with 8 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D8(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D9
 *	@brief sets arguments of a 1D kernel with 9 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D9(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D10
 *	@brief sets arguments of a 1D kernel with 10 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D10(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D11
 *	@brief sets arguments of a 1D kernel with 11 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D11(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D12
 *	@brief sets arguments of a 1D kernel with 12 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D12(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D13
 *	@brief sets arguments of a 1D kernel with 13 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D13(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D14
 *	@brief sets arguments of a 1D kernel with 14 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D14(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D15
 *	@brief sets arguments of a 1D kernel with 15 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D15(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D16
 *	@brief sets arguments of a 1D kernel with 16 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall1D16(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall2D0
 *	@brief calls a 2D kernel with no arguments
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 */
#define clCall2D0(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y) \
	(CLresult)CCLKernelCall(CL_SUCCESS, \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D1
 *	@brief sets arguments of a 2D kernel with 1 argument and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... is the kernel function argument (int, float or cl_mem)
 */
#define clCall2D1(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D2
 *	@brief sets arguments of a 2D kernel with 2 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D2(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D3
 *	@brief sets arguments of a 2D kernel with 3 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D3(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D4
 *	@brief sets arguments of a 2D kernel with 4 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D4(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D5
 *	@brief sets arguments of a 2D kernel with 5 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D5(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D6
 *	@brief sets arguments of a 2D kernel with 6 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D6(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D7
 *	@brief sets arguments of a 2D kernel with 7 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D7(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D8
 *	@brief sets arguments of a 2D kernel with 8 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D8(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D9
 *	@brief sets arguments of a 2D kernel with 9 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D9(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D10
 *	@brief sets arguments of a 2D kernel with 10 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D10(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D11
 *	@brief sets arguments of a 2D kernel with 11 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D11(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D12
 *	@brief sets arguments of a 2D kernel with 12 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D12(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D13
 *	@brief sets arguments of a 2D kernel with 13 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D13(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D14
 *	@brief sets arguments of a 2D kernel with 14 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D14(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D15
 *	@brief sets arguments of a 2D kernel with 15 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D15(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D16
 *	@brief sets arguments of a 2D kernel with 16 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y-dimension sizes
 *	are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall2D16(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall3D0
 *	@brief calls a 3D kernel that takes no arguments
 *
 *	No kernel arguments are set; CL_SUCCESS is passed to CCLKernelCall in place of
 *	an argument-setting result. The resulting expression is cast to CLresult.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to call
 *	@param[in] n_work_size_x is the global work size in the x dimension
 *	@param[in] n_work_size_y is the global work size in the y dimension
 *	@param[in] n_work_size_z is the global work size in the z dimension
 *	@param[in] n_block_size_x is the thread block size in the x dimension
 *	@param[in] n_block_size_y is the thread block size in the y dimension
 *	@param[in] n_block_size_z is the thread block size in the z dimension
 */
#define clCall3D0(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z) \
	(CLresult)CCLKernelCall( \
		CL_SUCCESS, \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), \
		(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D1
 *	@brief sets arguments of a 3D kernel with 1 argument and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... is the kernel function argument (int, float or cl_mem)
 */
#define clCall3D1(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D2
 *	@brief sets arguments of a 3D kernel with 2 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D2(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D3
 *	@brief sets arguments of a 3D kernel with 3 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D3(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D4
 *	@brief sets arguments of a 3D kernel with 4 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D4(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D5
 *	@brief sets arguments of a 3D kernel with 5 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D5(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D6
 *	@brief sets arguments of a 3D kernel with 6 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D6(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D7
 *	@brief sets arguments of a 3D kernel with 7 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D7(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D8
 *	@brief sets arguments of a 3D kernel with 8 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D8(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D9
 *	@brief sets arguments of a 3D kernel with 9 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D9(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D10
 *	@brief sets arguments of a 3D kernel with 10 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D10(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D11
 *	@brief sets arguments of a 3D kernel with 11 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D11(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D12
 *	@brief sets arguments of a 3D kernel with 12 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D12(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D13
 *	@brief sets arguments of a 3D kernel with 13 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D13(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D14
 *	@brief sets arguments of a 3D kernel with 14 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D14(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D15
 *	@brief sets arguments of a 3D kernel with 15 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D15(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D16
 *	@brief sets arguments of a 3D kernel with 16 arguments and calls it
 *
 *	Expands to a CCLKernelCall expression cast to CLresult. The y- and z-dimension
 *	sizes are explicit macro parameters, they are not part of the kernel argument list.
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 */
#define clCall3D16(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall1D0Ex
 *	@brief builds the call object for a 1D kernel that takes no arguments
 *
 *	No kernel arguments are set; CL_SUCCESS is passed to CCLKernelCall in place of
 *	an argument-setting result. The CCLKernelCall expression is not cast.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to call
 *	@param[in] n_work_size_x is the global work size in the x dimension
 *	@param[in] n_block_size_x is the thread block size in the x dimension
 */
#define clCall1D0Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x) \
	CCLKernelCall( \
		CL_SUCCESS, \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D1Ex
 *	@brief binds 1 argument to a 1D kernel and returns the call object
 *
 *	The value of clSetKernelArgs1() is handed to CCLKernelCall together with the
 *	queue, the kernel and both sizes; the CCLKernelCall expression is not cast.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to call
 *	@param[in] n_work_size_x is the global work size in the x dimension
 *	@param[in] n_block_size_x is the thread block size in the x dimension
 *	@param[in] ... is the kernel function argument (int, float or cl_mem)
 */
#define clCall1D1Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs1((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D2Ex
 *	@brief binds 2 arguments to a 1D kernel and returns the call object
 *
 *	The value of clSetKernelArgs2() is handed to CCLKernelCall together with the
 *	queue, the kernel and both sizes; the CCLKernelCall expression is not cast.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to call
 *	@param[in] n_work_size_x is the global work size in the x dimension
 *	@param[in] n_block_size_x is the thread block size in the x dimension
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 */
#define clCall1D2Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs2((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D3Ex
 *	@brief binds 3 arguments to a 1D kernel and returns the call object
 *
 *	The value of clSetKernelArgs3() is handed to CCLKernelCall together with the
 *	queue, the kernel and both sizes; the CCLKernelCall expression is not cast.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to call
 *	@param[in] n_work_size_x is the global work size in the x dimension
 *	@param[in] n_block_size_x is the thread block size in the x dimension
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 */
#define clCall1D3Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs3((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D4Ex
 *	@brief binds 4 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs4() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D4Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs4((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D5Ex
 *	@brief binds 5 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs5() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D5Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs5((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D6Ex
 *	@brief binds 6 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs6() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D6Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs6((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D7Ex
 *	@brief binds 7 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs7() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D7Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs7((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D8Ex
 *	@brief binds 8 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs8() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D8Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs8((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D9Ex
 *	@brief binds 9 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs9() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D9Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs9((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D10Ex
 *	@brief binds 10 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs10() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D10Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs10((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D11Ex
 *	@brief binds 11 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs11() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D11Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs11((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D12Ex
 *	@brief binds 12 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs12() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D12Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs12((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D13Ex
 *	@brief binds 13 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs13() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D13Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs13((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D14Ex
 *	@brief binds 14 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs14() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D14Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs14((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D15Ex
 *	@brief binds 15 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs15() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D15Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs15((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D16Ex
 *	@brief binds 16 kernel arguments and yields the call object of a 1D kernel
 *
 *	The result of clSetKernelArgs16() is handed over to CCLKernelCall as its
 *	first argument, followed by the queue, the kernel and the sizes.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] ... are the kernel function arguments (int, float or cl_mem)
 *
 *	@note The h_kernel expression is expanded twice by this macro.
 */
#define clCall1D16Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, ...) \
	CCLKernelCall( \
		clSetKernelArgs16((h_kernel), __VA_ARGS__), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall2D0Ex
 *	@brief yields the call object of a 2D kernel that takes 0 arguments
 *
 *	There are no kernel arguments to set, CL_SUCCESS is passed to
 *	CCLKernelCall as its first argument instead.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_work_size_y is the global work size along y
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] n_block_size_y is the thread block size along y
 */
#define clCall2D0Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y) \
	CCLKernelCall( \
		CL_SUCCESS, \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D1Ex
 *	@brief sets arguments of a 2D kernel with 1 argument and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... is the kernel function argument (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D1Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs1((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D2Ex
 *	@brief sets arguments of a 2D kernel with 2 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D2Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs2((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D3Ex
 *	@brief sets arguments of a 2D kernel with 3 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D3Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs3((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D4Ex
 *	@brief sets arguments of a 2D kernel with 4 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D4Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs4((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D5Ex
 *	@brief sets arguments of a 2D kernel with 5 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D5Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs5((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D6Ex
 *	@brief sets arguments of a 2D kernel with 6 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D6Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs6((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D7Ex
 *	@brief sets arguments of a 2D kernel with 7 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D7Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs7((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D8Ex
 *	@brief sets arguments of a 2D kernel with 8 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D8Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs8((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D9Ex
 *	@brief sets arguments of a 2D kernel with 9 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D9Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs9((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D10Ex
 *	@brief sets arguments of a 2D kernel with 10 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D10Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs10((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D11Ex
 *	@brief sets arguments of a 2D kernel with 11 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D11Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs11((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D12Ex
 *	@brief sets arguments of a 2D kernel with 12 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D12Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs12((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D13Ex
 *	@brief sets arguments of a 2D kernel with 13 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D13Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs13((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D14Ex
 *	@brief sets arguments of a 2D kernel with 14 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D14Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs14((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D15Ex
 *	@brief sets arguments of a 2D kernel with 15 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D15Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs15((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D16Ex
 *	@brief sets arguments of a 2D kernel with 16 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall2D16Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,...) \
	CCLKernelCall(clSetKernelArgs16((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall3D0Ex
 *	@brief yields the call object of a 3D kernel that takes 0 arguments
 *
 *	There are no kernel arguments to set, CL_SUCCESS is passed to
 *	CCLKernelCall as its first argument instead.
 *
 *	@param[in] h_cmd_queue is the command queue handle
 *	@param[in] h_kernel is the handle of the kernel to be called
 *	@param[in] n_work_size_x is the global work size along x
 *	@param[in] n_work_size_y is the global work size along y
 *	@param[in] n_work_size_z is the global work size along z
 *	@param[in] n_block_size_x is the thread block size along x
 *	@param[in] n_block_size_y is the thread block size along y
 *	@param[in] n_block_size_z is the thread block size along z
 */
#define clCall3D0Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, n_block_size_x, n_block_size_y, n_block_size_z) \
	CCLKernelCall( \
		CL_SUCCESS, \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), \
		(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D1Ex
 *	@brief sets arguments of a 3D kernel with 1 argument and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... is the kernel function argument (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D1Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs1((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D2Ex
 *	@brief sets arguments of a 3D kernel with 2 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D2Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs2((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D3Ex
 *	@brief sets arguments of a 3D kernel with 3 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D3Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs3((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D4Ex
 *	@brief sets arguments of a 3D kernel with 4 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D4Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs4((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D5Ex
 *	@brief sets arguments of a 3D kernel with 5 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D5Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs5((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D6Ex
 *	@brief sets arguments of a 3D kernel with 6 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D6Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs6((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D7Ex
 *	@brief sets arguments of a 3D kernel with 7 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D7Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs7((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D8Ex
 *	@brief sets arguments of a 3D kernel with 8 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D8Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs8((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D9Ex
 *	@brief sets arguments of a 3D kernel with 9 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice in the macro body)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note The y and z sizes are explicit macro parameters, the macro body refers
 *		to them by name and they need to precede the kernel arguments.
 */
#define clCall3D9Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs9((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D10Ex
 *	@brief sets arguments of a 3D kernel with 10 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs10().
 */
#define clCall3D10Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs10((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D11Ex
 *	@brief sets arguments of a 3D kernel with 11 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs11().
 */
#define clCall3D11Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs11((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D12Ex
 *	@brief sets arguments of a 3D kernel with 12 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs12().
 */
#define clCall3D12Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs12((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D13Ex
 *	@brief sets arguments of a 3D kernel with 13 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs13().
 */
#define clCall3D13Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs13((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D14Ex
 *	@brief sets arguments of a 3D kernel with 14 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs14().
 */
#define clCall3D14Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs14((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D15Ex
 *	@brief sets arguments of a 3D kernel with 15 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs15().
 */
#define clCall3D15Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs15((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D16Ex
 *	@brief sets arguments of a 3D kernel with 16 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] ... are kernel function arguments (int, float or cl_mem)
 *
 *	@note All six sizes are named macro parameters that precede the kernel
 *		arguments; only the kernel arguments are forwarded to clSetKernelArgs16().
 */
#define clCall3D16Ex(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z, \
	n_block_size_x,n_block_size_y,n_block_size_z,...) \
	CCLKernelCall(clSetKernelArgs16((h_kernel), __VA_ARGS__), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

#else // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief argument setter for kernels that take no arguments
 *
 *	There is nothing to set, so the macro simply expands to CL_SUCCESS;
 *	h_func is accepted for symmetry with the other setters and is not evaluated.
 *
 *	@param[in] h_func is handle to the kernel function (unused)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs0(h_func) (CL_SUCCESS)

/**
 *	@brief passes 1 argument to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the argument and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs1(h_func,a) \
	(CCLArgLoader<0>(h_func), (a)).n_Result()

/**
 *	@brief passes 2 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs2(h_func,a,b) \
	(CCLArgLoader<0>(h_func), (a), (b)).n_Result()

/**
 *	@brief passes 3 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs3(h_func,a,b,c) \
	(CCLArgLoader<0>(h_func), (a), (b), (c)).n_Result()

/**
 *	@brief passes 4 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs4(h_func,a,b,c,d) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d)).n_Result()

/**
 *	@brief passes 5 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs5(h_func,a,b,c,d,e) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e)).n_Result()

/**
 *	@brief passes 6 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs6(h_func,a,b,c,d,e,f) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f)).n_Result()

/**
 *	@brief passes 7 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs7(h_func,a,b,c,d,e,f,g) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g)).n_Result()

/**
 *	@brief passes 8 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs8(h_func,a,b,c,d,e,f,g,h) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h)).n_Result()

/**
 *	@brief passes 9 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs9(h_func,a,b,c,d,e,f,g,h,i) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i)).n_Result()

/**
 *	@brief passes 10 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs10(h_func,a,b,c,d,e,f,g,h,i,j) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j)).n_Result()

/**
 *	@brief passes 11 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs11(h_func,a,b,c,d,e,f,g,h,i,j,k) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k)).n_Result()

/**
 *	@brief passes 12 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs12(h_func,a,b,c,d,e,f,g,h,i,j,k,l) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k), (l)).n_Result()

/**
 *	@brief passes 13 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs13(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k), (l), (m)).n_Result()

/**
 *	@brief passes 14 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs14(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k), (l), (m), (n)).n_Result()

/**
 *	@brief passes 15 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs15(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k), (l), (m), (n), (o)).n_Result()

/**
 *	@brief passes 16 arguments to a kernel through an argument loader chain
 *
 *	Expands to a comma-operator chain that starts with a CCLArgLoader<0> object
 *	constructed from h_func, appends the arguments in order and evaluates to
 *	n_Result() of the object the chain produces.
 *
 *	@param[in] h_func is handle to the kernel function
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs16(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
	(CCLArgLoader<0>(h_func), (a), (b), (c), (d), (e), (f), (g), (h), \
	(i), (j), (k), (l), (m), (n), (o), (p)).n_Result()

/**
 *	@def clCall1D0
 *	@brief sets arguments of a 1D kernel with no arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *
 *	@note There are no arguments to set, so CL_SUCCESS is passed in place of
 *		the argument-setting result. The expansion is a CCLKernelCall object
 *		cast to CLresult; it is wrapped in parentheses so that it stays
 *		a single operand inside larger expressions.
 */
#define clCall1D0(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x) \
	((CLresult)CCLKernelCall(CL_SUCCESS, (h_cmd_queue), (h_kernel), \
	(n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D1
 *	@brief sets the argument of a 1D kernel with 1 argument and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D1(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a) \
	((CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), (a)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D2
 *	@brief sets arguments of a 1D kernel with 2 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D2(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b) \
	((CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), (a), (b)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D3
 *	@brief sets arguments of a 1D kernel with 3 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D3(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c) \
	((CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), (a), (b), (c)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D4
 *	@brief sets arguments of a 1D kernel with 4 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D4(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d) \
	((CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), (a), (b), (c), (d)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D5
 *	@brief sets arguments of a 1D kernel with 5 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D5(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e) \
	((CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), (a), (b), (c), (d), (e)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D6
 *	@brief sets arguments of a 1D kernel with 6 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D6(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f) \
	((CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), (a), (b), (c), (d), \
	(e), (f)), (h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D7
 *	@brief sets arguments of a 1D kernel with 7 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D7(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g) \
	((CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g)), (h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D8
 *	@brief sets arguments of a 1D kernel with 8 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D8(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h) \
	((CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h)), (h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x)))

/**
 *	@def clCall1D9
 *	@brief sets arguments of a 1D kernel with 9 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call (expanded twice,
 *		so it should be free of side effects)
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *
 *	@note The expansion is a CCLKernelCall object cast to CLresult; it is wrapped
 *		in parentheses so that it stays a single operand inside larger expressions.
 */
#define clCall1D9(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i) \
	((CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i)), (h_cmd_queue), (h_kernel), (n_work_size_x), \
	(n_block_size_x)))

/**
 *	@def clCall1D10
 *	@brief sets arguments of a 1D kernel with 10 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 */
#define clCall1D10(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D11
 *	@brief sets arguments of a 1D kernel with 11 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 */
#define clCall1D11(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D12
 *	@brief sets arguments of a 1D kernel with 12 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 */
#define clCall1D12(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k,l) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D13
 *	@brief sets arguments of a 1D kernel with 13 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 */
#define clCall1D13(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k,l,m) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D14
 *	@brief sets arguments of a 1D kernel with 14 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 */
#define clCall1D14(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m), (n)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D15
 *	@brief sets arguments of a 1D kernel with 15 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 */
#define clCall1D15(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m), (n), (o)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D16
 *	@brief sets arguments of a 1D kernel with 16 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 */
#define clCall1D16(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m), (n), (o), (p)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_block_size_x))

/**
 *	@def clCall2D0
 *	@brief calls a 2D kernel with no arguments (no kernel arguments are set)
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 */
#define clCall2D0(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y) \
	/* no arguments are set here; CL_SUCCESS is passed as the first value instead */ \
	(CLresult)CCLKernelCall(CL_SUCCESS, \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D1
 *	@brief sets the argument of a 2D kernel with 1 argument and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 */
#define clCall2D1(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), \
		(a)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D2
 *	@brief sets arguments of a 2D kernel with 2 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 */
#define clCall2D2(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), \
		(a), (b)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D3
 *	@brief sets arguments of a 2D kernel with 3 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 */
#define clCall2D3(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), \
		(a), (b), (c)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D4
 *	@brief sets arguments of a 2D kernel with 4 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 */
#define clCall2D4(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), \
		(a), (b), (c), (d)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D5
 *	@brief sets arguments of a 2D kernel with 5 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 */
#define clCall2D5(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), \
		(a), (b), (c), (d), (e)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D6
 *	@brief sets arguments of a 2D kernel with 6 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 */
#define clCall2D6(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), \
		(a), (b), (c), (d), (e), (f)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D7
 *	@brief sets arguments of a 2D kernel with 7 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 */
#define clCall2D7(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D8
 *	@brief sets arguments of a 2D kernel with 8 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 */
#define clCall2D8(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D9
 *	@brief sets arguments of a 2D kernel with 9 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 */
#define clCall2D9(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D10
 *	@brief sets arguments of a 2D kernel with 10 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 */
#define clCall2D10(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D11
 *	@brief sets arguments of a 2D kernel with 11 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 */
#define clCall2D11(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D12
 *	@brief sets arguments of a 2D kernel with 12 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 */
#define clCall2D12(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k,l) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D13
 *	@brief sets arguments of a 2D kernel with 13 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 */
#define clCall2D13(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k,l,m) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D14
 *	@brief sets arguments of a 2D kernel with 14 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 */
#define clCall2D14(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m), (n)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D15
 *	@brief sets arguments of a 2D kernel with 15 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 */
#define clCall2D15(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
	/* h_kernel is expanded twice: once to set the arguments, once for the call */ \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), \
		(a), (b), (c), (d), (e), (f), (g), (h), \
		(i), (j), (k), (l), (m), (n), (o)), \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), \
		(n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D16
 *	@brief sets arguments of a 2D kernel with 16 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs16 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall2D16(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_block_size_x,n_block_size_y,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h), (i), (j), (k), (l), (m), (n), (o), (p)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall3D0
 *	@brief calls a 3D kernel that takes no arguments
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. There are no
 *		kernel arguments to set, so CL_SUCCESS is passed in the position where
 *		the other clCall3D* macros pass the result of clSetKernelArgs*.
 */
#define clCall3D0(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z) \
	(CLresult)CCLKernelCall(CL_SUCCESS, (h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), \
	(n_work_size_z), (n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D1
 *	@brief sets the argument of a 3D kernel with 1 argument and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs1 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D1(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a) \
	(CLresult)CCLKernelCall(clSetKernelArgs1((h_kernel), (a)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D2
 *	@brief sets arguments of a 3D kernel with 2 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs2 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D2(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b) \
	(CLresult)CCLKernelCall(clSetKernelArgs2((h_kernel), (a), (b)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D3
 *	@brief sets arguments of a 3D kernel with 3 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs3 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D3(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c) \
	(CLresult)CCLKernelCall(clSetKernelArgs3((h_kernel), (a), (b), (c)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D4
 *	@brief sets arguments of a 3D kernel with 4 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs4 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D4(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d) \
	(CLresult)CCLKernelCall(clSetKernelArgs4((h_kernel), (a), (b), (c), (d)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D5
 *	@brief sets arguments of a 3D kernel with 5 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs5 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D5(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e) \
	(CLresult)CCLKernelCall(clSetKernelArgs5((h_kernel), (a), (b), (c), (d), (e)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D6
 *	@brief sets arguments of a 3D kernel with 6 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs6 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D6(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f) \
	(CLresult)CCLKernelCall(clSetKernelArgs6((h_kernel), (a), (b), (c), (d), (e), (f)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D7
 *	@brief sets arguments of a 3D kernel with 7 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs7 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D7(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g) \
	(CLresult)CCLKernelCall(clSetKernelArgs7((h_kernel), (a), (b), (c), (d), (e), (f), (g)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D8
 *	@brief sets arguments of a 3D kernel with 8 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs8 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D8(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h) \
	(CLresult)CCLKernelCall(clSetKernelArgs8((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D9
 *	@brief sets arguments of a 3D kernel with 9 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs9 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D9(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i) \
	(CLresult)CCLKernelCall(clSetKernelArgs9((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h), (i)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D10
 *	@brief sets arguments of a 3D kernel with 10 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs10 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D10(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j) \
	(CLresult)CCLKernelCall(clSetKernelArgs10((h_kernel), (a), (b), (c), \
	(d), (e), (f), (g), (h), (i), (j)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D11
 *	@brief sets arguments of a 3D kernel with 11 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs11 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D11(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k) \
	(CLresult)CCLKernelCall(clSetKernelArgs11((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D12
 *	@brief sets arguments of a 3D kernel with 12 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs12 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D12(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k,l) \
	(CLresult)CCLKernelCall(clSetKernelArgs12((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k), (l)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D13
 *	@brief sets arguments of a 3D kernel with 13 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs13 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D13(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k,l,m) \
	(CLresult)CCLKernelCall(clSetKernelArgs13((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k), (l), (m)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D14
 *	@brief sets arguments of a 3D kernel with 14 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs14 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D14(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k,l,m,n) \
	(CLresult)CCLKernelCall(clSetKernelArgs14((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k), (l), (m), (n)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D15
 *	@brief sets arguments of a 3D kernel with 15 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs15 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D15(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) \
	(CLresult)CCLKernelCall(clSetKernelArgs15((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k), (l), (m), (n), (o)), (h_cmd_queue), \
	(h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall3D16
 *	@brief sets arguments of a 3D kernel with 16 arguments and calls it
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to a CCLKernelCall expression cast to CLresult. h_kernel is
 *		expanded twice (in clSetKernelArgs16 and in CCLKernelCall), so pass
 *		an expression without side effects.
 */
#define clCall3D16(h_cmd_queue,h_kernel,n_work_size_x,n_work_size_y,n_work_size_z,n_block_size_x,n_block_size_y,n_block_size_z,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) \
	(CLresult)CCLKernelCall(clSetKernelArgs16((h_kernel), (a), (b), (c), (d), \
	(e), (f), (g), (h), (i), (j), (k), (l), (m), (n), (o), (p)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_work_size_y), (n_work_size_z), \
	(n_block_size_x), (n_block_size_y), (n_block_size_z))

/**
 *	@def clCall1D0Ex
 *	@brief prepares a call of a 1D kernel with no arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. There are no kernel arguments to set, so CL_SUCCESS is
 *		passed in the position where the other clCall1D*Ex macros pass the
 *		result of clSetKernelArgs*.
 */
#define clCall1D0Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x) \
	CCLKernelCall(CL_SUCCESS, (h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D1Ex
 *	@brief sets the argument of a 1D kernel with 1 argument and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. h_kernel is expanded twice (in clSetKernelArgs1 and in
 *		CCLKernelCall), so pass an expression without side effects.
 */
#define clCall1D1Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a) \
	CCLKernelCall(clSetKernelArgs1((h_kernel), (a)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D2Ex
 *	@brief sets arguments of a 1D kernel with 2 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. h_kernel is expanded twice (in clSetKernelArgs2 and in
 *		CCLKernelCall), so pass an expression without side effects.
 */
#define clCall1D2Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b) \
	CCLKernelCall(clSetKernelArgs2((h_kernel), (a), (b)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D3Ex
 *	@brief sets arguments of a 1D kernel with 3 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. h_kernel is expanded twice (in clSetKernelArgs3 and in
 *		CCLKernelCall), so pass an expression without side effects.
 */
#define clCall1D3Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c) \
	CCLKernelCall(clSetKernelArgs3((h_kernel), (a), (b), (c)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D4Ex
 *	@brief sets arguments of a 1D kernel with 4 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. h_kernel is expanded twice (in clSetKernelArgs4 and in
 *		CCLKernelCall), so pass an expression without side effects.
 */
#define clCall1D4Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d) \
	CCLKernelCall(clSetKernelArgs4((h_kernel), (a), (b), (c), (d)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D5Ex
 *	@brief sets arguments of a 1D kernel with 5 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *
 *	@note Expands to the CCLKernelCall expression itself; no cast to CLresult
 *		is applied. h_kernel is expanded twice (in clSetKernelArgs5 and in
 *		CCLKernelCall), so pass an expression without side effects.
 */
#define clCall1D5Ex(h_cmd_queue,h_kernel,n_work_size_x,n_block_size_x,a,b,c,d,e) \
	CCLKernelCall(clSetKernelArgs5((h_kernel), (a), (b), (c), (d), (e)), \
	(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D6Ex
 *	@brief sets arguments of a 1D kernel with 6 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 */
#define clCall1D6Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f) \
	CCLKernelCall( \
		/* value produced by setting the 6 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs6((h_kernel), (a), (b), (c), (d), (e), (f)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D7Ex
 *	@brief sets arguments of a 1D kernel with 7 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 */
#define clCall1D7Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g) \
	CCLKernelCall( \
		/* value produced by setting the 7 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs7((h_kernel), (a), (b), (c), (d), (e), (f), (g)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D8Ex
 *	@brief sets arguments of a 1D kernel with 8 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 */
#define clCall1D8Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h) \
	CCLKernelCall( \
		/* value produced by setting the 8 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs8((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D9Ex
 *	@brief sets arguments of a 1D kernel with 9 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 */
#define clCall1D9Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i) \
	CCLKernelCall( \
		/* value produced by setting the 9 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs9((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D10Ex
 *	@brief sets arguments of a 1D kernel with 10 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 */
#define clCall1D10Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j) \
	CCLKernelCall( \
		/* value produced by setting the 10 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs10((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D11Ex
 *	@brief sets arguments of a 1D kernel with 11 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 */
#define clCall1D11Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k) \
	CCLKernelCall( \
		/* value produced by setting the 11 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs11((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D12Ex
 *	@brief sets arguments of a 1D kernel with 12 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 */
#define clCall1D12Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k, l) \
	CCLKernelCall( \
		/* value produced by setting the 12 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs12((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D13Ex
 *	@brief sets arguments of a 1D kernel with 13 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 */
#define clCall1D13Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k, l, m) \
	CCLKernelCall( \
		/* value produced by setting the 13 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs13((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D14Ex
 *	@brief sets arguments of a 1D kernel with 14 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 */
#define clCall1D14Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
	CCLKernelCall( \
		/* value produced by setting the 14 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs14((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D15Ex
 *	@brief sets arguments of a 1D kernel with 15 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 */
#define clCall1D15Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) \
	CCLKernelCall( \
		/* value produced by setting the 15 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs15((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n), (o)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall1D16Ex
 *	@brief sets arguments of a 1D kernel with 16 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 */
#define clCall1D16Ex(h_cmd_queue, h_kernel, n_work_size_x, n_block_size_x, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	CCLKernelCall( \
		/* value produced by setting the 16 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs16((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n), (o), (p)), \
		/* command queue, kernel, then 1D global work size and block size */ \
		(h_cmd_queue), (h_kernel), (n_work_size_x), (n_block_size_x))

/**
 *	@def clCall2D0Ex
 *	@brief prepares a call of a 2D kernel with no arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 */
#define clCall2D0Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y) \
	CCLKernelCall( \
		/* no kernel arguments are set here, so CL_SUCCESS stands in for the argument-setting result */ \
		CL_SUCCESS, \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D1Ex
 *	@brief sets arguments of a 2D kernel with 1 argument and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 */
#define clCall2D1Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a) \
	CCLKernelCall( \
		/* value produced by setting the single kernel argument; note h_kernel is expanded twice */ \
		clSetKernelArgs1((h_kernel), (a)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D2Ex
 *	@brief sets arguments of a 2D kernel with 2 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 */
#define clCall2D2Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b) \
	CCLKernelCall( \
		/* value produced by setting the 2 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs2((h_kernel), (a), (b)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D3Ex
 *	@brief sets arguments of a 2D kernel with 3 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 */
#define clCall2D3Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c) \
	CCLKernelCall( \
		/* value produced by setting the 3 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs3((h_kernel), (a), (b), (c)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D4Ex
 *	@brief sets arguments of a 2D kernel with 4 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 */
#define clCall2D4Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d) \
	CCLKernelCall( \
		/* value produced by setting the 4 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs4((h_kernel), (a), (b), (c), (d)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D5Ex
 *	@brief sets arguments of a 2D kernel with 5 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 */
#define clCall2D5Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e) \
	CCLKernelCall( \
		/* value produced by setting the 5 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs5((h_kernel), (a), (b), (c), (d), (e)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D6Ex
 *	@brief sets arguments of a 2D kernel with 6 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 */
#define clCall2D6Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f) \
	CCLKernelCall( \
		/* value produced by setting the 6 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs6((h_kernel), (a), (b), (c), (d), (e), (f)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D7Ex
 *	@brief sets arguments of a 2D kernel with 7 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 */
#define clCall2D7Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g) \
	CCLKernelCall( \
		/* value produced by setting the 7 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs7((h_kernel), (a), (b), (c), (d), (e), (f), (g)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D8Ex
 *	@brief sets arguments of a 2D kernel with 8 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 */
#define clCall2D8Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h) \
	CCLKernelCall( \
		/* value produced by setting the 8 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs8((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D9Ex
 *	@brief sets arguments of a 2D kernel with 9 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 */
#define clCall2D9Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i) \
	CCLKernelCall( \
		/* value produced by setting the 9 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs9((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D10Ex
 *	@brief sets arguments of a 2D kernel with 10 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 */
#define clCall2D10Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j) \
	CCLKernelCall( \
		/* value produced by setting the 10 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs10((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D11Ex
 *	@brief sets arguments of a 2D kernel with 11 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 */
#define clCall2D11Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k) \
	CCLKernelCall( \
		/* value produced by setting the 11 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs11((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D12Ex
 *	@brief sets arguments of a 2D kernel with 12 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 */
#define clCall2D12Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k, l) \
	CCLKernelCall( \
		/* value produced by setting the 12 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs12((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D13Ex
 *	@brief sets arguments of a 2D kernel with 13 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 */
#define clCall2D13Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k, l, m) \
	CCLKernelCall( \
		/* value produced by setting the 13 kernel arguments; note h_kernel is expanded twice */ \
		clSetKernelArgs13((h_kernel), \
			(a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m)), \
		/* command queue and kernel */ \
		(h_cmd_queue), (h_kernel), \
		/* 2D global work size (x, y), then block size (x, y) */ \
		(n_work_size_x), (n_work_size_y), (n_block_size_x), (n_block_size_y))

/**
 *	@def clCall2D14Ex
 *	@brief sets arguments of a 2D kernel with 14 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 */
#define clCall2D14Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, \
		n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
	CCLKernelCall( \
		clSetKernelArgs14((h_kernel), (a), (b), (c), (d), (e), (f), (g), \
			(h), (i), (j), (k), (l), (m), (n)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), /* global work size */ \
		(n_block_size_x), (n_block_size_y)) /* block size */

/**
 *	@def clCall2D15Ex
 *	@brief sets arguments of a 2D kernel with 15 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 */
#define clCall2D15Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, \
		n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) \
	CCLKernelCall( \
		clSetKernelArgs15((h_kernel), (a), (b), (c), (d), (e), (f), (g), \
			(h), (i), (j), (k), (l), (m), (n), (o)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), /* global work size */ \
		(n_block_size_x), (n_block_size_y)) /* block size */

/**
 *	@def clCall2D16Ex
 *	@brief sets arguments of a 2D kernel with 16 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 */
#define clCall2D16Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, \
		n_block_size_x, n_block_size_y, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	CCLKernelCall( \
		clSetKernelArgs16((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n), (o), (p)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), /* global work size */ \
		(n_block_size_x), (n_block_size_y)) /* block size */

/**
 *	@def clCall3D0Ex
 *	@brief creates the call object for a 3D kernel with 0 arguments (no arguments are set)
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 */
#define clCall3D0Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z) \
	CCLKernelCall( \
		CL_SUCCESS, /* no kernel arguments to set, so there is no setter result to pass */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D1Ex
 *	@brief sets arguments of a 3D kernel with 1 argument and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 */
#define clCall3D1Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a) \
	CCLKernelCall( \
		clSetKernelArgs1((h_kernel), (a)), /* result of setting the kernel argument */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D2Ex
 *	@brief sets arguments of a 3D kernel with 2 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 */
#define clCall3D2Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b) \
	CCLKernelCall( \
		clSetKernelArgs2((h_kernel), (a), (b)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D3Ex
 *	@brief sets arguments of a 3D kernel with 3 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 */
#define clCall3D3Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c) \
	CCLKernelCall( \
		clSetKernelArgs3((h_kernel), (a), (b), (c)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D4Ex
 *	@brief sets arguments of a 3D kernel with 4 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 */
#define clCall3D4Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d) \
	CCLKernelCall( \
		clSetKernelArgs4((h_kernel), (a), (b), (c), (d)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D5Ex
 *	@brief sets arguments of a 3D kernel with 5 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 */
#define clCall3D5Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e) \
	CCLKernelCall( \
		clSetKernelArgs5((h_kernel), (a), (b), (c), (d), (e)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D6Ex
 *	@brief sets arguments of a 3D kernel with 6 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 */
#define clCall3D6Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f) \
	CCLKernelCall( \
		clSetKernelArgs6((h_kernel), (a), (b), (c), (d), (e), (f)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D7Ex
 *	@brief sets arguments of a 3D kernel with 7 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 */
#define clCall3D7Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g) \
	CCLKernelCall( \
		clSetKernelArgs7((h_kernel), (a), (b), (c), (d), \
			(e), (f), (g)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D8Ex
 *	@brief sets arguments of a 3D kernel with 8 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 */
#define clCall3D8Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h) \
	CCLKernelCall( \
		clSetKernelArgs8((h_kernel), (a), (b), (c), (d), \
			(e), (f), (g), (h)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D9Ex
 *	@brief sets arguments of a 3D kernel with 9 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 */
#define clCall3D9Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i) \
	CCLKernelCall( \
		clSetKernelArgs9((h_kernel), (a), (b), (c), (d), (e), \
			(f), (g), (h), (i)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D10Ex
 *	@brief sets arguments of a 3D kernel with 10 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 */
#define clCall3D10Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j) \
	CCLKernelCall( \
		clSetKernelArgs10((h_kernel), (a), (b), (c), (d), (e), \
			(f), (g), (h), (i), (j)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D11Ex
 *	@brief sets arguments of a 3D kernel with 11 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 */
#define clCall3D11Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k) \
	CCLKernelCall( \
		clSetKernelArgs11((h_kernel), (a), (b), (c), (d), (e), (f), \
			(g), (h), (i), (j), (k)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D12Ex
 *	@brief sets arguments of a 3D kernel with 12 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 */
#define clCall3D12Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k, l) \
	CCLKernelCall( \
		clSetKernelArgs12((h_kernel), (a), (b), (c), (d), (e), (f), \
			(g), (h), (i), (j), (k), (l)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D13Ex
 *	@brief sets arguments of a 3D kernel with 13 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 */
#define clCall3D13Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k, l, m) \
	CCLKernelCall( \
		clSetKernelArgs13((h_kernel), (a), (b), (c), (d), (e), (f), (g), \
			(h), (i), (j), (k), (l), (m)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D14Ex
 *	@brief sets arguments of a 3D kernel with 14 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 */
#define clCall3D14Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k, l, m, n) \
	CCLKernelCall( \
		clSetKernelArgs14((h_kernel), (a), (b), (c), (d), (e), (f), (g), \
			(h), (i), (j), (k), (l), (m), (n)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D15Ex
 *	@brief sets arguments of a 3D kernel with 15 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 */
#define clCall3D15Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o) \
	CCLKernelCall( \
		clSetKernelArgs15((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n), (o)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

/**
 *	@def clCall3D16Ex
 *	@brief sets arguments of a 3D kernel with 16 arguments and returns the call object
 *
 *	@param[in] h_cmd_queue is handle to the command queue
 *	@param[in] h_kernel is handle to the kernel to call
 *	@param[in] n_work_size_x is global work size in the x dimension
 *	@param[in] n_work_size_y is global work size in the y dimension
 *	@param[in] n_work_size_z is global work size in the z dimension
 *	@param[in] n_block_size_x is thread block size in the x dimension
 *	@param[in] n_block_size_y is thread block size in the y dimension
 *	@param[in] n_block_size_z is thread block size in the z dimension
 *	@param[in] a is kernel function argument (int, float or cl_mem)
 *	@param[in] b is kernel function argument (int, float or cl_mem)
 *	@param[in] c is kernel function argument (int, float or cl_mem)
 *	@param[in] d is kernel function argument (int, float or cl_mem)
 *	@param[in] e is kernel function argument (int, float or cl_mem)
 *	@param[in] f is kernel function argument (int, float or cl_mem)
 *	@param[in] g is kernel function argument (int, float or cl_mem)
 *	@param[in] h is kernel function argument (int, float or cl_mem)
 *	@param[in] i is kernel function argument (int, float or cl_mem)
 *	@param[in] j is kernel function argument (int, float or cl_mem)
 *	@param[in] k is kernel function argument (int, float or cl_mem)
 *	@param[in] l is kernel function argument (int, float or cl_mem)
 *	@param[in] m is kernel function argument (int, float or cl_mem)
 *	@param[in] n is kernel function argument (int, float or cl_mem)
 *	@param[in] o is kernel function argument (int, float or cl_mem)
 *	@param[in] p is kernel function argument (int, float or cl_mem)
 */
#define clCall3D16Ex(h_cmd_queue, h_kernel, n_work_size_x, n_work_size_y, n_work_size_z, \
		n_block_size_x, n_block_size_y, n_block_size_z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) \
	CCLKernelCall( \
		clSetKernelArgs16((h_kernel), (a), (b), (c), (d), (e), (f), (g), (h), \
			(i), (j), (k), (l), (m), (n), (o), (p)), /* result of setting the kernel arguments */ \
		(h_cmd_queue), (h_kernel), \
		(n_work_size_x), (n_work_size_y), (n_work_size_z), /* global work size */ \
		(n_block_size_x), (n_block_size_y), (n_block_size_z)) /* block size */

#endif // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400
