#ifndef __CL_KERNEL_NVIDIA_SPECIFIC_INCLUDED
#define __CL_KERNEL_NVIDIA_SPECIFIC_INCLUDED

#include "VendorDetect.h"
#include "Integer.h"

// number of threads in a warp; currently fixed to 32 for all vendors
// (a vendor-dependent value, e.g. 64 for non-NVIDIA devices, was considered but is
// disabled: some AMD GPUs allegedly have warp size of only 32, which would break code
// that relies on warp-synchronous programming; does not make a difference so far)
#define WARP_SIZE 32

// base-2 logarithm of WARP_SIZE (n_Log2 is not defined in this file,
// presumably it comes from Integer.h)
#define LOG_WARP_SIZE (n_Log2(WARP_SIZE))

// signed and unsigned integer types with exactly WARP_SIZE bits (presumably intended
// for masks holding one bit per thread of a warp); only warp sizes of 8, 16, 32 and 64
// are supported, any other value is a compile-time error
// note: the fixed-width integer types used below are not declared in this file
// (presumably they are provided by Integer.h)
#if WARP_SIZE == 32
typedef int32_t intwarp_t;
typedef uint32_t uintwarp_t;
#elif WARP_SIZE == 64
typedef int64_t intwarp_t;
typedef uint64_t uintwarp_t;
#elif WARP_SIZE == 16
typedef int16_t intwarp_t;
typedef uint16_t uintwarp_t;
#elif WARP_SIZE == 8
typedef int8_t intwarp_t;
typedef uint8_t uintwarp_t;
#else
#error "internal error: unexpected warp size"
#endif

// number of local memory banks; assumed to be the same as WARP_SIZE
// (32 on SM 2.0+) -- NOTE(review): unverified assumption, confirm for the target devices
#define LOCAL_MEMORY_BANK_NUM WARP_SIZE

// the shift-based padding below is only valid for a power-of-two number of banks
STATIC_ASSERT(b_Is_POT(LOCAL_MEMORY_BANK_NUM), LOCAL_MEMORY_BANK_NUM_MUST_BE_POWER_OF_TWO);

// base-2 logarithm of the number of banks
#define LOG_LOCAL_MEMORY_BANK_NUM (n_Log2(LOCAL_MEMORY_BANK_NUM))

// number of padding elements inserted before element x
// (one for every LOCAL_MEMORY_BANK_NUM elements)
#define CONFLICT_FREE_OFFSET(x) ((x) >> LOG_LOCAL_MEMORY_BANK_NUM)

// index of element x in the padded array (judging by the name, the padding is meant
// to avoid local memory bank conflicts); note that x is evaluated twice
#define CONFLICT_FREE_INDEX(x) ((x) + CONFLICT_FREE_OFFSET((x)))

// number of elements of a padded array that holds s elements; this equals
// CONFLICT_FREE_INDEX(s - 1) + 1; note that s is evaluated twice and that
// s - 1 is shifted, so s is presumably expected to be at least 1
#define CONFLICT_FREE_SIZE(s) ((s) + CONFLICT_FREE_OFFSET((s) - 1))

// type of the clock counter value returned by n_nv_Clock():
// 64-bit unsigned if TIME_64BIT is defined, 32-bit unsigned otherwise
#ifdef TIME_64BIT
typedef uint64_t TTimeVal;
#else // TIME_64BIT
typedef uint32_t TTimeVal;
#endif // TIME_64BIT

/**
 *	@brief reads the high resolution GPU clock counter
 *
 *	If NVIDIA is defined, the value is read from the PTX special register
 *	%clock64 (when TIME_64BIT is defined) or %clock (otherwise). If NVIDIA
 *	is not defined, no counter is read and the function always returns 0.
 *
 *	@return Returns the value of the counter, in clocks (divide by device
 *		frequency to get time), or 0 if NVIDIA is not defined.
 */
inline TTimeVal n_nv_Clock()
{
#ifndef NVIDIA
	return 0; // no counter available in the generic path
#else // !NVIDIA
	TTimeVal n_ticks;
	// the asm statements are volatile so that the compiler
	// does not remove or move the counter read
#ifndef TIME_64BIT
	asm volatile("mov.u32 %0, %%clock;" : "=r" (n_ticks));
#else // !TIME_64BIT
	asm volatile("mov.u64 %0, %%clock64;" : "=l" (n_ticks));
#endif // !TIME_64BIT
	return n_ticks;
#endif // !NVIDIA
}

/**
 *	@brief reads a 32-bit unsigned integer value from global memory using a streaming load
 *
 *	If NVIDIA is defined, the value is read using the PTX ld.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary load is performed.
 *
 *	@param[in] p_src is address of the value to be loaded
 *
 *	@return Returns the value at the specified address.
 *
 *	@note The address is passed to the asm statement with the 32-bit "r" constraint;
 *		this presumably assumes 32-bit device pointers (TODO confirm for 64-bit targets).
 */
inline uint32_t n_StreamingLoad_uint32_t(__global __read_only const uint32_t *p_src)
{
#ifdef NVIDIA
	uint32_t n_result;
	// the "memory" clobber declares that the asm accesses memory which is not listed among
	// its operands, so that the compiler does not reorder the load across stores or reuse
	// a stale result of an earlier identical load
	asm("ld.global.cs.u32 %r0, [%r1];" : "=r" (n_result) : "r" (p_src) : "memory");
	return n_result;
#else // NVIDIA
	return *p_src; // generic
#endif // NVIDIA
}

/**
 *	@brief writes a 32-bit unsigned integer value to global memory using a streaming store
 *
 *	If NVIDIA is defined, the value is written using the PTX st.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary store is performed.
 *
 *	@param[out] p_dest is address to be written to
 *	@param[in] n_value is value to write
 */
inline void StreamingWrite_uint32_t(__global __write_only uint32_t *p_dest, const uint32_t n_value)
{
#ifndef NVIDIA
	p_dest[0] = n_value; // generic fallback, ordinary store
#else // !NVIDIA
	asm("st.global.cs.u32 [%r0], %r1;"
		: /* no outputs */
		: "r" (p_dest), "r" (n_value)
		: "memory"); // the asm writes memory which is not listed among its operands
#endif // !NVIDIA
}

/**
 *	@brief reads a 32-bit signed integer value from global memory using a streaming load
 *
 *	If NVIDIA is defined, the value is read using the PTX ld.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary load is performed.
 *
 *	@param[in] p_src is address of the value to be loaded
 *
 *	@return Returns the value at the specified address.
 *
 *	@note The address is passed to the asm statement with the 32-bit "r" constraint;
 *		this presumably assumes 32-bit device pointers (TODO confirm for 64-bit targets).
 */
inline int32_t n_StreamingLoad_int32_t(__global __read_only const int32_t *p_src)
{
#ifdef NVIDIA
	int32_t n_result; // signed, matches both the .s32 load and the return type
	// the "memory" clobber declares that the asm accesses memory which is not listed among
	// its operands, so that the compiler does not reorder the load across stores or reuse
	// a stale result of an earlier identical load
	asm("ld.global.cs.s32 %r0, [%r1];" : "=r" (n_result) : "r" (p_src) : "memory");
	return n_result;
#else // NVIDIA
	return *p_src; // generic
#endif // NVIDIA
}

/**
 *	@brief writes a 32-bit signed integer value to global memory using a streaming store
 *
 *	If NVIDIA is defined, the value is written using the PTX st.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary store is performed.
 *
 *	@param[out] p_dest is address to be written to
 *	@param[in] n_value is value to write
 */
inline void StreamingWrite_int32_t(__global __write_only int32_t *p_dest, const int32_t n_value)
{
#ifndef NVIDIA
	p_dest[0] = n_value; // generic fallback, ordinary store
#else // !NVIDIA
	asm("st.global.cs.s32 [%r0], %r1;"
		: /* no outputs */
		: "r" (p_dest), "r" (n_value)
		: "memory"); // the asm writes memory which is not listed among its operands
#endif // !NVIDIA
}

/**
 *	@brief reads a 32-bit floating-point value from global memory using a streaming load
 *
 *	If NVIDIA is defined, the value is read using the PTX ld.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary load is performed.
 *
 *	@param[in] p_src is address of the value to be loaded
 *
 *	@return Returns the value at the specified address.
 *
 *	@note The address is passed to the asm statement with the 32-bit "r" constraint;
 *		this presumably assumes 32-bit device pointers (TODO confirm for 64-bit targets).
 */
inline float n_StreamingLoad_float(__global __read_only const float *p_src)
{
#ifdef NVIDIA
	float f_result;
	// the output uses the "f" constraint (32-bit floating-point register) rather than "r"
	// (32-bit integer register), to match the .f32 type of the instruction; the "memory"
	// clobber declares that the asm accesses memory which is not listed among its operands,
	// so that the compiler does not reorder the load across stores or reuse a stale result
	asm("ld.global.cs.f32 %r0, [%r1];" : "=f" (f_result) : "r" (p_src) : "memory");
	return f_result;
#else // NVIDIA
	return *p_src; // generic
#endif // NVIDIA
}

/**
 *	@brief writes a 32-bit floating-point value to global memory using a streaming store
 *
 *	If NVIDIA is defined, the value is written using the PTX st.global.cs instruction
 *	(the .cs cache operator marks the data as streaming, i.e. likely to be accessed
 *	only once). Otherwise, an ordinary store is performed.
 *
 *	@param[out] p_dest is address to be written to
 *	@param[in] f_value is value to write
 *
 *	@note The address is passed to the asm statement with the 32-bit "r" constraint;
 *		this presumably assumes 32-bit device pointers (TODO confirm for 64-bit targets).
 */
inline void StreamingWrite_float(__global __write_only float *p_dest, const float f_value)
{
#ifdef NVIDIA
	// the value uses the "f" constraint (32-bit floating-point register) rather than "r"
	// (32-bit integer register), to match the .f32 type of the instruction
	asm("st.global.cs.f32 [%r0], %r1;" :  : "r" (p_dest), "f" (f_value) : "memory");
#else // NVIDIA
	*p_dest = f_value; // generic
#endif // NVIDIA
}

#endif // !__CL_KERNEL_NVIDIA_SPECIFIC_INCLUDED
