#ifndef __CL_KERNEL_NVIDIA_CC20_SPECIFIC_INCLUDED
#define __CL_KERNEL_NVIDIA_CC20_SPECIFIC_INCLUDED

/**
 *	@file gpgpu/kernel_utils/NV20.h
 *	@brief functionality for devices with compute capability 2.0 and higher
 */

#include "Integer.h"
#include "NVIDIA.h" // intwarp_t

// warp voting has actually been available since CC 1.2, but those GPUs are mostly historical today

inline bool b_warp_vote_all(bool b_flag)
{
	bool b_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.all.pred %0, p;\n\t"
		"}" : "=r"(b_result) : "r"(b_flag));
	return b_result;
}

inline bool b_warp_vote_none(bool b_flag)
{
	bool b_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.all.pred %0, !p;\n\t"
		"}" : "=r"(b_result) : "r"(b_flag));
	return b_result;
}

inline bool b_warp_vote_any(bool b_flag)
{
	bool b_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.any.pred %0, p;\n\t"
		"}" : "=r"(b_result) : "r"(b_flag));
	return b_result;
}

inline bool b_warp_vote_not_all(bool b_flag)
{
	bool b_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.any.pred %0, !p;\n\t"
		"}" : "=r"(b_result) : "r"(b_flag));
	return b_result;
}

inline bool b_warp_vote_uni(bool b_flag)
{
	bool b_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.uni.pred %0, p;\n\t"
		"}" : "=r"(b_result) : "r"(b_flag));
	return b_result;
}

// ballot is since CC 2.0

inline intwarp_t n_warp_ballot(bool b_flag)
{
	uint32_t n_result;
	asm("{.reg .pred p;\n\t"
		"setp.ne.u32 p, %1, 0;\n\t"
		"vote.ballot.b32 %0, p;\n\t"
		"}" : "=r"(n_result) : "r"(b_flag));
	return n_result;
}

// bit counting, since CC 2.0

/**
 *	@brief counts the bits set in a 32-bit integer (population count)
 *	@param[in] n_x is the value to count the set bits in
 *	@return Returns the number of bits set in n_x.
 */
inline uint32_t n_popc32(uint32_t n_x)
{
	uint32_t n_set_bit_num;
	asm("popc.b32 %0, %1;"
		: "=r"(n_set_bit_num)
		: "r"(n_x));
	return n_set_bit_num;
}

/**
 *	@brief counts the bits set in a 64-bit integer (population count)
 *	@param[in] n_x is the value to count the set bits in
 *	@return Returns the number of bits set in n_x.
 */
inline uint32_t n_popc64(uint64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of popc is always 32-bit
	asm("popc.b64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief counts the leading zero bits in a 32-bit integer
 *	@param[in] n_x is the value to count the leading zeros in
 *	@return Returns the number of leading zeros in n_x,
 *		which equals 31 - n_bfind_uint32_t(n_x).
 */
inline uint32_t n_lzcnt32(uint32_t n_x)
{
	uint32_t n_leading_zero_num;
	asm("clz.b32 %0, %1;"
		: "=r"(n_leading_zero_num)
		: "r"(n_x));
	return n_leading_zero_num;
}

/**
 *	@brief counts the leading zero bits in a 64-bit integer
 *	@param[in] n_x is the value to count the leading zeros in
 *	@return Returns the number of leading zeros in n_x,
 *		which equals 63 - n_bfind_uint64_t(n_x).
 */
inline uint32_t n_lzcnt64(uint64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of clz is always 32-bit
	asm("clz.b64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief finds the most significant set bit in an unsigned 32-bit integer
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.u32 instruction,
 *		which equals 31 - n_lzcnt32(n_x).
 */
inline uint32_t n_bfind_uint32_t(uint32_t n_x)
{
	uint32_t n_bit_index;
	asm("bfind.u32 %0, %1;"
		: "=r"(n_bit_index)
		: "r"(n_x));
	return n_bit_index;
}

/**
 *	@brief finds the most significant set bit in an unsigned 64-bit integer
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.u64 instruction,
 *		which equals 63 - n_lzcnt64(n_x).
 */
inline uint32_t n_bfind_uint64_t(uint64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of bfind is always 32-bit
	asm("bfind.u64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief finds the most significant non-sign bit in a signed 32-bit integer
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.s32 instruction.
 */
inline uint32_t n_bfind_int32_t(int32_t n_x)
{
	uint32_t n_bit_index;
	asm("bfind.s32 %0, %1;"
		: "=r"(n_bit_index)
		: "r"(n_x));
	return n_bit_index;
}

/**
 *	@brief finds the most significant non-sign bit in a signed 64-bit integer
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.s64 instruction.
 */
inline uint32_t n_bfind_int64_t(int64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of bfind is always 32-bit
	asm("bfind.s64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief finds the most significant set bit in an unsigned 32-bit integer
 *		and reports it as a shift amount
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.shiftamt.u32 instruction.
 */
inline uint32_t n_bfind_shiftamt_uint32_t(uint32_t n_x)
{
	uint32_t n_shift;
	asm("bfind.shiftamt.u32 %0, %1;"
		: "=r"(n_shift)
		: "r"(n_x));
	return n_shift;
}

/**
 *	@brief finds the most significant set bit in an unsigned 64-bit integer
 *		and reports it as a shift amount
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.shiftamt.u64 instruction.
 */
inline uint32_t n_bfind_shiftamt_uint64_t(uint64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of bfind is always 32-bit
	asm("bfind.shiftamt.u64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief finds the most significant non-sign bit in a signed 32-bit integer
 *		and reports it as a shift amount
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.shiftamt.s32 instruction.
 */
inline uint32_t n_bfind_shiftamt_int32_t(int32_t n_x)
{
	uint32_t n_shift;
	asm("bfind.shiftamt.s32 %0, %1;"
		: "=r"(n_shift)
		: "r"(n_x));
	return n_shift;
}

/**
 *	@brief finds the most significant non-sign bit in a signed 64-bit integer
 *		and reports it as a shift amount
 *	@param[in] n_x is the value to search
 *	@return Returns the result of the bfind.shiftamt.s64 instruction.
 */
inline uint32_t n_bfind_shiftamt_int64_t(int64_t n_x)
{
	uint32_t n_result;
	// the source is 64-bit and needs the "l" constraint ("r" is a 32-bit register);
	// the destination of bfind is always 32-bit
	asm("bfind.shiftamt.s64 %0, %1;" : "=r"(n_result) : "l"(n_x));
	return n_result;
}

/**
 *	@brief reverses the order of bits in a 32-bit integer
 *	@param[in] n_x is the value to reverse the bits of
 *	@return Returns the result of the brev.b32 instruction applied to n_x.
 */
inline uint32_t n_brev32(uint32_t n_x)
{
	uint32_t n_reversed;
	asm("brev.b32 %0, %1;"
		: "=r"(n_reversed)
		: "r"(n_x));
	return n_reversed;
}

/**
 *	@brief reverses the order of bits in a 64-bit integer
 *	@param[in] n_x is the value to reverse the bits of
 *	@return Returns the result of the brev.b64 instruction applied to n_x.
 */
inline uint64_t n_brev64(uint64_t n_x)
{
	uint64_t n_result;
	// both operands of brev.b64 are 64-bit and need the "l" constraint
	// ("r" is a 32-bit register)
	asm("brev.b64 %0, %1;" : "=l"(n_result) : "l"(n_x));
	return n_result;
}

#endif // !__CL_KERNEL_NVIDIA_CC20_SPECIFIC_INCLUDED
