#include <immintrin.h>
#include <xmmintrin.h>

// Lifting weights used by VERT_2X8, each replicated into all eight lanes of a
// __m256. VERT_2X8 applies them in the order w3, w2, w1, w0 (w3 to the newest
// sample, w0 to produce the first output).
#define CDF97_SPLAT8_PS(v) { (v), (v), (v), (v), (v), (v), (v), (v) }

static const __m256 w0 = CDF97_SPLAT8_PS(+CDF97_U2_S);
static const __m256 w1 = CDF97_SPLAT8_PS(+CDF97_P2_S);
static const __m256 w2 = CDF97_SPLAT8_PS(+CDF97_U1_S);
static const __m256 w3 = CDF97_SPLAT8_PS(+CDF97_P1_S);

#undef CDF97_SPLAT8_PS

/*
 * _MM256_TRANSPOSE8_PS - transpose, in place, an 8x8 matrix of floats held in
 * eight __m256 rows.
 *
 * row0..row7 must be modifiable lvalues of type __m256. All eight are read
 * before any of them is overwritten; afterwards lane j of row i holds the
 * value that lane i of row j held before. Each argument appears several
 * times in the expansion, so pass plain lvalues without side effects.
 *
 * Stages:
 *   1. unpacklo/unpackhi interleave each pair of adjacent rows (this works
 *      independently inside each 128-bit half),
 *   2. shuffle combines two such pairs into four-element column fragments,
 *   3. permute2f128 joins the low halves (0x20) or the high halves (0x31) of
 *      two fragments into one complete eight-element column.
 *
 * The temporaries are named tr8_*_ so that they neither capture caller
 * variables nor use the implementation-reserved "__" prefix.
 */
#define _MM256_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7) \
do { \
	__m256 tr8_u0_, tr8_u1_, tr8_u2_, tr8_u3_, tr8_u4_, tr8_u5_, tr8_u6_, tr8_u7_; \
	__m256 tr8_s0_, tr8_s1_, tr8_s2_, tr8_s3_, tr8_s4_, tr8_s5_, tr8_s6_, tr8_s7_; \
\
	tr8_u0_ = _mm256_unpacklo_ps((row0), (row1)); \
	tr8_u1_ = _mm256_unpackhi_ps((row0), (row1)); \
	tr8_u2_ = _mm256_unpacklo_ps((row2), (row3)); \
	tr8_u3_ = _mm256_unpackhi_ps((row2), (row3)); \
	tr8_u4_ = _mm256_unpacklo_ps((row4), (row5)); \
	tr8_u5_ = _mm256_unpackhi_ps((row4), (row5)); \
	tr8_u6_ = _mm256_unpacklo_ps((row6), (row7)); \
	tr8_u7_ = _mm256_unpackhi_ps((row6), (row7)); \
\
	tr8_s0_ = _mm256_shuffle_ps(tr8_u0_, tr8_u2_, _MM_SHUFFLE(1,0,1,0)); \
	tr8_s1_ = _mm256_shuffle_ps(tr8_u0_, tr8_u2_, _MM_SHUFFLE(3,2,3,2)); \
	tr8_s2_ = _mm256_shuffle_ps(tr8_u1_, tr8_u3_, _MM_SHUFFLE(1,0,1,0)); \
	tr8_s3_ = _mm256_shuffle_ps(tr8_u1_, tr8_u3_, _MM_SHUFFLE(3,2,3,2)); \
	tr8_s4_ = _mm256_shuffle_ps(tr8_u4_, tr8_u6_, _MM_SHUFFLE(1,0,1,0)); \
	tr8_s5_ = _mm256_shuffle_ps(tr8_u4_, tr8_u6_, _MM_SHUFFLE(3,2,3,2)); \
	tr8_s6_ = _mm256_shuffle_ps(tr8_u5_, tr8_u7_, _MM_SHUFFLE(1,0,1,0)); \
	tr8_s7_ = _mm256_shuffle_ps(tr8_u5_, tr8_u7_, _MM_SHUFFLE(3,2,3,2)); \
\
	(row0) = _mm256_permute2f128_ps(tr8_s0_, tr8_s4_, 0x20); \
	(row1) = _mm256_permute2f128_ps(tr8_s1_, tr8_s5_, 0x20); \
	(row2) = _mm256_permute2f128_ps(tr8_s2_, tr8_s6_, 0x20); \
	(row3) = _mm256_permute2f128_ps(tr8_s3_, tr8_s7_, 0x20); \
	(row4) = _mm256_permute2f128_ps(tr8_s0_, tr8_s4_, 0x31); \
	(row5) = _mm256_permute2f128_ps(tr8_s1_, tr8_s5_, 0x31); \
	(row6) = _mm256_permute2f128_ps(tr8_s2_, tr8_s6_, 0x31); \
	(row7) = _mm256_permute2f128_ps(tr8_s3_, tr8_s7_, 0x31); \
} while(0)

/*
 * VERT_2X8 - advance the four-stage lifting filter by two input samples, on
 * eight independent lanes at once.
 *
 *   in0, in1    two consecutive input vectors (__m256), in0 being the earlier
 *   out0, out1  __m256 lvalues receiving the two results; they may be the
 *               same objects as in0/in1, because both inputs are read before
 *               either output is written
 *   buff        float *, 32 floats of filter state (four 8-float vectors).
 *               It is accessed with _mm256_load_ps/_mm256_store_ps and must
 *               therefore be 32-byte aligned. It is read at the start and
 *               rewritten with the new state at the end.
 *
 * With l0..l3 the state vectors loaded from buff, the step computes
 *
 *   r3   = in1
 *   r2   = in0 + w3 * (l3 + r3)
 *   r1   = l3  + w2 * (l2 + r2)
 *   r0   = l2  + w1 * (l1 + r1)
 *   out0 = l1  + w0 * (l0 + r0)
 *   out1 = r0
 *
 * and stores r0..r3 back to buff as the next l0..l3. The weights w0..w3 are
 * looked up by name at the point of expansion.
 *
 * Every argument is evaluated exactly once (buff first, then in0, in1, and
 * finally the two output lvalues), and all temporaries are named v2x8_*_ so
 * that caller variables with short names such as x0, l1 or r0 can be passed
 * without being captured.
 */
#define VERT_2X8(in0, in1, out0, out1, buff) \
do { \
	float *const v2x8_state_ = (buff); \
\
	const __m256 v2x8_l0_ = _mm256_load_ps(v2x8_state_ + 0*8); \
	const __m256 v2x8_l1_ = _mm256_load_ps(v2x8_state_ + 1*8); \
	const __m256 v2x8_l2_ = _mm256_load_ps(v2x8_state_ + 2*8); \
	const __m256 v2x8_l3_ = _mm256_load_ps(v2x8_state_ + 3*8); \
\
	const __m256 v2x8_x0_ = (in0); \
	const __m256 v2x8_x1_ = (in1); \
\
	const __m256 v2x8_r3_ = v2x8_x1_; \
	const __m256 v2x8_r2_ = v2x8_x0_ + w3 * ( v2x8_l3_ + v2x8_r3_ ); \
	const __m256 v2x8_r1_ = v2x8_l3_ + w2 * ( v2x8_l2_ + v2x8_r2_ ); \
	const __m256 v2x8_r0_ = v2x8_l2_ + w1 * ( v2x8_l1_ + v2x8_r1_ ); \
	const __m256 v2x8_y0_ = v2x8_l1_ + w0 * ( v2x8_l0_ + v2x8_r0_ ); \
\
	(out0) = v2x8_y0_; \
	(out1) = v2x8_r0_; \
\
	_mm256_store_ps(v2x8_state_ + 0*8, v2x8_r0_); \
	_mm256_store_ps(v2x8_state_ + 1*8, v2x8_r1_); \
	_mm256_store_ps(v2x8_state_ + 2*8, v2x8_r2_); \
	_mm256_store_ps(v2x8_state_ + 3*8, v2x8_r3_); \
} while(0)

/*
 * Fill an 8x8 tile one pixel at a time.
 *
 * For every (x, y) in 0..7 x 0..7 the pixel at local + (x, y) is fetched
 * through imageptr_pixel_ext_limited2(input, ..., size) and written to
 * t[y*8 + x], i.e. the tile is stored row by row.
 * NOTE(review): imageptr_pixel_ext_limited2 presumably maps coordinates that
 * fall outside `size` back into the image - confirm against its definition.
 */
static
void load_8x8_limited(
	float *t, // float[8*8] == __m256[8]
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size
)
{
	for(int row = 0; row < 8; row++)
	{
		float *const row_out = t + row*8;

		for(int col = 0; col < 8; col++)
		{
			row_out[col] = *imageptr_pixel_ext_limited2(input, vec2_add(local, vec2_create(col, row)), size);
		}
	}
}

void load_core_save_8x8_fast(
	float *buffer_x,
	float *buffer_y,
	const struct imageptr_t *input,
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband
)
{
	// 8x8 block
	__m256 t[8];

	// load

	t[ 0] = _mm256_loadu_ps( imageptr_pixel_offset_0_0_const(input) );
	t[ 1] = _mm256_loadu_ps( imageptr_pixel_offset_0_1_const(input) );
	t[ 2] = _mm256_loadu_ps( imageptr_pixel_offset_0_2_const(input) );
	t[ 3] = _mm256_loadu_ps( imageptr_pixel_offset_0_3_const(input) );
	t[ 4] = _mm256_loadu_ps( imageptr_pixel_offset_0_4_const(input) );
	t[ 5] = _mm256_loadu_ps( imageptr_pixel_offset_0_5_const(input) );
	t[ 6] = _mm256_loadu_ps( imageptr_pixel_offset_0_6_const(input) );
	t[ 7] = _mm256_loadu_ps( imageptr_pixel_offset_0_7_const(input) );

	// core

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_y+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_y+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_y+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_y+ 0);

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_x+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_x+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_x+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_x+ 0);

	// save
	{
		static const __m128 const16_ll = { CDF97_S2_S, CDF97_S2_S, CDF97_S2_S, CDF97_S2_S };

		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[0], 0), _mm256_extractf128_ps(t[0], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[2], 0), _mm256_extractf128_ps(t[2], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[4], 0), _mm256_extractf128_ps(t[4], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[6], 0), _mm256_extractf128_ps(t[6], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;

		_mm_store_ps( imageptr_pixel_offset_0_0(llband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(llband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(llband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(llband), row3 );
	}

	{
		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[0], 0), _mm256_extractf128_ps(t[0], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[2], 0), _mm256_extractf128_ps(t[2], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[4], 0), _mm256_extractf128_ps(t[4], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[6], 0), _mm256_extractf128_ps(t[6], 1), _MM_SHUFFLE(3, 1, 3, 1));

		_mm_store_ps( imageptr_pixel_offset_0_0(hlband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hlband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hlband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hlband), row3 );
	}

	{
		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[1], 0), _mm256_extractf128_ps(t[1], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[3], 0), _mm256_extractf128_ps(t[3], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[5], 0), _mm256_extractf128_ps(t[5], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[7], 0), _mm256_extractf128_ps(t[7], 1), _MM_SHUFFLE(2, 0, 2, 0));

		_mm_store_ps( imageptr_pixel_offset_0_0(lhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(lhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(lhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(lhband), row3 );
	}

	{
		static const __m128 const16_hh = { CDF97_S1_S, CDF97_S1_S, CDF97_S1_S, CDF97_S1_S };

		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[1], 0), _mm256_extractf128_ps(t[1], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[3], 0), _mm256_extractf128_ps(t[3], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[5], 0), _mm256_extractf128_ps(t[5], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[7], 0), _mm256_extractf128_ps(t[7], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;

		_mm_store_ps( imageptr_pixel_offset_0_0(hhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hhband), row3 );
	}
}

void load_core_save_8x8_slow(
	float *buffer_x,
	float *buffer_y,
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size,
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband
)
{
	// 8x8 block
	__m256 t[8];

	// load

	load_8x8_limited(
		(float *)t,
		input,
		local,
		size
	);

	// core

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_y+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_y+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_y+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_y+ 0);

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_x+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_x+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_x+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_x+ 0);

	// save
	{
		static const __m128 const16_ll = { CDF97_S2_S, CDF97_S2_S, CDF97_S2_S, CDF97_S2_S };

		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[0], 0), _mm256_extractf128_ps(t[0], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[2], 0), _mm256_extractf128_ps(t[2], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[4], 0), _mm256_extractf128_ps(t[4], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[6], 0), _mm256_extractf128_ps(t[6], 1), _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;

		_mm_store_ps( imageptr_pixel_offset_0_0(llband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(llband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(llband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(llband), row3 );
	}

	{
		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[0], 0), _mm256_extractf128_ps(t[0], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[2], 0), _mm256_extractf128_ps(t[2], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[4], 0), _mm256_extractf128_ps(t[4], 1), _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[6], 0), _mm256_extractf128_ps(t[6], 1), _MM_SHUFFLE(3, 1, 3, 1));

		_mm_store_ps( imageptr_pixel_offset_0_0(hlband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hlband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hlband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hlband), row3 );
	}

	{
		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[1], 0), _mm256_extractf128_ps(t[1], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[3], 0), _mm256_extractf128_ps(t[3], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[5], 0), _mm256_extractf128_ps(t[5], 1), _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[7], 0), _mm256_extractf128_ps(t[7], 1), _MM_SHUFFLE(2, 0, 2, 0));

		_mm_store_ps( imageptr_pixel_offset_0_0(lhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(lhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(lhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(lhband), row3 );
	}

	{
		static const __m128 const16_hh = { CDF97_S1_S, CDF97_S1_S, CDF97_S1_S, CDF97_S1_S };

		const __m128 row0 = _mm_shuffle_ps(_mm256_extractf128_ps(t[1], 0), _mm256_extractf128_ps(t[1], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row1 = _mm_shuffle_ps(_mm256_extractf128_ps(t[3], 0), _mm256_extractf128_ps(t[3], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row2 = _mm_shuffle_ps(_mm256_extractf128_ps(t[5], 0), _mm256_extractf128_ps(t[5], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row3 = _mm_shuffle_ps(_mm256_extractf128_ps(t[7], 0), _mm256_extractf128_ps(t[7], 1), _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;

		_mm_store_ps( imageptr_pixel_offset_0_0(hhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hhband), row3 );
	}
}

void load_core_8x8_honly_fast(
	float *buffer_y,
	const struct imageptr_t *input
)
{
	// 8x8 block
	__m256 t[8];

	// load

	t[ 0] = _mm256_loadu_ps( imageptr_pixel_offset_0_0_const(input) );
	t[ 1] = _mm256_loadu_ps( imageptr_pixel_offset_0_1_const(input) );
	t[ 2] = _mm256_loadu_ps( imageptr_pixel_offset_0_2_const(input) );
	t[ 3] = _mm256_loadu_ps( imageptr_pixel_offset_0_3_const(input) );
	t[ 4] = _mm256_loadu_ps( imageptr_pixel_offset_0_4_const(input) );
	t[ 5] = _mm256_loadu_ps( imageptr_pixel_offset_0_5_const(input) );
	t[ 6] = _mm256_loadu_ps( imageptr_pixel_offset_0_6_const(input) );
	t[ 7] = _mm256_loadu_ps( imageptr_pixel_offset_0_7_const(input) );

	// core

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_y+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_y+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_y+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_y+ 0);
}

void load_core_8x8_honly_slow(
	float *buffer_y,
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size
)
{
	// 8x8 block
	__m256 t[8];

	// load

	load_8x8_limited(
		(float *)t,
		input,
		local,
		size
	);

	// core

	_MM256_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7]);

	VERT_2X8(t[ 0], t[ 1], t[ 0], t[ 1], buffer_y+ 0);
	VERT_2X8(t[ 2], t[ 3], t[ 2], t[ 3], buffer_y+ 0);
	VERT_2X8(t[ 4], t[ 5], t[ 4], t[ 5], buffer_y+ 0);
	VERT_2X8(t[ 6], t[ 7], t[ 6], t[ 7], buffer_y+ 0);
}

/*
 * Re-interleave four 4x4 sub-band tiles into one 8x8 block of `debug`.
 *
 * For each band row k (0..3), two rows of the output block are written:
 *   row 2k   : ll[k] and hl[k] interleaved lane by lane (ll first)
 *   row 2k+1 : lh[k] and hh[k] interleaved lane by lane (lh first)
 * Each output row is stored as two 4-float halves at the addresses given by
 * imageptr_pixel(debug, vec2_offset_0_R(local)) and
 * imageptr_pixel(debug, vec2_offset_4_R(local)).
 * NOTE(review): vec2_offset_X_Y(local) is presumably local + (X, Y) - confirm.
 *
 * Band rows are read with aligned loads (16-byte alignment required); the
 * debug image is written with unaligned stores.
 */
void save_8x8_debug(
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband,
	struct imageptr_t *debug,
	const struct vec2_t local
)
{
	// band rows 0 and 1 -> debug rows 0..3
	{
		const __m128 ll_a = _mm_load_ps( imageptr_pixel_offset_0_0(llband) );
		const __m128 ll_b = _mm_load_ps( imageptr_pixel_offset_0_1(llband) );
		const __m128 hl_a = _mm_load_ps( imageptr_pixel_offset_0_0(hlband) );
		const __m128 hl_b = _mm_load_ps( imageptr_pixel_offset_0_1(hlband) );
		const __m128 lh_a = _mm_load_ps( imageptr_pixel_offset_0_0(lhband) );
		const __m128 lh_b = _mm_load_ps( imageptr_pixel_offset_0_1(lhband) );
		const __m128 hh_a = _mm_load_ps( imageptr_pixel_offset_0_0(hhband) );
		const __m128 hh_b = _mm_load_ps( imageptr_pixel_offset_0_1(hhband) );

		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_0(local)), _mm_unpacklo_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_0(local)), _mm_unpackhi_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_1(local)), _mm_unpacklo_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_1(local)), _mm_unpackhi_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_2(local)), _mm_unpacklo_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_2(local)), _mm_unpackhi_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_3(local)), _mm_unpacklo_ps(lh_b, hh_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_3(local)), _mm_unpackhi_ps(lh_b, hh_b) );
	}

	// band rows 2 and 3 -> debug rows 4..7
	{
		const __m128 ll_a = _mm_load_ps( imageptr_pixel_offset_0_2(llband) );
		const __m128 ll_b = _mm_load_ps( imageptr_pixel_offset_0_3(llband) );
		const __m128 hl_a = _mm_load_ps( imageptr_pixel_offset_0_2(hlband) );
		const __m128 hl_b = _mm_load_ps( imageptr_pixel_offset_0_3(hlband) );
		const __m128 lh_a = _mm_load_ps( imageptr_pixel_offset_0_2(lhband) );
		const __m128 lh_b = _mm_load_ps( imageptr_pixel_offset_0_3(lhband) );
		const __m128 hh_a = _mm_load_ps( imageptr_pixel_offset_0_2(hhband) );
		const __m128 hh_b = _mm_load_ps( imageptr_pixel_offset_0_3(hhband) );

		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_4(local)), _mm_unpacklo_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_4(local)), _mm_unpackhi_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_5(local)), _mm_unpacklo_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_5(local)), _mm_unpackhi_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_6(local)), _mm_unpacklo_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_6(local)), _mm_unpackhi_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_7(local)), _mm_unpacklo_ps(lh_b, hh_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_7(local)), _mm_unpackhi_ps(lh_b, hh_b) );
	}
}
