#include "system.h"
#include <xmmintrin.h>

/*
 * Lifting weights, each replicated into all four lanes of an __m128.
 * The VERT_* macros below apply them in the order w3, w2, w1, w0.
 * NOTE(review): the CDF97_* constants are not defined in this file; judging
 * by their names they are the predict (P1, P2) and update (U1, U2)
 * coefficients of a CDF 9/7 lifting scheme -- confirm against their definition.
 */
static const __m128 w0 = { +CDF97_U2_S, +CDF97_U2_S, +CDF97_U2_S, +CDF97_U2_S };
static const __m128 w1 = { +CDF97_P2_S, +CDF97_P2_S, +CDF97_P2_S, +CDF97_P2_S };
static const __m128 w2 = { +CDF97_U1_S, +CDF97_U1_S, +CDF97_U1_S, +CDF97_U1_S };
static const __m128 w3 = { +CDF97_P1_S, +CDF97_P1_S, +CDF97_P1_S, +CDF97_P1_S };

// 8x8 block of floats stored as sixteen __m128: each bracket below is the array index of one 4-float half-row
// [   0   ] [   1   ]
// [   2   ] [   3   ]
// [   4   ] [   5   ]
// [   6   ] [   7   ]
// [   8   ] [   9   ]
// [  10   ] [  11   ]
// [  12   ] [  13   ]
// [  14   ] [  15   ]

/*
 * Transpose, in place, the 8x8 float matrix held in the sixteen __m128
 * lvalues row0..rowF. Matrix row r is the pair (row[2r], row[2r+1]).
 *
 * Each 4x4 quadrant is transposed with _MM_TRANSPOSE4_PS, after which the two
 * off-diagonal quadrants trade places. All sixteen arguments are read before
 * any of them is written; each argument is evaluated twice, so pass plain
 * lvalues without side effects.
 */
#define _MM_TRANSPOSE8_PS(row0, row1, row2, row3, row4, row5, row6, row7, row8, row9, rowA, rowB, rowC, rowD, rowE, rowF) \
do { \
	/* tl/tr/bl/br = top-left, top-right, bottom-left, bottom-right quadrant */ \
	__m128 tl0 = (row0), tl1 = (row2), tl2 = (row4), tl3 = (row6); \
	__m128 tr0 = (row1), tr1 = (row3), tr2 = (row5), tr3 = (row7); \
	__m128 bl0 = (row8), bl1 = (rowA), bl2 = (rowC), bl3 = (rowE); \
	__m128 br0 = (row9), br1 = (rowB), br2 = (rowD), br3 = (rowF); \
\
	_MM_TRANSPOSE4_PS(tl0, tl1, tl2, tl3); \
	_MM_TRANSPOSE4_PS(tr0, tr1, tr2, tr3); \
	_MM_TRANSPOSE4_PS(bl0, bl1, bl2, bl3); \
	_MM_TRANSPOSE4_PS(br0, br1, br2, br3); \
\
	/* quadrants on the diagonal keep their position */ \
	(row0) = tl0; (row2) = tl1; (row4) = tl2; (row6) = tl3; \
	(row9) = br0; (rowB) = br1; (rowD) = br2; (rowF) = br3; \
\
	/* quadrants off the diagonal are exchanged */ \
	(row1) = bl0; (row3) = bl1; (row5) = bl2; (row7) = bl3; \
	(row8) = tr0; (rowA) = tr1; (rowC) = tr2; (rowE) = tr3; \
} while(0)

/*
 * One lifting step on a pair of 4-lane vectors, with the filter state kept
 * in memory.
 *
 * in0, in1   - the two input vectors; both are read before any output is
 *              written, so out0/out1 may name the same lvalues
 * out0, out1 - receive the results: out0 is the value of the last (w0) stage,
 *              out1 the value of the w1 stage
 * buff       - 16 floats of state (four vectors); accessed with
 *              _mm_load_ps/_mm_store_ps, hence must be 16-byte aligned
 *
 * The stages are chained w3 -> w2 -> w1 -> w0. The values computed here
 * replace the state in buff and are consumed by the next invocation.
 */
#define VERT_2X4(in0, in1, out0, out1, buff) \
do { \
	/* state left behind by the previous invocation */ \
	const __m128 st0 = _mm_load_ps((buff) +  0); \
	const __m128 st1 = _mm_load_ps((buff) +  4); \
	const __m128 st2 = _mm_load_ps((buff) +  8); \
	const __m128 st3 = _mm_load_ps((buff) + 12); \
\
	/* snapshot both inputs before any output is touched */ \
	const __m128 cur = (in0); \
	const __m128 nx3 = (in1); \
\
	const __m128 nx2 = cur + w3 * ( st3 + nx3 ); \
	const __m128 nx1 = st3 + w2 * ( st2 + nx2 ); \
	const __m128 nx0 = st2 + w1 * ( st1 + nx1 ); \
\
	(out0) = st1 + w0 * ( st0 + nx0 ); \
	(out1) = nx0; \
\
	_mm_store_ps((buff) +  0, nx0); \
	_mm_store_ps((buff) +  4, nx1); \
	_mm_store_ps((buff) +  8, nx2); \
	_mm_store_ps((buff) + 12, nx3); \
} while(0)

/**
 * @note Same arithmetic as VERT_2X4, minus the memory traffic: the state is
 * passed in the lvalues l0..l3 and updated in place, and the two results
 * overwrite x0 and x1. The caller performs the buffer accesses that VERT_2X4
 * does internally, i.e. before the first use
 *
 *	l0 = _mm_load_ps((buff)+0*4);
 *	l1 = _mm_load_ps((buff)+1*4);
 *	l2 = _mm_load_ps((buff)+2*4);
 *	l3 = _mm_load_ps((buff)+3*4);
 *
 * and after the last use
 *
 *	_mm_store_ps((buff)+0*4, l0);
 *	_mm_store_ps((buff)+1*4, l1);
 *	_mm_store_ps((buff)+2*4, l2);
 *	_mm_store_ps((buff)+3*4, l3);
 */
#define VERT_2X4_FAST(x0, x1, l0, l1, l2, l3) \
do { \
	/* stage values, computed from the last stage (w3) down to the first */ \
	const __m128 nx3 = (x1); \
	const __m128 nx2 = (x0) + w3 * ( (l3) + nx3 ); \
	const __m128 nx1 = (l3) + w2 * ( (l2) + nx2 ); \
	const __m128 nx0 = (l2) + w1 * ( (l1) + nx1 ); \
\
	/* results; x0 needs the old l0/l1, so it is written before the state */ \
	(x0) = (l1) + w0 * ( (l0) + nx0 ); \
	(x1) = nx0; \
\
	/* the stage values become the state for the next step */ \
	(l0) = nx0; \
	(l1) = nx1; \
	(l2) = nx2; \
	(l3) = nx3; \
} while(0)

/*
 * Apply VERT_2X4_FAST to the four vector pairs (x0,x1), (x2,x3), (x4,x5) and
 * (x6,x7), in that order, threading the same state lvalues l0..l3 through all
 * four steps. Each pair is overwritten with its results; l0..l3 end up holding
 * the state left by the last step.
 */
#define VERT_8X4_FAST(x0, x1, x2, x3, x4, x5, x6, x7, l0, l1, l2, l3) \
do { \
	VERT_2X4_FAST((x0), (x1), (l0), (l1), (l2), (l3)); \
	VERT_2X4_FAST((x2), (x3), (l0), (l1), (l2), (l3)); \
	VERT_2X4_FAST((x4), (x5), (l0), (l1), (l2), (l3)); \
	VERT_2X4_FAST((x6), (x7), (l0), (l1), (l2), (l3)); \
} while(0)

/**
 * Gather an 8x8 block of pixels into a contiguous, row-major float array.
 *
 * @param t      destination, 64 floats (callers pass an __m128[16])
 * @param input  image the pixels are read from
 * @param local  position of the block; (x,y) with x,y in 0..7 is added to it
 * @param size   passed through unchanged to imageptr_pixel_ext_limited2()
 *
 * Every pixel goes through imageptr_pixel_ext_limited2().
 * NOTE(review): presumably that helper remaps coordinates falling outside
 * `size` -- confirm against its definition.
 */
static
void load_8x8_limited(
	float *t, // float[8*8] == __m128[16]
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size
)
{
	// walks the destination linearly: 8 rows of 8 floats
	float *dst = t;

	for(int row = 0; row < 8; row++)
	{
		for(int col = 0; col < 8; col++)
		{
			*dst++ = *imageptr_pixel_ext_limited2(input, vec2_add(local, vec2_create(col, row)), size);
		}
	}
}

void load_core_save_8x8_fast(
	float *buffer_x,
	float *buffer_y,
	const struct imageptr_t *input,
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband
)
{
	// 8x8 block
	__m128 t[16];

	// load

	t[ 0] = _mm_loadu_ps( imageptr_pixel_offset_0_0_const(input) ); t[ 1] = _mm_loadu_ps( imageptr_pixel_offset_4_0_const(input) );
	t[ 2] = _mm_loadu_ps( imageptr_pixel_offset_0_1_const(input) ); t[ 3] = _mm_loadu_ps( imageptr_pixel_offset_4_1_const(input) );
	t[ 4] = _mm_loadu_ps( imageptr_pixel_offset_0_2_const(input) ); t[ 5] = _mm_loadu_ps( imageptr_pixel_offset_4_2_const(input) );
	t[ 6] = _mm_loadu_ps( imageptr_pixel_offset_0_3_const(input) ); t[ 7] = _mm_loadu_ps( imageptr_pixel_offset_4_3_const(input) );
	t[ 8] = _mm_loadu_ps( imageptr_pixel_offset_0_4_const(input) ); t[ 9] = _mm_loadu_ps( imageptr_pixel_offset_4_4_const(input) );
	t[10] = _mm_loadu_ps( imageptr_pixel_offset_0_5_const(input) ); t[11] = _mm_loadu_ps( imageptr_pixel_offset_4_5_const(input) );
	t[12] = _mm_loadu_ps( imageptr_pixel_offset_0_6_const(input) ); t[13] = _mm_loadu_ps( imageptr_pixel_offset_4_6_const(input) );
	t[14] = _mm_loadu_ps( imageptr_pixel_offset_0_7_const(input) ); t[15] = _mm_loadu_ps( imageptr_pixel_offset_4_7_const(input) );

	// core
#ifndef CONFIG_CORE_DISABLE_DWT

#if 1
	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	{
		__m128 l[8];

		l[0] = _mm_load_ps((buffer_y+ 0)+0*4);
		l[1] = _mm_load_ps((buffer_y+ 0)+1*4);
		l[2] = _mm_load_ps((buffer_y+ 0)+2*4);
		l[3] = _mm_load_ps((buffer_y+ 0)+3*4);

		VERT_8X4_FAST(t[ 0], t[ 2], t[ 4], t[ 6], t[ 8], t[10], t[12], t[14], l[0], l[1], l[2], l[3]);

		_mm_store_ps((buffer_y+ 0)+0*4, l[0]);
		_mm_store_ps((buffer_y+ 0)+1*4, l[1]);
		_mm_store_ps((buffer_y+ 0)+2*4, l[2]);
		_mm_store_ps((buffer_y+ 0)+3*4, l[3]);

		l[4] = _mm_load_ps((buffer_y+16)+0*4);
		l[5] = _mm_load_ps((buffer_y+16)+1*4);
		l[6] = _mm_load_ps((buffer_y+16)+2*4);
		l[7] = _mm_load_ps((buffer_y+16)+3*4);

		VERT_8X4_FAST(t[ 1], t[ 3], t[ 5], t[ 7], t[ 9], t[11], t[13], t[15], l[4], l[5], l[6], l[7]);

		_mm_store_ps((buffer_y+16)+0*4, l[4]);
		_mm_store_ps((buffer_y+16)+1*4, l[5]);
		_mm_store_ps((buffer_y+16)+2*4, l[6]);
		_mm_store_ps((buffer_y+16)+3*4, l[7]);
	}

	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	{
		__m128 l[8];

		l[0] = _mm_load_ps((buffer_x+ 0)+0*4);
		l[1] = _mm_load_ps((buffer_x+ 0)+1*4);
		l[2] = _mm_load_ps((buffer_x+ 0)+2*4);
		l[3] = _mm_load_ps((buffer_x+ 0)+3*4);

		VERT_8X4_FAST(t[ 0], t[ 2], t[ 4], t[ 6], t[ 8], t[10], t[12], t[14], l[0], l[1], l[2], l[3]);

		_mm_store_ps((buffer_x+ 0)+0*4, l[0]);
		_mm_store_ps((buffer_x+ 0)+1*4, l[1]);
		_mm_store_ps((buffer_x+ 0)+2*4, l[2]);
		_mm_store_ps((buffer_x+ 0)+3*4, l[3]);

		l[4] = _mm_load_ps((buffer_x+16)+0*4);
		l[5] = _mm_load_ps((buffer_x+16)+1*4);
		l[6] = _mm_load_ps((buffer_x+16)+2*4);
		l[7] = _mm_load_ps((buffer_x+16)+3*4);

		VERT_8X4_FAST(t[ 1], t[ 3], t[ 5], t[ 7], t[ 9], t[11], t[13], t[15], l[4], l[5], l[6], l[7]);

		_mm_store_ps((buffer_x+16)+0*4, l[4]);
		_mm_store_ps((buffer_x+16)+1*4, l[5]);
		_mm_store_ps((buffer_x+16)+2*4, l[6]);
		_mm_store_ps((buffer_x+16)+3*4, l[7]);
	}
#endif

#if 0
	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_y+ 0); VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_y+16);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_y+ 0); VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_y+16);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_y+ 0); VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_y+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_y+ 0); VERT_2X4(t[13], t[15], t[13], t[15], buffer_y+16);

	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_x+ 0); VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_x+16);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_x+ 0); VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_x+16);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_x+ 0); VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_x+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_x+ 0); VERT_2X4(t[13], t[15], t[13], t[15], buffer_x+16);
#endif

#if 0
	__m128 *mmbuffer_y = (__m128 *)buffer_y;
	__m128 *mmbuffer_x = (__m128 *)buffer_x;

	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	VERT_2X4_FAST(t[ 0], t[ 2], mmbuffer_y[0], mmbuffer_y[1], mmbuffer_y[2], mmbuffer_y[3]);

	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	VERT_2X4_FAST(t[ 1], t[ 3], mmbuffer_y[0], mmbuffer_y[1], mmbuffer_y[2], mmbuffer_y[3]);
	VERT_2X4_FAST(t[ 5], t[ 7], mmbuffer_y[0], mmbuffer_y[1], mmbuffer_y[2], mmbuffer_y[3]);

	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	VERT_2X4_FAST(t[ 8], t[10], mmbuffer_y[4], mmbuffer_y[5], mmbuffer_y[6], mmbuffer_y[7]);
	VERT_2X4_FAST(t[12], t[14], mmbuffer_y[4], mmbuffer_y[5], mmbuffer_y[6], mmbuffer_y[7]);

	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);
	VERT_2X4_FAST(t[ 9], t[11], mmbuffer_y[4], mmbuffer_y[5], mmbuffer_y[6], mmbuffer_y[7]);
	VERT_2X4_FAST(t[13], t[15], mmbuffer_y[4], mmbuffer_y[5], mmbuffer_y[6], mmbuffer_y[7]);

	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	VERT_2X4_FAST(t[ 0], t[ 2], mmbuffer_x[0], mmbuffer_x[1], mmbuffer_x[2], mmbuffer_x[3]);
	VERT_2X4_FAST(t[ 4], t[ 6], mmbuffer_x[0], mmbuffer_x[1], mmbuffer_x[2], mmbuffer_x[3]);

	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	VERT_2X4_FAST(t[ 8], t[10], mmbuffer_x[0], mmbuffer_x[1], mmbuffer_x[2], mmbuffer_x[3]);
	VERT_2X4_FAST(t[12], t[14], mmbuffer_x[0], mmbuffer_x[1], mmbuffer_x[2], mmbuffer_x[3]);

	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	VERT_2X4_FAST(t[ 1], t[ 3], mmbuffer_x[4], mmbuffer_x[5], mmbuffer_x[6], mmbuffer_x[7]);
	VERT_2X4_FAST(t[ 5], t[ 7], mmbuffer_x[4], mmbuffer_x[5], mmbuffer_x[6], mmbuffer_x[7]);

	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);
	VERT_2X4_FAST(t[ 9], t[11], mmbuffer_x[4], mmbuffer_x[5], mmbuffer_x[6], mmbuffer_x[7]);
	VERT_2X4_FAST(t[13], t[15], mmbuffer_x[4], mmbuffer_x[5], mmbuffer_x[6], mmbuffer_x[7]);
#endif

#if 0
	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_y+ 0);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_y+ 0);

	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_y+ 0);
	VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_y+ 0);

	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_y+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_y+16);

	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);
	VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_y+16);
	VERT_2X4(t[13], t[15], t[13], t[15], buffer_y+16);

	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_x+ 0);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_x+ 0);

	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_x+ 0);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_x+ 0);

	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_x+16);
	VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_x+16);

	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);
	VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_x+16);
	VERT_2X4(t[13], t[15], t[13], t[15], buffer_x+16);
#endif

#if 0
	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_y+ 0);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_y+ 0);
	VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_y+ 0);
	VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_y+ 0);

	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_y+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_y+16);
	VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_y+16);
	VERT_2X4(t[13], t[15], t[13], t[15], buffer_y+16);

	_MM_TRANSPOSE4_PS(t[ 0], t[ 2], t[ 4], t[ 6]);
	_MM_TRANSPOSE4_PS(t[ 8], t[10], t[12], t[14]);
	_MM_TRANSPOSE4_PS(t[ 1], t[ 3], t[ 5], t[ 7]);
	_MM_TRANSPOSE4_PS(t[ 9], t[11], t[13], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_x+ 0);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_x+ 0);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_x+ 0);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_x+ 0);

	VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_x+16);
	VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_x+16);
	VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_x+16);
	VERT_2X4(t[13], t[15], t[13], t[15], buffer_x+16);
#endif
#else /* CONFIG_CORE_DISABLE_DWT */
	UNUSED(buffer_x);
	UNUSED(buffer_y);
#endif /* CONFIG_CORE_DISABLE_DWT */

	// save

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		static const __m128 const16_ll = { CDF97_S2_S, CDF97_S2_S, CDF97_S2_S, CDF97_S2_S };

		const __m128 row0 = _mm_shuffle_ps(t[ 0], t[ 1], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row1 = _mm_shuffle_ps(t[ 4], t[ 5], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row2 = _mm_shuffle_ps(t[ 8], t[ 9], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row3 = _mm_shuffle_ps(t[12], t[13], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
#else /* CONFIG_CORE_DISABLE_DWT */
		// wire the input directly to the output
		const __m128 row0 = t[ 0];
		const __m128 row1 = t[ 4];
		const __m128 row2 = t[ 8];
		const __m128 row3 = t[12];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(llband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(llband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(llband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(llband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		const __m128 row0 = _mm_shuffle_ps(t[ 0], t[ 1], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row1 = _mm_shuffle_ps(t[ 4], t[ 5], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row2 = _mm_shuffle_ps(t[ 8], t[ 9], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row3 = _mm_shuffle_ps(t[12], t[13], _MM_SHUFFLE(3, 1, 3, 1));
#else /* CONFIG_CORE_DISABLE_DWT */
		// wire the input directly to the output
		const __m128 row0 = t[ 1];
		const __m128 row1 = t[ 5];
		const __m128 row2 = t[ 9];
		const __m128 row3 = t[13];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(hlband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hlband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hlband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hlband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		const __m128 row0 = _mm_shuffle_ps(t[ 2], t[ 3], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row1 = _mm_shuffle_ps(t[ 6], t[ 7], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row2 = _mm_shuffle_ps(t[10], t[11], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row3 = _mm_shuffle_ps(t[14], t[15], _MM_SHUFFLE(2, 0, 2, 0));
#else /* CONFIG_CORE_DISABLE_DWT */
		// wire the input directly to the output
		const __m128 row0 = t[ 2];
		const __m128 row1 = t[ 6];
		const __m128 row2 = t[10];
		const __m128 row3 = t[14];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(lhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(lhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(lhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(lhband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		static const __m128 const16_hh = { CDF97_S1_S, CDF97_S1_S, CDF97_S1_S, CDF97_S1_S };

		const __m128 row0 = _mm_shuffle_ps(t[ 2], t[ 3], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row1 = _mm_shuffle_ps(t[ 6], t[ 7], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row2 = _mm_shuffle_ps(t[10], t[11], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row3 = _mm_shuffle_ps(t[14], t[15], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
#else /* CONFIG_CORE_DISABLE_DWT */
		// wire the input directly to the output
		const __m128 row0 = t[ 3];
		const __m128 row1 = t[ 7];
		const __m128 row2 = t[11];
		const __m128 row3 = t[15];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(hhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hhband), row3 );
	}
}

void load_core_save_8x8_slow(
	float *buffer_x,
	float *buffer_y,
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size,
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband
)
{
	// 8x8 block
	__m128 t[16];

	// load
	load_8x8_limited(
		(float *)t,
		input,
		local,
		size
	);

	// core
#ifndef CONFIG_CORE_DISABLE_DWT
	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_y+ 0); VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_y+16);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_y+ 0); VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_y+16);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_y+ 0); VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_y+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_y+ 0); VERT_2X4(t[13], t[15], t[13], t[15], buffer_y+16);

	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	VERT_2X4(t[ 0], t[ 2], t[ 0], t[ 2], buffer_x+ 0); VERT_2X4(t[ 1], t[ 3], t[ 1], t[ 3], buffer_x+16);
	VERT_2X4(t[ 4], t[ 6], t[ 4], t[ 6], buffer_x+ 0); VERT_2X4(t[ 5], t[ 7], t[ 5], t[ 7], buffer_x+16);
	VERT_2X4(t[ 8], t[10], t[ 8], t[10], buffer_x+ 0); VERT_2X4(t[ 9], t[11], t[ 9], t[11], buffer_x+16);
	VERT_2X4(t[12], t[14], t[12], t[14], buffer_x+ 0); VERT_2X4(t[13], t[15], t[13], t[15], buffer_x+16);
#else /* CONFIG_CORE_DISABLE_DWT */
	UNUSED(buffer_x);
	UNUSED(buffer_y);
#endif /* CONFIG_CORE_DISABLE_DWT */

	// save

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		static const __m128 const16_ll = { CDF97_S2_S, CDF97_S2_S, CDF97_S2_S, CDF97_S2_S };

		const __m128 row0 = _mm_shuffle_ps(t[ 0], t[ 1], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row1 = _mm_shuffle_ps(t[ 4], t[ 5], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row2 = _mm_shuffle_ps(t[ 8], t[ 9], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
		const __m128 row3 = _mm_shuffle_ps(t[12], t[13], _MM_SHUFFLE(2, 0, 2, 0)) * const16_ll;
#else /* CONFIG_CORE_DISABLE_DWT */
		const __m128 row0 = t[ 0];
		const __m128 row1 = t[ 4];
		const __m128 row2 = t[ 8];
		const __m128 row3 = t[12];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(llband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(llband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(llband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(llband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		const __m128 row0 = _mm_shuffle_ps(t[ 0], t[ 1], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row1 = _mm_shuffle_ps(t[ 4], t[ 5], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row2 = _mm_shuffle_ps(t[ 8], t[ 9], _MM_SHUFFLE(3, 1, 3, 1));
		const __m128 row3 = _mm_shuffle_ps(t[12], t[13], _MM_SHUFFLE(3, 1, 3, 1));
#else /* CONFIG_CORE_DISABLE_DWT */
		const __m128 row0 = t[ 1];
		const __m128 row1 = t[ 5];
		const __m128 row2 = t[ 9];
		const __m128 row3 = t[13];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(hlband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hlband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hlband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hlband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		const __m128 row0 = _mm_shuffle_ps(t[ 2], t[ 3], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row1 = _mm_shuffle_ps(t[ 6], t[ 7], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row2 = _mm_shuffle_ps(t[10], t[11], _MM_SHUFFLE(2, 0, 2, 0));
		const __m128 row3 = _mm_shuffle_ps(t[14], t[15], _MM_SHUFFLE(2, 0, 2, 0));
#else /* CONFIG_CORE_DISABLE_DWT */
		const __m128 row0 = t[ 2];
		const __m128 row1 = t[ 6];
		const __m128 row2 = t[10];
		const __m128 row3 = t[14];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(lhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(lhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(lhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(lhband), row3 );
	}

	{
#ifndef CONFIG_CORE_DISABLE_DWT
		static const __m128 const16_hh = { CDF97_S1_S, CDF97_S1_S, CDF97_S1_S, CDF97_S1_S };

		const __m128 row0 = _mm_shuffle_ps(t[ 2], t[ 3], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row1 = _mm_shuffle_ps(t[ 6], t[ 7], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row2 = _mm_shuffle_ps(t[10], t[11], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
		const __m128 row3 = _mm_shuffle_ps(t[14], t[15], _MM_SHUFFLE(3, 1, 3, 1)) * const16_hh;
#else /* CONFIG_CORE_DISABLE_DWT */
		const __m128 row0 = t[ 3];
		const __m128 row1 = t[ 7];
		const __m128 row2 = t[11];
		const __m128 row3 = t[15];
#endif /* CONFIG_CORE_DISABLE_DWT */
		_mm_store_ps( imageptr_pixel_offset_0_0(hhband), row0 );
		_mm_store_ps( imageptr_pixel_offset_0_1(hhband), row1 );
		_mm_store_ps( imageptr_pixel_offset_0_2(hhband), row2 );
		_mm_store_ps( imageptr_pixel_offset_0_3(hhband), row3 );
	}
}

/**
 * Load an 8x8 block with direct unaligned vector loads and run a single
 * lifting pass on it.
 *
 * The block is transposed and filtered with the state in buffer_y. The
 * filtered block is not stored anywhere; the only lasting effect is the
 * updated state in buffer_y.
 *
 * @param buffer_y  float[32] of filter state; accessed with
 *                  _mm_load_ps/_mm_store_ps, so it must be 16-byte aligned
 * @param input     source; read at offsets (0..4, 0..7) with _mm_loadu_ps
 *
 * With CONFIG_CORE_DISABLE_DWT defined the function only performs the loads.
 */
void load_core_8x8_honly_fast(
	float *buffer_y,
	const struct imageptr_t *input
)
{
	// the block: row r is the pair t[2r], t[2r+1]
	__m128 t[16];

	// row 0
	t[ 0] = _mm_loadu_ps( imageptr_pixel_offset_0_0_const(input) );
	t[ 1] = _mm_loadu_ps( imageptr_pixel_offset_4_0_const(input) );
	// row 1
	t[ 2] = _mm_loadu_ps( imageptr_pixel_offset_0_1_const(input) );
	t[ 3] = _mm_loadu_ps( imageptr_pixel_offset_4_1_const(input) );
	// row 2
	t[ 4] = _mm_loadu_ps( imageptr_pixel_offset_0_2_const(input) );
	t[ 5] = _mm_loadu_ps( imageptr_pixel_offset_4_2_const(input) );
	// row 3
	t[ 6] = _mm_loadu_ps( imageptr_pixel_offset_0_3_const(input) );
	t[ 7] = _mm_loadu_ps( imageptr_pixel_offset_4_3_const(input) );
	// row 4
	t[ 8] = _mm_loadu_ps( imageptr_pixel_offset_0_4_const(input) );
	t[ 9] = _mm_loadu_ps( imageptr_pixel_offset_4_4_const(input) );
	// row 5
	t[10] = _mm_loadu_ps( imageptr_pixel_offset_0_5_const(input) );
	t[11] = _mm_loadu_ps( imageptr_pixel_offset_4_5_const(input) );
	// row 6
	t[12] = _mm_loadu_ps( imageptr_pixel_offset_0_6_const(input) );
	t[13] = _mm_loadu_ps( imageptr_pixel_offset_4_6_const(input) );
	// row 7
	t[14] = _mm_loadu_ps( imageptr_pixel_offset_0_7_const(input) );
	t[15] = _mm_loadu_ps( imageptr_pixel_offset_4_7_const(input) );

#ifndef CONFIG_CORE_DISABLE_DWT
	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	// per group of four vectors: even-indexed pair first (state at
	// buffer_y+0), then the odd-indexed pair (state at buffer_y+16)
	for(int k = 0; k < 16; k += 4)
	{
		VERT_2X4(t[k+0], t[k+2], t[k+0], t[k+2], buffer_y+ 0);
		VERT_2X4(t[k+1], t[k+3], t[k+1], t[k+3], buffer_y+16);
	}
#else /* CONFIG_CORE_DISABLE_DWT */
	UNUSED(t);
	UNUSED(buffer_y);
#endif /* CONFIG_CORE_DISABLE_DWT */
}

/**
 * Gather an 8x8 block pixel by pixel through load_8x8_limited() and run a
 * single lifting pass on it.
 *
 * The block is transposed and filtered with the state in buffer_y. The
 * filtered block is not stored anywhere; the only lasting effect is the
 * updated state in buffer_y.
 *
 * @param buffer_y  float[32] of filter state; accessed with
 *                  _mm_load_ps/_mm_store_ps, so it must be 16-byte aligned
 * @param input     source image
 * @param local     block position, forwarded to load_8x8_limited()
 * @param size      forwarded to load_8x8_limited()
 */
void load_core_8x8_honly_slow(
	float *buffer_y,
	const struct imageptr_t *input,
	const struct vec2_t local,
	const struct vec2_t size
)
{
	// the block: row r is the pair t[2r], t[2r+1]
	__m128 t[16];

	load_8x8_limited((float *)t, input, local, size);

#ifndef CONFIG_CORE_DISABLE_DWT
	_MM_TRANSPOSE8_PS(t[ 0], t[ 1], t[ 2], t[ 3], t[ 4], t[ 5], t[ 6], t[ 7], t[ 8], t[ 9], t[10], t[11], t[12], t[13], t[14], t[15]);

	// per group of four vectors: even-indexed pair first (state at
	// buffer_y+0), then the odd-indexed pair (state at buffer_y+16)
	for(int k = 0; k < 16; k += 4)
	{
		VERT_2X4(t[k+0], t[k+2], t[k+0], t[k+2], buffer_y+ 0);
		VERT_2X4(t[k+1], t[k+3], t[k+1], t[k+3], buffer_y+16);
	}
#else /* CONFIG_CORE_DISABLE_DWT */
	UNUSED(buffer_y);
#endif /* CONFIG_CORE_DISABLE_DWT */
}

/**
 * Re-interleave the four 4x4 subband tiles into one 8x8 block of `debug`.
 *
 * Subband row k (k = 0..3) produces two rows of the block: row 2k alternates
 * samples of llband and hlband, row 2k+1 alternates samples of lhband and
 * hhband.
 *
 * @param llband  source tiles; rows are read with _mm_load_ps, so the row
 * @param hlband  addresses must be 16-byte aligned
 * @param lhband  "
 * @param hhband  "
 * @param debug   destination image, written with unaligned stores
 * @param local   position of the 8x8 block inside debug
 */
void save_8x8_debug(
	struct imageptr_t *llband,
	struct imageptr_t *hlband,
	struct imageptr_t *lhband,
	struct imageptr_t *hhband,
	struct imageptr_t *debug,
	const struct vec2_t local
)
{
	// block rows 0-3, built from subband rows 0 and 1
	{
		const __m128 ll_a = _mm_load_ps( imageptr_pixel_offset_0_0(llband) );
		const __m128 ll_b = _mm_load_ps( imageptr_pixel_offset_0_1(llband) );
		const __m128 hl_a = _mm_load_ps( imageptr_pixel_offset_0_0(hlband) );
		const __m128 hl_b = _mm_load_ps( imageptr_pixel_offset_0_1(hlband) );
		const __m128 lh_a = _mm_load_ps( imageptr_pixel_offset_0_0(lhband) );
		const __m128 lh_b = _mm_load_ps( imageptr_pixel_offset_0_1(lhband) );
		const __m128 hh_a = _mm_load_ps( imageptr_pixel_offset_0_0(hhband) );
		const __m128 hh_b = _mm_load_ps( imageptr_pixel_offset_0_1(hhband) );

		// unpacklo/unpackhi interleave the low/high halves of two vectors
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_0(local)), _mm_unpacklo_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_0(local)), _mm_unpackhi_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_1(local)), _mm_unpacklo_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_1(local)), _mm_unpackhi_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_2(local)), _mm_unpacklo_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_2(local)), _mm_unpackhi_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_3(local)), _mm_unpacklo_ps(lh_b, hh_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_3(local)), _mm_unpackhi_ps(lh_b, hh_b) );
	}

	// block rows 4-7, built from subband rows 2 and 3
	{
		const __m128 ll_a = _mm_load_ps( imageptr_pixel_offset_0_2(llband) );
		const __m128 ll_b = _mm_load_ps( imageptr_pixel_offset_0_3(llband) );
		const __m128 hl_a = _mm_load_ps( imageptr_pixel_offset_0_2(hlband) );
		const __m128 hl_b = _mm_load_ps( imageptr_pixel_offset_0_3(hlband) );
		const __m128 lh_a = _mm_load_ps( imageptr_pixel_offset_0_2(lhband) );
		const __m128 lh_b = _mm_load_ps( imageptr_pixel_offset_0_3(lhband) );
		const __m128 hh_a = _mm_load_ps( imageptr_pixel_offset_0_2(hhband) );
		const __m128 hh_b = _mm_load_ps( imageptr_pixel_offset_0_3(hhband) );

		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_4(local)), _mm_unpacklo_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_4(local)), _mm_unpackhi_ps(ll_a, hl_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_5(local)), _mm_unpacklo_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_5(local)), _mm_unpackhi_ps(lh_a, hh_a) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_6(local)), _mm_unpacklo_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_6(local)), _mm_unpackhi_ps(ll_b, hl_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_0_7(local)), _mm_unpacklo_ps(lh_b, hh_b) );
		_mm_storeu_ps( imageptr_pixel(debug, vec2_offset_4_7(local)), _mm_unpackhi_ps(lh_b, hh_b) );
	}
}
