#include "transform.h"
#include "codeblock.h"
#include "core8x8.h"
#include "system.h"
#include "intmath.h"
#include "threads.h"
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <strings.h>
#include <string.h>
#ifdef _OPENMP
	#include <omp.h>
#endif
// NOTE(review): ALIGNMENT is presumably a buffer alignment in bytes matching
// the SIMD register width (32 with AVX, 16 otherwise); it is not referenced in
// this part of the file -- confirm against its users.
#ifdef __AVX__
	#define ALIGNMENT 32
#else
	#define ALIGNMENT 16
#endif

// Scheduling of the next transform level, selected at compile time in
// transform_notify_next_level(). At least one of the two macros below must be
// defined (otherwise the build stops with #error); if both are defined, the
// interlaced variant is the one that gets compiled.

/// interleave processing of individual scales
// #define CONFIG_TRANSFORM_INTERLACED

/// process the scales sequentially
#define CONFIG_TRANSFORM_SEQUENTIAL

/**
 * @brief First core of the code-block. Relatively to the code-block beginning.
 */
static
struct vec2_t codeblock_core_begin(
	const struct transform_t *transform
)
{
	const struct worker_t *worker = transform_get_worker_const(transform);

	return vec2_max(
		vec2_sub(
			vec2_max(worker->cb_global, transform->start),
			worker->cb_global
		),
		vec2_scalar(0)
	);
}

/**
 * @brief First core of the prolog code-block. Relatively to the code-block beginning.
 */
static
struct vec2_t codeblock_core_begin_prolog(
	const struct transform_t *transform
)
{
	const struct vec2_t begin1 = codeblock_core_begin(transform);
	const struct vec2_t begin2 = { transform->cb_size2.x - 8, 0 };

	return vec2_max(begin1, begin2);
}

/**
 * @brief First core of the code-block strip. Y-coordinate only. Relatively to the code-block beginning.
 *
 * Scalar counterpart of codeblock_core_begin() that takes the strip position
 * explicitly instead of reading it from the worker.
 *
 * @param[in] transform the transform providing start.y
 * @param[in] cb_global_y the Y-coordinate of the code-block strip
 *
 * @returns the non-negative Y-offset of the first core inside the strip
 */
static
int strip_core_begin_y(
	const struct transform_t *transform,
	int cb_global_y
)
{
	// where the evaluation really starts, in global coordinates
	const int first_global_y = max(cb_global_y, transform->start.y);

	// the same row expressed relatively to the strip
	const int first_local_y = first_global_y - cb_global_y;

	return max(first_local_y, 0);
}

/**
 * @brief First output coefficient of the code-block. Relatively to the code-block beginning.
 */
static
struct vec2_t codeblock_coeff_begin(
	const struct transform_t *transform
)
{
	const struct worker_t *worker = transform_get_worker_const(transform);

	const struct vec2_t begin_outer = codeblock_core_begin(transform);
	const struct vec2_t begin_inner = vec2_sub(transform->tc0, worker->cb_global);

	return vec2_max(begin_outer, begin_inner);
}

/**
 * @brief First core after the code-block. Relatively to the code-block beginning.
 */
static
struct vec2_t codeblock_core_end(
	const struct transform_t *transform
)
{
	const struct worker_t *worker = transform_get_worker_const(transform);

	return vec2_sub(
		vec2_min(
			vec2_add(worker->cb_global, transform->cb_size2),
			transform->stop
		),
		worker->cb_global
	);
}

/**
 * @brief First core after the code-block strip. Y-coordinate only. Relatively to the code-block beginning.
 *
 * Scalar counterpart of codeblock_core_end() that takes the strip position
 * explicitly instead of reading it from the worker.
 *
 * @param[in] transform the transform providing cb_size2.y and stop.y
 * @param[in] cb_global_y the Y-coordinate of the code-block strip
 *
 * @returns the exclusive end Y-offset of the cores inside the strip
 */
static
int strip_core_end_y(
	const struct transform_t *transform,
	int cb_global_y
)
{
	// the nominal end of the strip clamped by transform->stop.y, in global coordinates
	const int clamped_end_y = min(cb_global_y + transform->cb_size2.y, transform->stop.y);

	return clamped_end_y - cb_global_y;
}

/**
 * @brief First output coefficient after the code-block. Relatively to the code-block beginning.
 */
static
struct vec2_t codeblock_coeff_end(
	const struct transform_t *transform
)
{
	const struct worker_t *worker = transform_get_worker_const(transform);

	const struct vec2_t end_outer = codeblock_core_end(transform);
	const struct vec2_t end_inner = vec2_sub(transform->size, vec2_sub(worker->cb_global, transform->tc0));

	return vec2_min(end_outer, end_inner);
}

/**
 * @brief The input lines required to transform a strip of code-blocks.
 *
 * The strip rows that are actually evaluated (strip_core_begin_y() up to
 * strip_core_end_y()) are shifted by the vertical lag, converted to the
 * coordinate system relative to tc0.y and mapped through coord_extr(). The
 * smallest and the largest mapped line delimit the result.
 *
 * The requested lines may generally overlap between the individual strips.
 *
 * @param[in] transform the transform (must not be NULL)
 * @param[in] cb_global_y the strip coordinate
 *
 * @returns [min; max+1) pair (first inclusive, second exclusive)
 */
struct range2_t strip_query_lines(
	const struct transform_t *transform,
	int cb_global_y
)
{
	assert( transform );

	// vertical lag of the source rows behind the target rows
	const int lag_y = 3;

	// rows [row_first; row_stop) touched due to the shortened/truncated
	// evaluation of the strip, relative to tc0.y
	const int row_first = cb_global_y + strip_core_begin_y(transform, cb_global_y) + lag_y - transform->tc0.y;
	const int row_stop  = cb_global_y + strip_core_end_y(transform, cb_global_y)   + lag_y - transform->tc0.y;

	// the first row seeds both bounds (it is mapped even when the row range is empty)
	int line_min = coord_extr(row_first, transform->size.y);
	int line_max = line_min;

	// fold the remaining rows into the bounds
	int row = row_first + 1;

	while( row < row_stop )
	{
		const int line = coord_extr(row, transform->size.y);

		if( line < line_min )
		{
			line_min = line;
		}

		if( line > line_max )
		{
			line_max = line;
		}

		row++;
	}

	// change the meaning from [min;max] to [min;max+1)
	return (struct range2_t){ line_min, line_max + 1 };
}

/**
 * @brief Has the transform processed all of its code-block strips?
 *
 * @param[in] transform the transform (must not be NULL)
 *
 * @returns non-zero when the Y-position of the master worker has reached
 *          tc1_cb.y, zero otherwise
 */
int transform_finished(
	const struct transform_t *transform
)
{
	assert( transform );

	const struct worker_t *master = transform_get_master_worker_const( transform );

	assert( master );

	// still unfinished while the master is above the last strip boundary
	const int strips_remain = master->cb_global.y < transform->tc1_cb.y;

	return !strips_remain;
}

struct range2_t transform_prepare_strip(
	struct transform_t *transform
)
{
	assert( transform );

	const struct worker_t *master = transform_get_master_worker_const( transform );

	assert( master );

	assert( master->cb_global.y < transform->tc1_cb.y && "already finished" );

	assert( transform->tc0_cb.y + transform->cb_size2.y <= transform->tc1_cb.y && "internal error" );

	// the input lines required to transform a code-block strip at cb_global.y
	const struct range2_t this_lines = strip_query_lines(
		transform,
		master->cb_global.y
	);

	// assume the standalone strip
	transform->lines = this_lines;
	transform->cb_step = transform->cb_size2.y;

	struct range2_t next_lines;

	for(
		// start at +1 strip
		int i = 1;
		// is there a next strip?
		master->cb_global.y + i*transform->cb_size2.y < transform->tc1_cb.y
		// AND does it overlap with the previous ones?
		&& (next_lines = strip_query_lines(transform, master->cb_global.y + i*transform->cb_size2.y)).i0 < transform->lines.i1;
		// next strip
		i++
	)
	{
		transform->lines = (struct range2_t){
			min(next_lines.i0, transform->lines.i0),
			max(next_lines.i1, transform->lines.i1)
		};
		transform->cb_step += transform->cb_size2.y;
	}

	return transform->lines;
}

/**
 * @brief Notify the next transform level that a data was produced.
 *
 * @param[in] c1 the LL sub-bad was produced up to this line (in reference/global coordinate system of the next scale, exclusive)
 */
void transform_notify_next_level(
	struct transform_t *transform,
	const struct vec2_t c1
)
{
	assert( transform );

#if   defined(CONFIG_TRANSFORM_INTERLACED)
	// interlaced processing
	if( transform->next_level && !transform_finished(transform->next_level) )
	{
		// local coordinate system
		const struct range2_t lines = transform_prepare_strip(transform->next_level);

		// do we have enough data?
		if( c1.y >= transform->next_level->tc0.y + lines.i1 )
		{
			transform_process_strip(
				transform->next_level,
				transform->ll_viewport,
				vec2_zero
			);

			// discard processed virtual memory

			const struct vec2_t viewport_offset = convert_tc_to_next_tr(vec2_sub(transform->tc0, transform->tc0_cb));

			assert( lines.i1 + viewport_offset.y <= transform->super_ll.y && "this should not have happened" );

			assert( lines.i1 >= 0 && "the first valid line is < 0" );

			imageptr_discard(
				transform->llband,
				vec2_create( 0, lines.i1 + viewport_offset.y )
			);

			// possibly more strips to process, notify again
			transform_notify_next_level(transform, c1);
		}
	}
#elif defined(CONFIG_TRANSFORM_SEQUENTIAL)
	UNUSED(c1);

	// sequential processing
	if( transform_finished(transform) && transform->next_level )
	{
		while( !transform_finished(transform->next_level) )
		{
			const struct range2_t lines = transform_prepare_strip(transform->next_level);
			transform_process_strip(
				transform->next_level,
				transform->ll_viewport,
				vec2_zero
			);

			// discard processed virtual memory

			const struct vec2_t viewport_offset = convert_tc_to_next_tr(vec2_sub(transform->tc0, transform->tc0_cb));

			assert( lines.i1 + viewport_offset.y <= transform->super_ll.y && "this should not have happened" );

			assert( lines.i1 >= 0 && "the first valid line is < 0" );

			imageptr_discard(
				transform->llband,
				vec2_create( 0, lines.i1 + viewport_offset.y )
			);
		}
	}
#else
#	error Either CONFIG_TRANSFORM_INTERLACED, or CONFIG_TRANSFORM_SEQUENTIAL must be defined.
#endif
}

/**
 * @brief Transform the code-block (prolog variant).
 *
 * Walks the 8x8 cores between codeblock_core_begin_prolog() and
 * codeblock_core_end() and calls the horizontal-only core on each of them,
 * handing over the matching part of the worker's y-buffer.
 *
 * Do not write output and do not update x-buffer.
 *
 * @param transform the transform (must not be NULL); its current worker supplies cb_global and the buffers
 * @param data the input tile
 * @param data_offset user-specific data offset
 */
void transform_process_codeblock_prolog(
	struct transform_t *transform,
	const struct imageptr_t *data,
	const struct vec2_t data_offset
)
{
	// validate the transform before it is handed over to transform_get_worker()
	assert( transform );

	const struct worker_t *worker = transform_get_worker(transform);

	assert( worker );

	// number of floats kept in the buffers per sample position
	const int buff_elem_size = 4;

	assert( worker->buffer_x );

	// initialization of x-buffer is not really needed in the case of CDF 9/7 wavelet because the horizontal filtering is performed prior the vertical one
#if 0
	float *buffer_x_global = transform->buffer_x + buff_elem_size * (worker->cb_global.x - transform->tc0_cb.x);

	// copy x-buffer fragment into local buffer
	memcpy(
		worker->buffer_x,
		buffer_x_global,
		buff_elem_size * transform->cb_size2.x * sizeof(float)
	);
#endif

	// lag of the source samples behind the target position
	const struct vec2_t lag = { 3, 3 };

	// core size
	const struct vec2_t step = { 8, 8 };

	assert( vec2_is_codeblock(worker->cb_global, transform->cb_exp) && "the code-block coordinates are not at the code-block boundary" );

	// cb_global == floor_codeblock(cb_global+cb_size2-1, cb_exp)
	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+transform->cb_size2.x-1, transform->cb_exp.x) && "the code-block coordinates does not match the code-block size" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+transform->cb_size2.y-1, transform->cb_exp.y) && "the code-block coordinates does not match the code-block size" );

	// shortened evaluation of the code-block
	const struct vec2_t begin = codeblock_core_begin_prolog(transform);

	// shortened evaluation of the code-block
	const struct vec2_t end = codeblock_core_end(transform);

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+begin.x, transform->cb_exp.x) && "the beginning of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+begin.y, transform->cb_exp.y) && "the beginning of the code-block is outside the code-block" );

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+end.x-1, transform->cb_exp.x) && "the end of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+end.y-1, transform->cb_exp.y) && "the end of the code-block is outside the code-block" );

	// view of the input with the user-specific offset already applied
	struct imageptr_t input_v = {
		.data = imageptr_pixel(data, vec2_neg(data_offset)),
		.stride = data->stride
	}, *input = &input_v;

	// offset to the beginning of the code-block, walked in whole cores
	struct vec2_t cb_offset;
	const struct vec2_t cb_offset_begin = vec2_floor8(begin);
	const struct vec2_t cb_offset_end = vec2_ceil8(end);

	const struct vec2_t size = vec2_sub(transform->tc1, transform->tc0);

	float *buffer_y;
	float *buffer_y_init = worker->buffer_y
		+ buff_elem_size*cb_offset_begin.y;

	// running pointer to the top-left source sample of the current core
	struct imageptr_t data_v = *input, *datainc = &data_v;

	// source position of the current core: (cb_global + cb_offset + lag) - tc0
	struct vec2_t source_local;
	const struct vec2_t source_local_init = vec2_sub(vec2_add(vec2_add(worker->cb_global, cb_offset_begin), lag), transform->tc0);

	// for each core
	for(
		buffer_y = buffer_y_init,
		cb_offset.y = cb_offset_begin.y,
		source_local.y = source_local_init.y,
		datainc->data = imageptr_pixel(input, vec2_create(0, source_local.y))
		;
		cb_offset.y < cb_offset_end.y
		;
		buffer_y += step.y*buff_elem_size,
		cb_offset.y += step.y,
		source_local.y += step.y,
		datainc->data = imageptr_pixel(input, vec2_create(0, source_local.y))
	)
	{
		for(
			cb_offset.x = cb_offset_begin.x,
			source_local.x = source_local_init.x,
			datainc->data = imageptr_pixel(datainc, vec2_create(source_local.x, 0))
			;
			cb_offset.x < cb_offset_end.x
			;
			cb_offset.x += step.x,
			source_local.x += step.x,
			datainc->data = imageptr_pixel(datainc, vec2_create(step.x, 0))
		)
		{
			assert( vec2_is_core8x8(cb_offset) );

			// the fast core is used only when the whole 8x8 source window lies inside [0; size)
			if( source_local.x >= 0 && source_local.y >= 0 && source_local.x+7 < size.x &&  source_local.y+7 < size.y )
			{
				// call fast core
				load_core_8x8_honly_fast(
					buffer_y,
					datainc
				);
			}
			else
			{
				// call slow core
				load_core_8x8_honly_slow(
					buffer_y,
					input,
					source_local,
					size
				);
			}
		}
	}
}

/**
 * @brief Transform the code-block.
 *
 * Walks the 8x8 cores between codeblock_core_begin() and codeblock_core_end()
 * and calls the load/core/save routine on each of them. The HL, LH and HH
 * sub-bands go into the code-block; the LL sub-band goes into
 * transform->llband when a next level exists, into the code-block otherwise.
 * The valid coefficient range is recorded in codeblock->local_c0/local_c1/local_size.
 *
 * @param transform the transform (must not be NULL); its current worker supplies cb_global and the y-buffer
 * @param data the input tile
 * @param data_offset user-specific data offset
 * @param codeblock to store the result (must not be NULL)
 *
 * @note Slow implementation.
 */
void transform_process_codeblock_slow(
	struct transform_t *transform,
	const struct imageptr_t *data,
	const struct vec2_t data_offset,
	struct codeblock_t *codeblock
)
{
	// validate the transform before it is handed over to transform_get_worker()
	assert( transform );

	const struct worker_t *worker = transform_get_worker(transform);

	assert( worker );

	// lag of the source samples behind the target position
	const struct vec2_t lag = { 3, 3 };

	// core size
	const struct vec2_t step = { 8, 8 };

	// number of floats kept in the buffers per sample position
	const int buff_elem_size = 4;

	assert( vec2_is_codeblock(worker->cb_global, transform->cb_exp) && "the code-block coordinates are not at the code-block boundary" );

	// cb_global == floor_codeblock(cb_global+cb_size2-1, cb_exp)
	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+transform->cb_size2.x-1, transform->cb_exp.x) && "the code-block coordinates does not match the code-block size" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+transform->cb_size2.y-1, transform->cb_exp.y) && "the code-block coordinates does not match the code-block size" );

	// shortened evaluation of the code-block
	const struct vec2_t begin = codeblock_core_begin(transform);

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+begin.x, transform->cb_exp.x) && "the beginning of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+begin.y, transform->cb_exp.y) && "the beginning of the code-block is outside the code-block" );

	// shortened evaluation of the code-block
	const struct vec2_t end = codeblock_core_end(transform);

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+end.x-1, transform->cb_exp.x) && "the end of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+end.y-1, transform->cb_exp.y) && "the end of the code-block is outside the code-block" );

	assert( codeblock );

	// find coordinates of valid code-block data
	codeblock->local_c0 = codeblock_coeff_begin(transform);
	codeblock->local_c1 = codeblock_coeff_end(transform);
	codeblock->local_size = vec2_sub(codeblock->local_c1, codeblock->local_c0);

	// NOTE
	// global target coordinates: cb_global + cb_offset
	// global source coordinates: cb_global + cb_offset + lag
	// target coordinates: tc0_offset
	// source coordinates: tc0_offset + lag
	// code-block coordinates: cb_offset
	// NOTE: tc0_cb_offset.x - cb_offset.x = transform->cb_global.x - transform->tc0_cb.x

	// core step in the coordinate system of the next scale
	const struct vec2_t step1 = convert_tc_to_next_tr_fast(step);

	// offset to the beginning of the code-block, walked in whole cores
	struct vec2_t cb_offset;
	const struct vec2_t cb_offset_begin = vec2_floor8(begin);
	const struct vec2_t cb_offset_end = vec2_ceil8(end);

	// cb_offset converted to the next scale (position inside the HL/LH/HH bands)
	struct vec2_t cb_offset1;
	const struct vec2_t cb_offset1_init = convert_tc_to_next_tr_fast(cb_offset_begin);

	// (cb_global + cb_offset) - tc0_cb
	const struct vec2_t tc0_cb_offset_init = vec2_add( vec2_sub(worker->cb_global, transform->tc0_cb),
		cb_offset_begin);

#ifndef NDEBUG
	// only needed to address the debug output
	struct vec2_t tc0_cb_offset;
#endif

	// position inside the LL band: relative to tc0_cb when the band is shared
	// with the next level, relative to the code-block otherwise
	const struct vec2_t tc0_cb_offset1_init = convert_tc_to_next_tr_fast(tc0_cb_offset_init);
	struct vec2_t ll_cb_offset1;
	const struct vec2_t ll_cb_offset1_init = (transform->next_level) ? tc0_cb_offset1_init
		: convert_tc_to_next_tr_fast(cb_offset_begin);

	// buffers
	float *buffer_x;
	float *buffer_y;
	float *buffer_x_init = (transform->buffer_x + buff_elem_size * vec2_sub(worker->cb_global, transform->tc0_cb).x)
		+ buff_elem_size*cb_offset_begin.x;
	float *buffer_y_init = worker->buffer_y
		+ buff_elem_size*cb_offset_begin.y;

	// view of the input with the user-specific offset already applied
	struct imageptr_t input_v = {
		.data = imageptr_pixel(data, vec2_neg(data_offset)),
		.stride = data->stride
	}, *input = &input_v;

	// running pointer to the top-left source sample of the current core
	struct imageptr_t data_v = *input, *datainc = &data_v;

	// where to store the LL band?
	struct imageptr_t *llband_init = (transform->next_level) ? transform->llband : codeblock->llband;
	struct imageptr_t *hlband_init = codeblock->hlband;
	struct imageptr_t *lhband_init = codeblock->lhband;
	struct imageptr_t *hhband_init = codeblock->hhband;

	// running output pointers (local copies, the originals stay untouched)
	struct imageptr_t llband_v = *llband_init, *llband = &llband_v;
	struct imageptr_t hlband_v = *hlband_init, *hlband = &hlband_v;
	struct imageptr_t lhband_v = *lhband_init, *lhband = &lhband_v;
	struct imageptr_t hhband_v = *hhband_init, *hhband = &hhband_v;

	const struct vec2_t size = vec2_sub(transform->tc1, transform->tc0);

	// source position of the current core: (cb_global + cb_offset + lag) - tc0
	struct vec2_t source_local;
	const struct vec2_t source_local_init = vec2_sub(vec2_add(vec2_add(worker->cb_global, cb_offset_begin), lag), transform->tc0);

	// for each core
	for(
		buffer_y = buffer_y_init,
#ifndef NDEBUG
		tc0_cb_offset.y = tc0_cb_offset_init.y,
#endif
		ll_cb_offset1.y = ll_cb_offset1_init.y,
		cb_offset1.y = cb_offset1_init.y,
		cb_offset.y = cb_offset_begin.y,
		llband->data = imageptr_pixel(llband_init, vec2_create(0, ll_cb_offset1.y)),
		hlband->data = imageptr_pixel(hlband_init, vec2_create(0, cb_offset1.y)),
		lhband->data = imageptr_pixel(lhband_init, vec2_create(0, cb_offset1.y)),
		hhband->data = imageptr_pixel(hhband_init, vec2_create(0, cb_offset1.y)),
		source_local.y = source_local_init.y,
		datainc->data = imageptr_pixel(input, vec2_create(0, source_local.y))
		;
		cb_offset.y < cb_offset_end.y
		;
		buffer_y += step.y*buff_elem_size,
#ifndef NDEBUG
		tc0_cb_offset.y += step.y,
#endif
		ll_cb_offset1.y += step1.y,
		cb_offset1.y += step1.y,
		cb_offset.y += step.y,
		llband->data = imageptr_pixel(llband_init, vec2_create(0, ll_cb_offset1.y)),
		hlband->data = imageptr_pixel(hlband_init, vec2_create(0, cb_offset1.y)),
		lhband->data = imageptr_pixel(lhband_init, vec2_create(0, cb_offset1.y)),
		hhband->data = imageptr_pixel(hhband_init, vec2_create(0, cb_offset1.y)),
		source_local.y += step.y,
		datainc->data = imageptr_pixel(input, vec2_create(0, source_local.y))
	)
	{
		// the row pointers set above are advanced to the first column here
		// and then moved core by core
		for(
			buffer_x = buffer_x_init,
#ifndef NDEBUG
			tc0_cb_offset.x = tc0_cb_offset_init.x,
#endif
			ll_cb_offset1.x = ll_cb_offset1_init.x,
			cb_offset1.x = cb_offset1_init.x,
			cb_offset.x = cb_offset_begin.x,
			llband->data = imageptr_pixel(llband, vec2_create(ll_cb_offset1.x, 0)),
			hlband->data = imageptr_pixel(hlband, vec2_create(cb_offset1.x, 0)),
			lhband->data = imageptr_pixel(lhband, vec2_create(cb_offset1.x, 0)),
			hhband->data = imageptr_pixel(hhband, vec2_create(cb_offset1.x, 0)),
			source_local.x = source_local_init.x,
			datainc->data = imageptr_pixel(datainc, vec2_create(source_local.x, 0))
			;
			cb_offset.x < cb_offset_end.x
			;
			buffer_x += step.x*buff_elem_size,
#ifndef NDEBUG
			tc0_cb_offset.x += step.x,
#endif
			ll_cb_offset1.x += step1.x,
			cb_offset1.x += step1.x,
			cb_offset.x += step.x,
			llband->data = imageptr_pixel(llband, vec2_create(step1.x, 0)),
			hlband->data = imageptr_pixel(hlband, vec2_create(step1.x, 0)),
			lhband->data = imageptr_pixel(lhband, vec2_create(step1.x, 0)),
			hhband->data = imageptr_pixel(hhband, vec2_create(step1.x, 0)),
			source_local.x += step.x,
			datainc->data = imageptr_pixel(datainc, vec2_create(step.x, 0))
		)
		{
			assert( vec2_is_core8x8(cb_offset) );

			// the fast core is used only when the whole 8x8 source window lies inside [0; size)
			if( source_local.x >= 0 && source_local.y >= 0 && source_local.x+7 < size.x &&  source_local.y+7 < size.y )
			{
				// call fast core
				load_core_save_8x8_fast(
					buffer_x,
					buffer_y,
					datainc,
					llband,
					hlband,
					lhband,
					hhband
				);
			}
			else
			{
				// call slow core
				load_core_save_8x8_slow(
					buffer_x,
					buffer_y,
					input,
					source_local,
					size,
					llband,
					hlband,
					lhband,
					hhband
				);
			}
#ifndef NDEBUG
			// (llband, hlband, lhband, hhband) => transform->debug @ tc0_cb_offset
			save_8x8_debug(
				llband,
				hlband,
				lhband,
				hhband,
				transform->debug,
				tc0_cb_offset
			);
#endif
		}
	}
}

/**
 * @brief Transform the code-block.
 *
 * Variant for code-blocks that are evaluated completely: every 8x8 core of
 * the code-block goes through the fast load/core/save routine without any
 * bounds handling. The HL, LH and HH sub-bands go into the code-block; the LL
 * sub-band goes into transform->llband when a next level exists, into the
 * code-block otherwise. The valid coefficient range recorded in the
 * code-block is the whole code-block.
 *
 * @param transform the transform (must not be NULL); its current worker supplies cb_global and the y-buffer
 * @param data the input tile
 * @param data_offset user-specific data offset
 * @param codeblock to store the result (must not be NULL)
 *
 * @note Fast implementation.
 */
void transform_process_codeblock_fast(
	struct transform_t *transform,
	const struct imageptr_t *data,
	const struct vec2_t data_offset,
	struct codeblock_t *codeblock
)
{
	// validate the transform before it is handed over to transform_get_worker()
	assert( transform );

	const struct worker_t *worker = transform_get_worker(transform);

	assert( worker );

	// lag of the source samples behind the target position
	const struct vec2_t lag = { 3, 3 };

	// core size
	const struct vec2_t step = { 8, 8 };

	// number of floats kept in the buffers per sample position
	const int buff_elem_size = 4;

	assert( vec2_is_codeblock(worker->cb_global, transform->cb_exp) && "the code-block coordinates are not at the code-block boundary" );

	// cb_global == floor_codeblock(cb_global+cb_size2-1, cb_exp)
	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+transform->cb_size2.x-1, transform->cb_exp.x) && "the code-block coordinates does not match the code-block size" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+transform->cb_size2.y-1, transform->cb_exp.y) && "the code-block coordinates does not match the code-block size" );

#ifndef NDEBUG
	// shortened evaluation of the code-block (only checked, never applied here)
	const struct vec2_t begin = codeblock_core_begin(transform);
#endif

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+begin.x, transform->cb_exp.x) && "the beginning of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+begin.y, transform->cb_exp.y) && "the beginning of the code-block is outside the code-block" );

#ifndef NDEBUG
	// shortened evaluation of the code-block (only checked, never applied here)
	const struct vec2_t end = codeblock_core_end(transform);
#endif

	assert( worker->cb_global.x == floor_codeblock(worker->cb_global.x+end.x-1, transform->cb_exp.x) && "the end of the code-block is outside the code-block" );
	assert( worker->cb_global.y == floor_codeblock(worker->cb_global.y+end.y-1, transform->cb_exp.y) && "the end of the code-block is outside the code-block" );

	assert( codeblock );

	// find coordinates of valid code-block data: the whole code-block
	codeblock->local_c0 = vec2_zero;
	codeblock->local_c1 = transform->cb_size2;
	codeblock->local_size = transform->cb_size2;

	assert( 0 == begin.x && 0 == begin.y && transform->cb_size2.x == end.x && transform->cb_size2.y == end.y && "not a fast codeblock" );

	// core step in the coordinate system of the next scale
	const struct vec2_t step1 = convert_tc_to_next_tr_fast(step);

	// offset to the beginning of the code-block
	struct vec2_t cb_offset;

	// cb_offset1 = cb_offset / 2
	struct vec2_t cb_offset1;

	// source_local = (cb_global + cb_offset) + lag - data_offset - tc0
	struct vec2_t source_local;
	const struct vec2_t source_local_init = vec2_sub(vec2_sub(vec2_add(worker->cb_global, lag), data_offset), transform->tc0);

#ifndef NDEBUG
	// tc0_cb_offset = (cb_global + cb_offset) - tc0_cb
	struct vec2_t tc0_cb_offset;
#endif
	const struct vec2_t tc0_cb_offset_init = vec2_sub(worker->cb_global, transform->tc0_cb);

	// tc0_cb_offset1 = tc0_cb_offset / 2
	const struct vec2_t tc0_cb_offset1_init = convert_tc_to_next_tr_fast(tc0_cb_offset_init);

	// where to store the LL band?
	struct imageptr_t *llband_init = (transform->next_level) ? transform->llband : codeblock->llband;

	struct imageptr_t *hlband_init = codeblock->hlband;
	struct imageptr_t *lhband_init = codeblock->lhband;
	struct imageptr_t *hhband_init = codeblock->hhband;

	// running input/output pointers (local copies, the originals stay untouched)
	struct imageptr_t llband_v = *llband_init, *llband = &llband_v;
	struct imageptr_t hlband_v = *hlband_init, *hlband = &hlband_v;
	struct imageptr_t lhband_v = *lhband_init, *lhband = &lhband_v;
	struct imageptr_t hhband_v = *hhband_init, *hhband = &hhband_v;
	struct imageptr_t data_v = *data, *datainc = &data_v;

	// position inside the LL band: relative to tc0_cb when the band is shared
	// with the next level, relative to the code-block otherwise
	struct vec2_t ll_cb_offset1;
	const struct vec2_t ll_cb_offset1_init = (transform->next_level) ? tc0_cb_offset1_init : vec2_zero;

	// buffers
	float *buffer_x;
	float *buffer_y;
	float *buffer_x_init = (transform->buffer_x + buff_elem_size * vec2_sub(worker->cb_global, transform->tc0_cb).x);
	float *buffer_y_init = worker->buffer_y;

	// for each core
	for(
		buffer_y = buffer_y_init,
		cb_offset.y = 0,
		cb_offset1.y = 0,
		source_local.y = source_local_init.y,
#ifndef NDEBUG
		tc0_cb_offset.y = tc0_cb_offset_init.y,
#endif
		ll_cb_offset1.y = ll_cb_offset1_init.y,
		llband->data = imageptr_pixel(llband_init, vec2_create(0, ll_cb_offset1.y)),
		hlband->data = imageptr_pixel(hlband_init, vec2_create(0, cb_offset1.y)),
		lhband->data = imageptr_pixel(lhband_init, vec2_create(0, cb_offset1.y)),
		hhband->data = imageptr_pixel(hhband_init, vec2_create(0, cb_offset1.y)),
		datainc->data = imageptr_pixel(data, vec2_create(0, source_local.y))
		;
		cb_offset.y < transform->cb_size2.y
		;
		buffer_y += step.y*buff_elem_size,
		cb_offset.y += step.y,
		cb_offset1.y += step1.y,
		source_local.y += step.y,
#ifndef NDEBUG
		tc0_cb_offset.y += step.y,
#endif
		ll_cb_offset1.y += step1.y,
		llband->data = imageptr_pixel(llband_init, vec2_create(0, ll_cb_offset1.y)),
		hlband->data = imageptr_pixel(hlband_init, vec2_create(0, cb_offset1.y)),
		lhband->data = imageptr_pixel(lhband_init, vec2_create(0, cb_offset1.y)),
		hhband->data = imageptr_pixel(hhband_init, vec2_create(0, cb_offset1.y)),
		datainc->data = imageptr_pixel(data, vec2_create(0, source_local.y))
	)
	{
		// the HL/LH/HH row pointers already point at the first column
		// (cb_offset1.x starts at 0); only LL and the input need an X-offset
		for(
			buffer_x = buffer_x_init,
			cb_offset.x = 0,
			cb_offset1.x = 0,
			source_local.x = source_local_init.x,
#ifndef NDEBUG
			tc0_cb_offset.x = tc0_cb_offset_init.x,
#endif
			ll_cb_offset1.x = ll_cb_offset1_init.x,
			llband->data = imageptr_pixel(llband, vec2_create(ll_cb_offset1.x, 0)),
			datainc->data = imageptr_pixel(datainc, vec2_create(source_local.x, 0))
			;
			cb_offset.x < transform->cb_size2.x
			;
			buffer_x += step.x*buff_elem_size,
			cb_offset.x += step.x,
			cb_offset1.x += step1.x,
			source_local.x += step.x,
#ifndef NDEBUG
			tc0_cb_offset.x += step.x,
#endif
			ll_cb_offset1.x += step1.x,
			llband->data = imageptr_pixel(llband, vec2_create(step1.x, 0)),
			hlband->data = imageptr_pixel(hlband, vec2_create(step1.x, 0)),
			lhband->data = imageptr_pixel(lhband, vec2_create(step1.x, 0)),
			hhband->data = imageptr_pixel(hhband, vec2_create(step1.x, 0)),
			datainc->data = imageptr_pixel(datainc, vec2_create(step.x, 0))
		)
		{
			assert( vec2_is_core8x8(cb_offset) );

			load_core_save_8x8_fast(
				buffer_x,
				buffer_y,
				datainc,
				llband,
				hlband,
				lhband,
				hhband
			);

#ifndef NDEBUG
			// (llband, hlband, lhband, hhband) => transform->debug @ tc0_cb_offset
			save_8x8_debug(
				llband,
				hlband,
				lhband,
				hhband,
				transform->debug,
				tc0_cb_offset
			);
#endif /* NDEBUG */
		}
	}
}

void transform_acquire_and_process_codeblock_slow(
	struct transform_t *transform,
	const struct imageptr_t *data_lines,
	const struct vec2_t data_offset
)
{
	const struct worker_t *worker = transform_get_worker(transform);

	// acquire an empty code-block structure
	struct codeblock_t *codeblock = codeblock_acquire(
		transform,
		convert_tc_to_next_tr(worker->cb_global)
	);

	// transform the code-block into the acquired code-block structure
	transform_process_codeblock_slow(
		transform,
		data_lines,
		data_offset,
		codeblock
	);

	// EBCOT on the code-block
	codeblock_process_pool(transform, codeblock);

	// release the code-block structure
	codeblock_release(transform, codeblock);
}

void transform_acquire_and_process_codeblock_fast(
	struct transform_t *transform,
	const struct imageptr_t *data_lines,
	const struct vec2_t data_offset
)
{
	const struct worker_t *worker = transform_get_worker(transform);

	// acquire an empty code-block structure
	struct codeblock_t *codeblock = codeblock_acquire(
		transform,
		convert_tc_to_next_tr(worker->cb_global)
	);

	// transform the code-block into the acquired code-block structure
	transform_process_codeblock_fast(
		transform,
		data_lines,
		data_offset,
		codeblock
	);

	// EBCOT on the code-block
	codeblock_process_pool(transform, codeblock);

	// release the code-block structure
	codeblock_release(transform, codeblock);
}

// File-scope barrier object; transform_process_strip() initializes it for
// `threads` participants, waits on it inside the parallel region and destroys
// it after every strip row.
BARRIER_DEFINE(barrier);

/**
 * @brief Process the next cb_step rows of this level, strip by strip, and notify the next level.
 *
 * Each loop iteration handles one strip of code-blocks located at the master
 * worker's cb_global.y and then advances that coordinate by cb_size2.y.
 * The strip is split horizontally into one segment per thread; every thread
 * walks the code-blocks of its own segment from left to right.
 * When all strips of the step are done, the next level is notified with the
 * converted master position.
 *
 * @param[in,out] transform the transform level to advance
 * @param[in] data_lines passed through unchanged to the code-block routines
 * @param[in] data_offset passed through unchanged to the code-block routines
 */
void transform_process_strip(
	struct transform_t *transform,
	const struct imageptr_t *data_lines,
	const struct vec2_t data_offset
)
{
	assert( transform );

	// the master worker (workers[0]) carries the Y-position of the current strip
	struct worker_t *master = transform_get_master_worker( transform );

	assert( master->cb_global.y < transform->tc1_cb.y && "already finished" );

	// yy only counts the rows consumed so far; the actual position is master->cb_global.y
	for(int yy = 0; yy < transform->cb_step; yy += transform->cb_size2.y)
	{
		// NOTE: multi-threaded with OpenMP

		const int threads = transform->threads;

		// width of the horizontal segment assigned to each thread
		// NOTE(review): ceil_codeblock presumably rounds up to a whole number of code-blocks -- confirm
		const int segment_x = ceil_codeblock(ceil_div(transform->super.x, threads), transform->cb_exp.x);

		assert( segment_x * threads >= transform->super.x && "threads does not cover entire frame" );

		BARRIER_INIT(barrier, threads);

		#pragma omp parallel num_threads(threads)
		{
			// NOTE(review): presumably makes sure all threads entered the region before any of them starts -- confirm
			BARRIER_WAIT(barrier);
			const int t = threads_get_thread_id();
			// per-thread worker; for t == 0 this is the master worker itself
			struct worker_t *worker = transform_get_worker(transform);

			// prolog codeblocks: t=0 does not need to call prolog
			if( t > 0 )
			{
				// init thread: take over the Y-position of the current strip from the master
				worker->cb_global.y = master->cb_global.y;

				// prolog position: one code-block (cb_size2.x) to the left of this thread's segment start
				worker->cb_global.x = transform->tc0_cb.x + (t+0)*segment_x - transform->cb_size2.x;

				// run the prolog only if its position is still inside the frame (left of tc1_cb.x)
				if( worker->cb_global.x < transform->tc1_cb.x )
				{
					// fill reentrant y-buffers
					transform_process_codeblock_prolog(
						transform,
						data_lines,
						data_offset
					);
				}
			}

			// the barrier is not really needed, as the content of the x-buffer does not matter
#if 0
			threads_barrier();
			BARRIER_WAIT(barrier);
#endif

			// vertical extent of the current strip and of the region eligible for the fast path
			const int strip_start_y = master->cb_global.y;
			const int strip_stop_y = master->cb_global.y + transform->cb_size2.y;
			const int fast_start_y = transform->regular_tc0_cb.y;
			const int fast_stop_y = transform->regular_tc1_cb.y;

			// the fast path is considered only when the whole strip lies vertically inside the regular region
			if( strip_start_y >= fast_start_y && strip_stop_y <= fast_stop_y )
			{
				// fast processing

				// WARNING: the (tc0_cb, regular_tc0_cb, regular_tc1_cb, tc1_cb) may overlap
				// this thread's segment, clipped to the right edge of the frame
				const int segment_start_x = transform->tc0_cb.x + (t+0)*segment_x;
				const int segment_stop_x  = min(transform->tc1_cb.x, transform->tc0_cb.x + (t+1)*segment_x);

				// ends of the leading slow part and of the fast part, both clipped to the segment
				const int segment_slow_stop_x = min(transform->regular_tc0_cb.x, segment_stop_x);
				const int segment_fast_stop_x = min(transform->regular_tc1_cb.x, segment_stop_x);

				const int segment_step_x = transform->cb_size2.x;

				// 1) code-blocks left of the regular region: slow variant
				for(
					worker->cb_global.x = segment_start_x;
					worker->cb_global.x < segment_slow_stop_x;
					worker->cb_global.x += segment_step_x
				)
				{
					transform_acquire_and_process_codeblock_slow(transform, data_lines, data_offset);
				}

				// 2) code-blocks inside the regular region: fast variant (continues where loop 1 stopped)
				for(
					;
					worker->cb_global.x < segment_fast_stop_x;
					worker->cb_global.x += segment_step_x
				)
				{
					transform_acquire_and_process_codeblock_fast(transform, data_lines, data_offset);
				}

				// 3) remaining code-blocks up to the end of the segment: slow variant
				for(
					;
					worker->cb_global.x < segment_stop_x;
					worker->cb_global.x += segment_step_x
				)
				{
					transform_acquire_and_process_codeblock_slow(transform, data_lines, data_offset);
				}
			}
			else
			{
				// slow processing: the whole segment (clipped to tc1_cb.x) uses the slow variant
				for(
					worker->cb_global.x = transform->tc0_cb.x + (t+0)*segment_x;
					worker->cb_global.x < min(transform->tc1_cb.x, transform->tc0_cb.x + (t+1)*segment_x);
					worker->cb_global.x += transform->cb_size2.x
				)
				{
					transform_acquire_and_process_codeblock_slow(
						transform,
						data_lines,
						data_offset
					);
				}
			}

			// all threads finish the strip before the parallel region is left
			BARRIER_WAIT(barrier);
		}

		BARRIER_DESTROY(barrier);

		// advance to the next strip
		master->cb_global.y += transform->cb_size2.y;
	}

	// notify next level, only the y-coordinate of master->cb_global is meaningful here
	transform_notify_next_level(
		transform,
		convert_tc_to_next_tr(master->cb_global)
	);
}

struct transform_t *transform_create(
	const struct vec2_t tc0,
	const struct vec2_t tc1,
	const struct vec2_t cb_exp,
	int n,
	int N,
	codeblock_callback_t codeblock_callback,
	void *codeblock_params
)
{
	struct transform_t *transform = malloc( sizeof(struct transform_t) );

	if( !transform )
		return NULL;

	transform->next_level = NULL;

	if( n < N )
	{
		transform->next_level = transform_create(
			convert_tc_to_next_tr(tc0),
			convert_tc_to_next_tr(tc1),
			cb_exp,
			n+1,
			N,
			codeblock_callback,
			codeblock_params
		);
	}

	transform->codeblock_callback = codeblock_callback;
	transform->codeblock_params = codeblock_params;

	// TODO: replace with 'r'
	transform->N = N;
	transform->n = n;

	transform->tc0 = tc0;
	transform->tc1 = tc1;
	transform->size = vec2_sub(tc1, tc0);

	assert( transform->size.x > 0 && transform->size.y > 0 && "No data in input area. Too many levels of decomposition?" );

	transform->cb_exp = cb_exp;
	transform->cb_size1 = vec2_create( 1<<cb_exp.x, 1<<cb_exp.y );
	transform->cb_size2 = vec2_create( 2<<cb_exp.x, 2<<cb_exp.y );

	transform->start = vec2_sub(vec2_ceil2(tc0), vec2_scalar(8)); // ceil2(tc0) - 8
	transform->stop = vec2_ceil2(tc1); // ceil2(tc1)
	const struct vec2_t regular_start = vec2_add( vec2_floor2(tc0), vec2_scalar(2)); // floor2(tc0) + 2
	const struct vec2_t regular_stop = vec2_sub( vec2_floor2(tc1), vec2_scalar(8)); // floor2(tc1) - 8

	transform->tc0_cb = vec2_floor_codeblock(transform->start, cb_exp);
	transform->tc1_cb = vec2_ceil_codeblock(transform->stop, cb_exp);
	transform->regular_tc0_cb = vec2_ceil_codeblock(regular_start, cb_exp);
	transform->regular_tc1_cb = vec2_floor_codeblock(regular_stop, cb_exp);

	// CDF 9/7 requires 4 coefficients
	const int buff_elem_size = 4;

	transform->super = vec2_sub(transform->tc1_cb, transform->tc0_cb);

	transform->buffer_x = memalign(ALIGNMENT, buff_elem_size * transform->super.x * sizeof(float));

	if( !transform->buffer_x )
		return NULL;

	// threads

	transform->threads = threads_suggest_transform_threads(transform);

	// create workers

	transform->workers = malloc( transform->threads * sizeof(struct worker_t) );

	if( !transform->workers )
		return NULL;

	// FIXME: adjust the number of code-blocks
	transform->cb_pool = 8;

	for(int t = 0; t < transform->threads; t++)
	{
		if( worker_create(&transform->workers[t], transform) )
			return NULL;
	}

#ifndef NDEBUG
	transform->debug = imageptr_create(transform->super);
#endif

	// super size for LL band, do not confuse with next level super size
	transform->super_ll = convert_tc_to_next_tr(transform->super);

	// the input for the next level
	transform->llband = imageptr_create_llband(transform->super_ll, transform->cb_size1);

	if( !transform->llband )
		return NULL;

	// viewport in the LL sub-band frame
	const struct vec2_t viewport_offset = vec2_sub(transform->tc0, transform->tc0_cb);

	transform->ll_viewport = imageptr_viewport(
		transform->llband,
		convert_tc_to_next_tr(viewport_offset)
	);

	// init processing loop
	transform_get_master_worker(transform)->cb_global.y = transform->tc0_cb.y;

	dprintf("created transform n=%i / N=%i of size=(%i,%i) at [%i,%i]..(%i,%i) with cb_size1=(%i,%i) using %i threads\n",
		transform->n,
		transform->N,
		transform->size.x, transform->size.y,
		transform->tc0.x, transform->tc0.y,
		transform->tc1.x, transform->tc1.y,
		transform->cb_size1.x, transform->cb_size1.y,
		transform->threads
	);

	return transform;
}

/**
 * @brief Dump this level and every deeper level, in order of increasing depth.
 *
 * Each level logs its index. In debug builds the viewport of the debug image
 * is additionally written through imageptr_log_dump_fmt().
 *
 * @param[in] transform the first level to dump
 */
void transform_dump(
	const struct transform_t *transform
)
{
	assert( transform );

	// walk the chain of levels instead of recursing into next_level
	for(const struct transform_t *level = transform; level; level = level->next_level)
	{
		dprintf("[ DUMP ] n=%i\n", level->n);

#ifndef NDEBUG
// 		imageptr_log_dump_fmt(level->debug, level->super, "transform-frame-%i.pgm", level->n);

		// temporary view of the debug image shifted to the tc0 corner
		struct imageptr_t *view = imageptr_viewport(
			level->debug,
			vec2_sub(level->tc0, level->tc0_cb)
		);

		imageptr_log_dump_fmt(view, level->size, "transform-viewport-%i.pgm", level->n);

		// the view is a separate heap object owned by this function
		free(view);
#endif

#if 0
		imageptr_log_dump_fmt(level->llband, level->super_ll, "llsubband-frame-%i.pgm", level->n);

		imageptr_log_dump_fmt(
			level->ll_viewport,
			convert_tc_to_next_tr(level->size),
			"llsubband-viewport-%i.pgm",
			level->n
		);
#endif
	}
}

/**
 * @brief Destroy a transform together with all of its deeper levels.
 *
 * Deeper levels are destroyed first, then the buffers, images and workers of
 * this level, and finally the transform structure itself.
 *
 * @param[in] transform the transform to destroy, must not be NULL
 */
void transform_destroy(
	struct transform_t *transform
)
{
	assert( transform );

	// deeper levels go first
	struct transform_t *const deeper = transform->next_level;

	if( deeper != NULL )
		transform_destroy(deeper);

	// buffers and images of this level
	free(transform->buffer_x);
	imageptr_destroy_llband(transform->llband, transform->super_ll, transform->cb_size1);
#ifndef NDEBUG
	imageptr_destroy(transform->debug, transform->super);
#endif
	free(transform->ll_viewport);

	// per-thread workers, then the array holding them
	int index = 0;

	while( index < transform->threads )
	{
		worker_destroy(transform->workers + index);
		index++;
	}

	free(transform->workers);

	// finally the transform structure itself
	free(transform);
}

/**
 * @brief Initialise a worker: allocate its two line buffers and its code-block pool.
 *
 * @param[out] worker the worker structure to fill in
 * @param[in] transform the transform the worker belongs to; its cb_size2
 *            determines the buffer sizes
 *
 * @return 0 on success, 1 when an allocation fails. On failure the buffers
 *         allocated by this call are released and their pointers reset to NULL.
 */
int worker_create(
	struct worker_t *worker,
	struct transform_t *transform
)
{
	assert( worker );
	assert( transform );

	// number of floats stored per buffer position
	const int buff_elem_size = 4;

	worker->buffer_x = memalign(ALIGNMENT, buff_elem_size * transform->cb_size2.x * sizeof(float));
	worker->buffer_y = memalign(ALIGNMENT, buff_elem_size * transform->cb_size2.y * sizeof(float));

	if( !worker->buffer_x || !worker->buffer_y )
		goto fail;

	worker->transform = transform;

	worker->codeblock = codeblock_create_pool(transform);

	if( !worker->codeblock )
		goto fail;

	return 0;

fail:
	// one of the buffers may still be NULL here; free(NULL) is a no-op
	free( worker->buffer_x );
	free( worker->buffer_y );
	worker->buffer_x = NULL;
	worker->buffer_y = NULL;

	return 1;
}

/**
 * @brief Release the resources held by a worker.
 *
 * Frees both line buffers and hands the code-block pool back to
 * codeblock_destroy_pool(). The worker structure itself is not freed.
 *
 * @param[in] worker the worker to tear down, must not be NULL
 */
void worker_destroy(
	struct worker_t *worker
)
{
	assert( worker );

	// x-buffer first, then y-buffer
	void *const buffers[2] = { worker->buffer_x, worker->buffer_y };

	for(int i = 0; i < 2; i++)
	{
		free( buffers[i] );
	}

	codeblock_destroy_pool(worker->transform, worker->codeblock);
}

/**
 * @brief Worker of the calling thread.
 *
 * The entry of transform->workers is selected by the identifier returned by
 * threads_get_thread_id().
 */
struct worker_t *transform_get_worker(
	struct transform_t *transform
)
{
	assert( transform );

	return transform->workers + threads_get_thread_id();
}

/**
 * @brief Read-only access to the worker of the calling thread.
 *
 * The entry of transform->workers is selected by the identifier returned by
 * threads_get_thread_id().
 */
const struct worker_t *transform_get_worker_const(
	const struct transform_t *transform
)
{
	assert( transform );

	return transform->workers + threads_get_thread_id();
}

/**
 * @brief The master worker, i.e. the first entry of transform->workers.
 */
struct worker_t *transform_get_master_worker(
	struct transform_t *transform
)
{
	assert( transform );

	struct worker_t *const first = transform->workers;

	return first;
}

/**
 * @brief Read-only access to the master worker, i.e. the first entry of transform->workers.
 */
const struct worker_t *transform_get_master_worker_const(
	const struct transform_t *transform
)
{
	assert( transform );

	const struct worker_t *const first = transform->workers;

	return first;
}

/**
 * @brief Create a transform for the given tile, run it over the image and destroy it again.
 *
 * The transform is created for levels 1..N without a code-block callback.
 *
 * @param[in] tc0 upper-left corner of the tile
 * @param[in] tc1 lower-right corner of the tile
 * @param[in] cb_exp code-block size exponents
 * @param[in] N index of the last level
 * @param[in] imageptr the input image, must not be NULL
 *
 * @return 0 on success, -1 when the transform could not be created
 */
int transform_process_tile(
	const struct vec2_t tc0,
	const struct vec2_t tc1,
	const struct vec2_t cb_exp,
	int N,
	const struct imageptr_t *imageptr
)
{
	assert( imageptr );

	int status = -1;

	// first level is 1; no code-block callback and no callback parameters
	struct transform_t *transform = transform_create(tc0, tc1, cb_exp, 1, N, NULL, NULL);

	if( transform != NULL )
	{
		// run all strips, then tear everything down
		transform_loop_tc(transform, imageptr);
		transform_destroy(transform);

		status = 0;
	}

	return status;
}

/**
 * @brief Drive the transform until transform_finished() reports completion.
 *
 * Every round prepares one strip and then processes it with a zero data offset.
 *
 * @param[in,out] transform the transform to run
 * @param[in] imageptr the input image handed to transform_process_strip()
 */
void transform_loop_tc(
	struct transform_t *transform,
	const struct imageptr_t *imageptr
)
{
	for(;;)
	{
		// stop as soon as there is nothing left to process
		if( transform_finished(transform) )
			break;

		transform_prepare_strip(transform);
		transform_process_strip(transform, imageptr, vec2_zero);
	}
}
