/* MotionComp_SSE2.c */
/* 2009/07/02        */

#include "StdAfx.h"

#include "MotionComp_MMX.h"
#include "MotionComp_SSE2.h"

/* */

#pragma warning(disable : 4799)

/* */

/*
 * Forward declarations of the file-local helpers defined further down.
 * In all of them p/pitch address the destination block (pitch is the
 * byte distance between destination rows) and r is the plane the source
 * samples are read from.
 */

/* Copies the 16x16 block whose top-left corner is (x, y) in r to p. */
static void MotionComp_Compensate16x16_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y);

/* Copies the 8x8 block whose top-left corner is (x, y) in r to p. */
static void MotionComp_Compensate8x8_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y);

/*
 * Writes to p the per-byte truncating average of the two 16x16 blocks
 * of r located at (x0, y0) and (x1, y1).
 */
static void MotionComp_Compensate16x16H_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1);

/*
 * Writes to p the per-byte truncating average of the two 8x8 blocks
 * of r located at (x0, y0) and (x1, y1).
 */
static void MotionComp_Compensate8x8H_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1);

/* */

/*
 * Copies a 16x16 block of bytes from the reference plane to p.
 *
 *   p     - destination of the top-left sample; rows are written with
 *           _mm_store_si128 on the SSE2 path, so each p + n * pitch
 *           must be 16-byte aligned there
 *   pitch - byte distance between destination rows
 *   r     - reference plane (Plane, Pitch, CX and CY are read)
 *   x, y  - top-left corner of the source block inside r
 *
 * The SSE2 path is used only when 0 <= x, x + 16 < r->CX, 0 <= y and
 * y + 16 < r->CY.  Every other position is handed, quadrant by
 * quadrant, to MotionComp_Compensate8x8_SSE2.
 */
void MotionComp_Compensate16x16_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y)
{
	const UINT8* src;
	INT32        srcPitch;
	__m128i      rows[8];
	INT32        half;
	INT32        i;

	if (x < 0 || x + 16 >= r->CX ||
		y < 0 || y + 16 >= r->CY) {
		/* Block touches or crosses the plane border: four 8x8 quadrants. */
		MotionComp_Compensate8x8_SSE2(p,                 pitch, r, x,     y    );
		MotionComp_Compensate8x8_SSE2(p + 8,             pitch, r, x + 8, y    );
		MotionComp_Compensate8x8_SSE2(p     + 8 * pitch, pitch, r, x,     y + 8);
		MotionComp_Compensate8x8_SSE2(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
		return;
	}

	srcPitch = r->Pitch;
	src      = r->Plane + y * srcPitch + x;

	/*
	 * Two passes of eight rows each.  Within a pass all rows are loaded
	 * (unaligned) before any of them is stored (aligned).
	 */
	for (half = 0; half < 2; half++) {
		const UINT8* s = src + (half * 8) * srcPitch;
		UINT8*       d = p   + (half * 8) * pitch;

		for (i = 0; i < 8; i++) {
			rows[i] = _mm_loadu_si128((const __m128i*)(s + i * srcPitch));
		}

		for (i = 0; i < 8; i++) {
			_mm_store_si128((__m128i*)(d + i * pitch), rows[i]);
		}
	}
}

/*
 * Copies an 8x8 block of bytes from the reference plane to p.
 *
 *   p     - destination of the top-left sample
 *   pitch - byte distance between destination rows
 *   r     - reference plane (Plane, Pitch, CX and CY are read)
 *   x, y  - top-left corner of the source block inside r
 *
 * When 0 <= x, x + 8 < r->CX, 0 <= y and y + 8 < r->CY the rows are
 * moved directly as 64-bit __m64 values.  Otherwise the copy is left to
 * Block_Extract8x8_MMX (presumably it deals with samples outside the
 * plane - confirm against its definition).
 *
 * No _mm_empty() is issued here; warning 4799 is disabled for this file.
 */
void MotionComp_Compensate8x8_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y)
{
	const INT32 inside =
		x >= 0 && x + 8 < r->CX &&
		y >= 0 && y + 8 < r->CY;

	if (inside) {
		const UINT8* src      = r->Plane + y * r->Pitch + x;
		const INT32  srcPitch = r->Pitch;
		__m64        rows[8];
		INT32        i;

		/* Read the whole block first ... */
		for (i = 0; i < 8; i++) {
			rows[i] = *((const __m64*)(src + i * srcPitch));
		}

		/* ... then write it out. */
		for (i = 0; i < 8; i++) {
			*((__m64*)(p + i * pitch)) = rows[i];
		}

	} else {
		Block_Extract8x8_MMX(r, x, y, p, pitch);
	}
}

/* */

/*
 * Sixteen bytes of 0x01.  The half-pel routines AND this mask with
 * (S0 ^ S1) and subtract the result from the rounded-up byte average,
 * which turns (a + b + 1) >> 1 into (a + b) >> 1.
 * Declared with ALIGN(0x10) (presumably 16-byte alignment) because the
 * 16x16 routine reads it through a __m128i pointer.
 */
ALIGN(0x10) static const UINT8 MASK_1[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };

/*
 * Half-pel prediction of a 16x16 block: each output byte is the
 * truncating average (a + b) >> 1 of the bytes at the same position in
 * the two source blocks.
 *
 *   p        - destination of the top-left sample; rows are written
 *              with _mm_store_si128 on the SSE2 path, so each
 *              p + n * pitch must be 16-byte aligned there
 *   pitch    - byte distance between destination rows
 *   r        - reference plane (Plane, Pitch, CX and CY are read)
 *   x0, y0   - top-left corner of the first source block
 *   x1, y1   - top-left corner of the second source block
 *
 * The SSE2 path requires both blocks to satisfy 0 <= x, x + 16 < r->CX,
 * 0 <= y and y + 16 < r->CY.  Otherwise the work is split into four
 * quadrants handled by MotionComp_Compensate8x8H_SSE2.
 */
void MotionComp_Compensate16x16H_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1)
{
	const UINT8* srcA;
	const UINT8* srcB;
	INT32        srcPitch;
	INT32        row;

	const __m128i M = *((const __m128i*)MASK_1);

	if (x0 < 0 || x0 + 16 >= r->CX ||
		y0 < 0 || y0 + 16 >= r->CY ||
		x1 < 0 || x1 + 16 >= r->CX ||
		y1 < 0 || y1 + 16 >= r->CY) {
		/* At least one source block touches or crosses the border. */
		MotionComp_Compensate8x8H_SSE2(p,                 pitch, r, x0,     y0    , x1,     y1    );
		MotionComp_Compensate8x8H_SSE2(p + 8,             pitch, r, x0 + 8, y0    , x1 + 8, y1    );
		MotionComp_Compensate8x8H_SSE2(p     + 8 * pitch, pitch, r, x0,     y0 + 8, x1,     y1 + 8);
		MotionComp_Compensate8x8H_SSE2(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
		return;
	}

	srcPitch = r->Pitch;
	srcA     = r->Plane + y0 * srcPitch + x0;
	srcB     = r->Plane + y1 * srcPitch + x1;

	for (row = 0; row < 16; row++) {
		const __m128i S0 = _mm_loadu_si128((const __m128i*)(srcA + row * srcPitch));
		const __m128i S1 = _mm_loadu_si128((const __m128i*)(srcB + row * srcPitch));

		/* avg rounds up; remove the carry wherever the low bits differ. */
		const __m128i roundedUp = _mm_avg_epu8(S0, S1);
		const __m128i carry     = _mm_and_si128(_mm_xor_si128(S0, S1), M);

		_mm_store_si128((__m128i*)(p + row * pitch), _mm_sub_epi8(roundedUp, carry));
	}
}

/*
 * Half-pel prediction of an 8x8 block: each output byte is the
 * truncating average (a + b) >> 1 of the bytes at the same position in
 * the two source blocks.
 *
 *   p        - destination of the top-left sample (written as __m64)
 *   pitch    - byte distance between destination rows
 *   r        - reference plane (Plane, Pitch, CX and CY are read)
 *   x0, y0   - top-left corner of the first source block
 *   x1, y1   - top-left corner of the second source block
 *
 * If both blocks satisfy 0 <= x, x + 8 < r->CX, 0 <= y and
 * y + 8 < r->CY they are read straight from the plane.  Otherwise BOTH
 * blocks are first copied into local 8x8 buffers (row pitch 8) with
 * Block_Extract8x8_MMX and averaged from there.
 *
 * Pointers into the plane are formed only after the bounds test has
 * passed, so out-of-range coordinates never produce an address outside
 * the plane.
 *
 * No _mm_empty() is issued here; warning 4799 is disabled for this file.
 */
void MotionComp_Compensate8x8H_SSE2(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1)
{
	ALIGN(0x10) UINT8 b0[64], b1[64];

	const UINT8* s0;
	const UINT8* s1;
	INT32        p0;
	INT32        p1;
	INT32        i;

	const __m64 M = *((const __m64*)MASK_1);

	if (x0 < 0 || x0 + 8 >= r->CX ||
		y0 < 0 || y0 + 8 >= r->CY ||
		x1 < 0 || x1 + 8 >= r->CX ||
		y1 < 0 || y1 + 8 >= r->CY) {
		/* Border case: work on extracted copies of both blocks. */
		Block_Extract8x8_MMX(r, x0, y0, b0, 8);
		Block_Extract8x8_MMX(r, x1, y1, b1, 8);

		s0 = b0;
		p0 = 8;

		s1 = b1;
		p1 = 8;

	} else {
		/* Interior case: read directly from the reference plane. */
		s0 = r->Plane + y0 * r->Pitch + x0;
		p0 = r->Pitch;

		s1 = r->Plane + y1 * r->Pitch + x1;
		p1 = r->Pitch;
	}

	for (i = 0; i < 8; i++) {
		const __m64 S0 = *((const __m64*)(s0 + i * p0));
		const __m64 S1 = *((const __m64*)(s1 + i * p1));

		/* avg rounds up; subtract 1 wherever the low bits differ. */
		*((__m64*)(p + i * pitch)) =
			_mm_sub_pi8(_mm_avg_pu8(S0, S1), _mm_and_si64(_mm_xor_si64(S0, S1), M));
	}
}

/* */

/*
 * Predicts the 16x16 block at (x, y) of plane p from reference plane r
 * displaced by motion vector mv.
 *
 * Bit 0 of each vector component selects interpolation on that axis;
 * component >> 1 is the whole-sample displacement.  When both low bits
 * are clear the block is copied from one position.  Otherwise two
 * positions are averaged: both start at the whole-sample displacement,
 * and on each axis with its low bit set one of them is moved by +1
 * (the second position for a non-negative component, the first one for
 * a negative component).
 *
 * NOTE(review): >> on a negative component is implementation-defined in
 * C; the +1 adjustment looks written for an arithmetic (flooring)
 * shift - confirm for the target compiler.
 */
void MotionComp_Block16x16_SSE2(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv)
{
	const INT32 oddX = (mv->X & 1) ? 1 : 0;
	const INT32 oddY = (mv->Y & 1) ? 1 : 0;

	/* Offsets of the first (a) and second (b) predictor position. */
	INT32 ax = mv->X >> 1;
	INT32 ay = mv->Y >> 1;
	INT32 bx = ax;
	INT32 by = ay;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	if (!oddX && !oddY) {
		MotionComp_Compensate16x16_SSE2(dst, p->Pitch, r, x + ax, y + ay);
		return;
	}

	if (mv->X >= 0) {
		bx += oddX;
	} else {
		ax += oddX;
	}

	if (mv->Y >= 0) {
		by += oddY;
	} else {
		ay += oddY;
	}

	MotionComp_Compensate16x16H_SSE2(
		dst,
		p->Pitch,
		r,
		x + ax,
		y + ay,
		x + bx,
		y + by);
}

/*
 * Predicts the 8x8 block at (x, y) of plane p from reference plane r
 * displaced by motion vector mv.
 *
 * Bit 0 of each vector component selects interpolation on that axis;
 * component >> 1 is the whole-sample displacement.  When both low bits
 * are clear the block is copied from one position.  Otherwise two
 * positions are averaged: both start at the whole-sample displacement,
 * and on each axis with its low bit set one of them is moved by +1
 * (the second position for a non-negative component, the first one for
 * a negative component).
 *
 * NOTE(review): >> on a negative component is implementation-defined in
 * C; the +1 adjustment looks written for an arithmetic (flooring)
 * shift - confirm for the target compiler.
 */
void MotionComp_Block8x8Y_SSE2(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv)
{
	const INT32 oddX = (mv->X & 1) ? 1 : 0;
	const INT32 oddY = (mv->Y & 1) ? 1 : 0;

	/* Offsets of the first (a) and second (b) predictor position. */
	INT32 ax = mv->X >> 1;
	INT32 ay = mv->Y >> 1;
	INT32 bx = ax;
	INT32 by = ay;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	if (!oddX && !oddY) {
		MotionComp_Compensate8x8_SSE2(dst, p->Pitch, r, x + ax, y + ay);
		return;
	}

	if (mv->X >= 0) {
		bx += oddX;
	} else {
		ax += oddX;
	}

	if (mv->Y >= 0) {
		by += oddY;
	} else {
		ay += oddY;
	}

	MotionComp_Compensate8x8H_SSE2(
		dst,
		p->Pitch,
		r,
		x + ax,
		y + ay,
		x + bx,
		y + by);
}

/*
 * Predicts the 8x8 block at (x, y) of plane p from reference plane r.
 *
 * The vector actually applied is derived from mv0: each component is
 * shifted right by one and its original low bit is OR-ed back in
 * (presumably the chroma vector for a subsampled plane - confirm with
 * the caller).  The derived vector is then treated as in the other
 * block routines: bit 0 of a component selects interpolation on that
 * axis and component >> 1 is the whole-sample displacement.  With both
 * low bits clear one block is copied; otherwise two positions are
 * averaged, the second one being moved by +1 for a non-negative
 * component and the first one for a negative component.
 *
 * NOTE(review): >> on a negative component is implementation-defined in
 * C; the code looks written for an arithmetic (flooring) shift -
 * confirm for the target compiler.
 */
void MotionComp_Block8x8C_SSE2(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv0)
{
	MotionVector_t mv = {
		(mv0->X >> 1) | (mv0->X & 1),
		(mv0->Y >> 1) | (mv0->Y & 1)
	};

	const INT32 oddX = (mv.X & 1) ? 1 : 0;
	const INT32 oddY = (mv.Y & 1) ? 1 : 0;

	/* Offsets of the first (a) and second (b) predictor position. */
	INT32 ax = mv.X >> 1;
	INT32 ay = mv.Y >> 1;
	INT32 bx = ax;
	INT32 by = ay;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	if (!oddX && !oddY) {
		MotionComp_Compensate8x8_SSE2(dst, p->Pitch, r, x + ax, y + ay);
		return;
	}

	if (mv.X >= 0) {
		bx += oddX;
	} else {
		ax += oddX;
	}

	if (mv.Y >= 0) {
		by += oddY;
	} else {
		ay += oddY;
	}

	MotionComp_Compensate8x8H_SSE2(
		dst,
		p->Pitch,
		r,
		x + ax,
		y + ay,
		x + bx,
		y + by);
}

/* */

