/* MotionComp_MMX.c */
/* 2009/07/09       */

#include "StdAfx.h"

#include "MotionComp_MMX.h"

/* */

#pragma warning(disable : 4799)

/* */

/*
 * Fetches the 8x8 window whose top-left corner is (x, y) in 'plane' and
 * writes it to 'block' (rows 'pitch' bytes apart).  The window may overlap
 * or lie outside the plane: missing pixels are produced by repeating the
 * nearest pixel of an 8x8 source block clamped to lie inside the plane.
 *
 * A 16x16 scratch tile (16 bytes per row) is assembled from up to four 8x8
 * quadrants: the clamped source block, a horizontally replicated column, a
 * vertically replicated row and a single replicated corner pixel.  The
 * result is then read from the tile at the offset matching how far the
 * window sticks out (limited to 8 pixels per axis).
 *
 * No _mm_empty() is issued here.
 */
void Block_Extract8x8_MMX(
	const Plane_t* plane,
	INT32          x,
	INT32          y,
	UINT8*         block,
	INT32          pitch)
{
	ALIGN(0x10) UINT8 tile[16 * 16];

	/* Raw edge tests on the requested window. */
	const INT32 beforeX = (x < 0);
	const INT32 beyondX = (x + 8 >= plane->CX);
	const INT32 beforeY = (y < 0);
	const INT32 beyondY = (y + 8 >= plane->CY);

	/* A side is padded only when exactly one test of its axis fires. */
	const INT32 padLeft   = (beforeX && !beyondX);
	const INT32 padRight  = (beyondX && !beforeX);
	const INT32 padTop    = (beforeY && !beyondY);
	const INT32 padBottom = (beyondY && !beforeY);

	/* Origin of the clamped 8x8 source block. */
	const INT32 clampedX = beforeX ? 0 : (beyondX ? plane->CX - 8 : x);
	const INT32 clampedY = beforeY ? 0 : (beyondY ? plane->CY - 8 : y);

	const UINT8* src = plane->Plane + clampedY * plane->Pitch + clampedX;

	/* Where the clamped block sits in the tile; later, where the result is read. */
	UINT8* win = tile;

	INT32 shiftX = 0;
	INT32 shiftY = 0;
	INT32 i;

	/* Corner quadrant: one pixel repeated over 8x8, needed when both axes pad. */
	if ((padLeft || padRight) && (padTop || padBottom)) {
		const UINT8* cornerSrc = src;
		UINT8*       cornerDst = tile;
		__m64        fill;

		if (padBottom) {
			cornerSrc += 7 * plane->Pitch;
			cornerDst += 8 * 16;
		}
		if (padRight) {
			cornerSrc += 7;
			cornerDst += 8;
		}

		fill = _mm_set1_pi8(cornerSrc[0]);
		for (i = 0; i < 8; i++) {
			*((__m64*)(cornerDst + 16 * i)) = fill;
		}
	}

	/* Edge quadrants. */
	{
		const UINT8* colSrc = NULL; /* column replicated sideways  */
		UINT8*       colDst = tile;

		const UINT8* rowSrc = NULL; /* row replicated up/downwards */
		UINT8*       rowDst = tile;

		if (padLeft) {
			colSrc  = src;
			rowDst += 8;
			win    += 8;
		} else if (padRight) {
			colSrc  = src + 7;
			colDst += 8;
		}

		if (padTop) {
			rowSrc  = src;
			colDst += 8 * 16;
			win    += 8 * 16;
		} else if (padBottom) {
			rowSrc  = src + 7 * plane->Pitch;
			rowDst += 8 * 16;
		}

		if (colSrc != NULL) {
			for (i = 0; i < 8; i++) {
				*((__m64*)(colDst + 16 * i)) = _mm_set1_pi8(colSrc[i * plane->Pitch]);
			}
		}

		if (rowSrc != NULL) {
			__m64 line = *((const __m64*)rowSrc);
			for (i = 0; i < 8; i++) {
				*((__m64*)(rowDst + 16 * i)) = line;
			}
		}
	}

	/* Clamped source block itself. */
	for (i = 0; i < 8; i++) {
		*((__m64*)(win + 16 * i)) = *((const __m64*)(src + i * plane->Pitch));
	}

	/* Slide the read position by the overhang, limited to [-8, 8] per axis. */
	if (x < 0) {
		shiftX = (x <= -8) ? -8 : x;
	} else if (x > plane->CX - 8) {
		shiftX = x - (plane->CX - 8);
		if (shiftX >= 8) {
			shiftX = 8;
		}
	}
	win += shiftX;

	if (y < 0) {
		shiftY = (y <= -8) ? -8 : y;
	} else if (y > plane->CY - 8) {
		shiftY = y - (plane->CY - 8);
		if (shiftY >= 8) {
			shiftY = 8;
		}
	}
	win += shiftY * 16;

	for (i = 0; i < 8; i++) {
		*((__m64*)(block + i * pitch)) = *((const __m64*)(win + 16 * i));
	}
}

/* */

/*
 * Forward declarations of the file-local helpers defined below.
 * p / pitch describe the destination, r is the reference plane.
 */

/* Copies the 16x16 block of r at (x, y) to p; falls back to four 8x8 copies
   unless the block lies strictly inside r. */
static void MotionComp_Compensate16x16_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y);

/* Copies the 8x8 block of r at (x, y) to p; blocks touching or crossing the
   plane border are fetched through Block_Extract8x8_MMX. */
static void MotionComp_Compensate8x8_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y);

/* Writes to p the per-byte average (rounded down) of the 16x16 blocks of r
   at (x0, y0) and (x1, y1). */
static void MotionComp_Compensate16x16H_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1);

/* Writes to p the per-byte average (rounded down) of the 8x8 blocks of r
   at (x0, y0) and (x1, y1). */
static void MotionComp_Compensate8x8H_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1);

/* */

/*
 * Copies the 16x16 block of reference plane r at (x, y) to p, whose rows are
 * 'pitch' bytes apart.
 *
 * Unless the block lies strictly inside r, the work is handed to
 * MotionComp_Compensate8x8_MMX one 8x8 quadrant at a time.
 */
void MotionComp_Compensate16x16_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y)
{
	const UINT8* src;
	const UINT8* stop;
	UINT8*       dst;

	__m64 left[4];
	__m64 right[4];
	INT32 i;

	if (x < 0 || x + 16 >= r->CX ||
		y < 0 || y + 16 >= r->CY) {
		/* Quadrants in order: top-left, top-right, bottom-left, bottom-right. */
		MotionComp_Compensate8x8_MMX(p,                 pitch, r, x,     y    );
		MotionComp_Compensate8x8_MMX(p + 8,             pitch, r, x + 8, y    );
		MotionComp_Compensate8x8_MMX(p     + 8 * pitch, pitch, r, x,     y + 8);
		MotionComp_Compensate8x8_MMX(p + 8 + 8 * pitch, pitch, r, x + 8, y + 8);
		return;
	}

	src  = r->Plane + y * r->Pitch + x;
	stop = src + 16 * r->Pitch;
	dst  = p;

	/* Four rows per pass: read all of them first, then write them out. */
	while (src < stop) {
		for (i = 0; i < 4; i++) {
			left[i]  = *((const __m64*)(src + 0));
			right[i] = *((const __m64*)(src + 8));
			src += r->Pitch;
		}

		for (i = 0; i < 4; i++) {
			*((__m64*)(dst + 0)) = left[i];
			*((__m64*)(dst + 8)) = right[i];
			dst += pitch;
		}
	}
}

/*
 * Copies the 8x8 block of reference plane r at (x, y) to p, whose rows are
 * 'pitch' bytes apart.
 *
 * A block that touches or crosses the border of r is fetched through
 * Block_Extract8x8_MMX; otherwise the eight rows are read straight from the
 * plane and then written to p.
 */
void MotionComp_Compensate8x8_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x,
	INT32          y)
{
	const UINT8* src;
	INT32        step;

	__m64 rows[8];
	INT32 i;

	if (x < 0 || x + 8 >= r->CX ||
		y < 0 || y + 8 >= r->CY) {
		Block_Extract8x8_MMX(r, x, y, p, pitch);
		return;
	}

	src  = r->Plane + y * r->Pitch + x;
	step = r->Pitch;

	/* Read every row before writing any of them. */
	for (i = 0; i < 8; i++) {
		rows[i] = *((const __m64*)(src + i * step));
	}

	for (i = 0; i < 8; i++) {
		*((__m64*)(p + i * pitch)) = rows[i];
	}
}

/* */

/* Eight bytes of 0xfe: clears bit 0 of every byte so that the 64-bit right
   shift in the averaging code below cannot carry a bit into the neighbouring byte. */
ALIGN(0x10) static const UINT8 MASK_FE[8] = { 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe, 0xfe };

/*
 * Writes to p (rows 'pitch' bytes apart) the per-byte average, rounded down,
 * of the two 16x16 blocks of reference plane r at (x0, y0) and (x1, y1).
 *
 * The average is computed as (a & b) + (((a ^ b) & 0xfe) >> 1).
 *
 * Unless both blocks lie strictly inside r, the work is handed to
 * MotionComp_Compensate8x8H_MMX one 8x8 quadrant at a time.
 */
void MotionComp_Compensate16x16H_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1)
{
	const UINT8* srcA;
	const UINT8* srcB;

	UINT8* dst;
	UINT8* stop;

	__m64 lsbOff;
	INT32 col;

	if (x0 < 0 || x0 + 16 >= r->CX ||
		y0 < 0 || y0 + 16 >= r->CY ||
		x1 < 0 || x1 + 16 >= r->CX ||
		y1 < 0 || y1 + 16 >= r->CY) {
		/* Quadrants in order: top-left, top-right, bottom-left, bottom-right. */
		MotionComp_Compensate8x8H_MMX(p,                 pitch, r, x0,     y0    , x1,     y1    );
		MotionComp_Compensate8x8H_MMX(p + 8,             pitch, r, x0 + 8, y0    , x1 + 8, y1    );
		MotionComp_Compensate8x8H_MMX(p     + 8 * pitch, pitch, r, x0,     y0 + 8, x1,     y1 + 8);
		MotionComp_Compensate8x8H_MMX(p + 8 + 8 * pitch, pitch, r, x0 + 8, y0 + 8, x1 + 8, y1 + 8);
		return;
	}

	srcA = r->Plane + y0 * r->Pitch + x0;
	srcB = r->Plane + y1 * r->Pitch + x1;

	dst  = p;
	stop = dst + 16 * pitch;

	lsbOff = *((const __m64*)MASK_FE);

	/* One row per pass, handled as two 8-byte halves. */
	while (dst < stop) {
		for (col = 0; col < 16; col += 8) {
			const __m64 a      = *((const __m64*)(srcA + col));
			const __m64 b      = *((const __m64*)(srcB + col));
			const __m64 common = _mm_and_si64(a, b);
			const __m64 differ = _mm_and_si64(_mm_xor_si64(a, b), lsbOff);

			*((__m64*)(dst + col)) = _mm_add_pi8(common, _mm_srli_si64(differ, 1));
		}

		srcA += r->Pitch;
		srcB += r->Pitch;
		dst  += pitch;
	}
}

/*
 * Writes to p (rows 'pitch' bytes apart) the per-byte average, rounded down,
 * of the two 8x8 blocks of reference plane r at (x0, y0) and (x1, y1).
 *
 * The average is computed as (a & b) + (((a ^ b) & 0xfe) >> 1).
 *
 * If either block touches or crosses the border of r, both blocks are first
 * fetched into local 8x8 buffers through Block_Extract8x8_MMX.  Pointers into
 * the plane are formed only once both positions are known to be inside it, so
 * no out-of-range pointer arithmetic is performed for off-plane coordinates.
 */
void MotionComp_Compensate8x8H_MMX(
	UINT8*         p,
	INT32          pitch,
	const Plane_t* r,
	INT32          x0,
	INT32          y0,
	INT32          x1,
	INT32          y1)
{
	ALIGN(0x10) UINT8 b0[64], b1[64];

	const UINT8* s0;
	const UINT8* s1;
	INT32        sp; /* row stride shared by both sources */

	INT32 i;

	const __m64 F = *((const __m64*)MASK_FE);

	if (x0 < 0 || x0 + 8 >= r->CX ||
		y0 < 0 || y0 + 8 >= r->CY ||
		x1 < 0 || x1 + 8 >= r->CX ||
		y1 < 0 || y1 + 8 >= r->CY) {
		Block_Extract8x8_MMX(r, x0, y0, b0, 8);
		Block_Extract8x8_MMX(r, x1, y1, b1, 8);

		s0 = b0;
		s1 = b1;
		sp = 8;
	} else {
		s0 = r->Plane + y0 * r->Pitch + x0;
		s1 = r->Plane + y1 * r->Pitch + x1;
		sp = r->Pitch;
	}

	/* Rows are addressed by index so that no pointer is stepped past the last row. */
	for (i = 0; i < 8; i++) {
		const __m64 S0 = *((const __m64*)(s0 + i * sp));
		const __m64 S1 = *((const __m64*)(s1 + i * sp));
		const __m64 D  = _mm_add_pi8(_mm_and_si64(S0, S1), _mm_srli_si64(_mm_and_si64(_mm_xor_si64(S0, S1), F), 1));

		*((__m64*)(p + i * pitch)) = D;
	}
}

/* */

/*
 * Builds the 16x16 block of plane p at (x, y) from reference plane r
 * displaced by motion vector mv.
 *
 * Each vector component is shifted right by one to get the whole-pixel
 * displacement.  When both components are even a single block is copied.
 * Otherwise two reference blocks are averaged: for every odd component the
 * two blocks are one pixel apart on that axis (the second block gets the +1
 * when the component is non-negative, the first block otherwise).
 */
void MotionComp_Block16x16_MMX(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv)
{
	const INT32 oddX = ((mv->X & 1) != 0);
	const INT32 oddY = ((mv->Y & 1) != 0);

	const INT32 baseX = mv->X >> 1;
	const INT32 baseY = mv->Y >> 1;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	INT32 firstX  = baseX;
	INT32 firstY  = baseY;
	INT32 secondX = baseX;
	INT32 secondY = baseY;

	if (!oddX && !oddY) {
		MotionComp_Compensate16x16_MMX(dst, p->Pitch, r, x + baseX, y + baseY);
		return;
	}

	if (mv->X >= 0) {
		secondX += oddX;
	} else {
		firstX += oddX;
	}

	if (mv->Y >= 0) {
		secondY += oddY;
	} else {
		firstY += oddY;
	}

	MotionComp_Compensate16x16H_MMX(
		dst,
		p->Pitch,
		r,
		x + firstX,
		y + firstY,
		x + secondX,
		y + secondY);
}

/*
 * Builds the 8x8 block of plane p at (x, y) from reference plane r displaced
 * by motion vector mv.
 *
 * Each vector component is shifted right by one to get the whole-pixel
 * displacement.  When both components are even a single block is copied.
 * Otherwise two reference blocks are averaged: for every odd component the
 * two blocks are one pixel apart on that axis (the second block gets the +1
 * when the component is non-negative, the first block otherwise).
 */
void MotionComp_Block8x8Y_MMX(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv)
{
	const INT32 oddX = ((mv->X & 1) != 0);
	const INT32 oddY = ((mv->Y & 1) != 0);

	const INT32 baseX = mv->X >> 1;
	const INT32 baseY = mv->Y >> 1;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	INT32 firstX  = baseX;
	INT32 firstY  = baseY;
	INT32 secondX = baseX;
	INT32 secondY = baseY;

	if (!oddX && !oddY) {
		MotionComp_Compensate8x8_MMX(dst, p->Pitch, r, x + baseX, y + baseY);
		return;
	}

	if (mv->X >= 0) {
		secondX += oddX;
	} else {
		firstX += oddX;
	}

	if (mv->Y >= 0) {
		secondY += oddY;
	} else {
		firstY += oddY;
	}

	MotionComp_Compensate8x8H_MMX(
		dst,
		p->Pitch,
		r,
		x + firstX,
		y + firstY,
		x + secondX,
		y + secondY);
}

/*
 * Builds the 8x8 block of plane p at (x, y) from reference plane r, using a
 * vector derived from mv0: each component is shifted right by one and OR-ed
 * with its own low bit.
 * NOTE(review): the "C" suffix suggests this is the chroma variant for
 * half-resolution planes - confirm against the callers.
 *
 * The derived vector is then handled like this: each component is shifted
 * right by one to get the whole-pixel displacement; when both components are
 * even a single block is copied, otherwise two reference blocks one pixel
 * apart on every odd axis are averaged (the second block gets the +1 when
 * the component is non-negative, the first block otherwise).
 */
void MotionComp_Block8x8C_MMX(
	Plane_t*              p,
	INT32                 x,
	INT32                 y,
	const Plane_t*        r,
	const MotionVector_t* mv0)
{
	MotionVector_t scaled = {
		(mv0->X >> 1) | (mv0->X & 1),
		(mv0->Y >> 1) | (mv0->Y & 1)
	};

	const INT32 oddX = ((scaled.X & 1) != 0);
	const INT32 oddY = ((scaled.Y & 1) != 0);

	const INT32 baseX = scaled.X >> 1;
	const INT32 baseY = scaled.Y >> 1;

	UINT8* dst = p->Plane + y * p->Pitch + x;

	INT32 firstX  = baseX;
	INT32 firstY  = baseY;
	INT32 secondX = baseX;
	INT32 secondY = baseY;

	if (!oddX && !oddY) {
		MotionComp_Compensate8x8_MMX(dst, p->Pitch, r, x + baseX, y + baseY);
		return;
	}

	if (scaled.X >= 0) {
		secondX += oddX;
	} else {
		firstX += oddX;
	}

	if (scaled.Y >= 0) {
		secondY += oddY;
	} else {
		firstY += oddY;
	}

	MotionComp_Compensate8x8H_MMX(
		dst,
		p->Pitch,
		r,
		x + firstX,
		y + firstY,
		x + secondX,
		y + secondY);
}

/* */

