/* CSConverter_MMX.c */
/* 2009/07/09        */

#include "StdAfx.h"

#include "TheoraDecoder.h"

#include "CSConverter.h"

/* */

#pragma warning(disable : 4799)

/* */

/*
 * CopyCSC_8
 *
 * Copies exactly cx bytes from s to d.
 *
 *   d  : destination row
 *   s  : source row
 *   cx : number of bytes to copy; nothing is copied when cx <= 0
 *
 * The bulk of the row is moved 8 bytes at a time through __m64 loads and
 * stores; the remaining 0..7 bytes are copied one by one, so the function
 * never touches memory beyond d[cx - 1] / s[cx - 1].
 *
 * This helper does not call _mm_empty(); the callers in this file do so
 * once after all rows have been copied.
 */
static __inline void CopyCSC_8(
	UINT8*       d,
	const UINT8* s,
	INT32        cx)
{
	UINT8*       p = d;
	const UINT8* q = s;

	UINT8* e8;
	UINT8* e;

	if (cx <= 0) {
		return;
	}

	e8 = p + (cx & ~7); /* end of the part made of whole 8-byte words */
	e  = p + cx;        /* end of the row                              */

	for (; p < e8; p += 8, q += 8) {
		*((__m64*)p) = *((const __m64*)q);
	}

	/* Tail: fewer than 8 bytes left. */
	for (; p < e; p++, q++) {
		*p = *q;
	}
}

/* */

/*
 * QT_CSConvert_YV12_MMX
 *
 * Copies the three planes of `output` into the single buffer frame->Frame.
 *
 *   output : source planes; Plane[0] is read with a stride of output->CX,
 *            Plane[1] / Plane[2] with a stride of output->CX / 2
 *   frame  : destination buffer plus the region (X, Y, CX, CY) to copy
 *
 * Destination layout, as computed below:
 *   plane 0 at Frame,                        pitch = Pitch
 *   plane 1 at Frame + Rasters * Pitch,      pitch = Pitch / 2  (from Plane[2])
 *   plane 2 at plane 1 + Rasters * Pitch / 4, pitch = Pitch / 2 (from Plane[1])
 *
 * Source rows are walked towards lower addresses while the destination
 * advances, so the row order is reversed relative to the source planes.
 *
 * NOTE(review): frame->X / frame->Y are presumably expressed in full
 * resolution (plane 0) pixels -- the plane 0 pointer uses them unscaled --
 * so both are halved for the half-size planes.
 */
void QT_CSConvert_YV12_MMX(
	const QT_Output_t* output,
	QT_Frame_t*        frame)
{
	UINT8* pb0 = (UINT8*)(frame->Frame);
	UINT8* pb1 = pb0 + frame->Rasters * frame->Pitch;
	UINT8* pb2 = pb1 + frame->Rasters * frame->Pitch / 4;
	UINT8* end;

	/* First source row to read, counted from the start of the plane. */
	INT32 r0 = output->CY - frame->Y;

	INT32 x2  = frame->X     / 2; /* horizontal offset in the half-width planes */
	INT32 cx2 = frame->CX    / 2; /* bytes copied per half-width row            */
	INT32 sp2 = output->CX   / 2; /* source stride of the half-width planes     */
	INT32 dp2 = frame->Pitch / 2; /* destination pitch of the half-width planes */

	const UINT8* s0 = output->Plane[0] + frame->X + (r0     - 1) * output->CX;
	const UINT8* s1 = output->Plane[2] + x2       + (r0 / 2 - 1) * output->CX / 2;
	const UINT8* s2 = output->Plane[1] + x2       + (r0 / 2 - 1) * output->CX / 2;

	/* Full-resolution plane. */
	end = pb0 + frame->CY * frame->Pitch;
	while (pb0 < end) {
		CopyCSC_8(pb0, s0, frame->CX);
		pb0 += frame->Pitch;
		s0  -= output->CX;
	}

	/*
	 * Half-resolution planes. The source pointer must step by the source
	 * stride (output->CX / 2), not by the copied width, otherwise rows go
	 * out of step whenever frame->CX differs from output->CX.
	 */
	end = pb1 + (frame->CY / 2) * dp2;
	while (pb1 < end) {
		CopyCSC_8(pb1, s1, cx2);
		pb1 += dp2;
		s1  -= sp2;
	}

	end = pb2 + (frame->CY / 2) * dp2;
	while (pb2 < end) {
		CopyCSC_8(pb2, s2, cx2);
		pb2 += dp2;
		s2  -= sp2;
	}

	/* Leave MMX state so that later floating-point code works. */
	_mm_empty();
}

/*
 * QT_CSConvert_YUY2_MMX
 *
 * Interleaves the three planes of `output` into a packed buffer at
 * frame->Frame, two bytes per pixel.
 *
 *   output : source planes; Plane[0] is read with a stride of output->CX,
 *            Plane[1] / Plane[2] with a stride of output->CX / 2
 *   frame  : destination buffer (pitch = frame->Pitch) plus the region
 *            (X, Y, CX, CY) to convert
 *
 * Each outer iteration produces two destination rows from two Plane[0]
 * rows and one shared Plane[1] / Plane[2] row. Each inner iteration
 * consumes 8 bytes of Plane[0] per row and 4 bytes of each of the other
 * planes, and stores 16 bytes per destination row. Consequently the width
 * is processed in units of 8 pixels and the height in units of 2 rows;
 * other sizes are rounded up by the loops.
 *
 * Source rows are walked towards lower addresses while the destination
 * advances, so the row order is reversed relative to the source planes.
 *
 * NOTE(review): presumably Plane[1] = U and Plane[2] = V, which yields the
 * Y U Y V byte order of YUY2 -- confirm against the decoder.
 * NOTE(review): frame->X is presumably in Plane[0] pixels (the Plane[0]
 * pointer uses it unscaled), so it is halved for the half-width planes.
 */
void QT_CSConvert_YUY2_MMX(
	const QT_Output_t* output,
	QT_Frame_t*        frame)
{
	UINT8* pb  = (UINT8*)(frame->Frame);
	UINT8* end = pb + frame->CY * frame->Pitch;

	/* First source row to read, counted from the start of the plane. */
	INT32 r0 = output->CY - frame->Y;

	/* Horizontal offset inside the half-width planes. */
	INT32 x2 = frame->X / 2;

	const UINT8* s0 = output->Plane[0] + frame->X + (r0     - 1) * output->CX;
	const UINT8* s1 = output->Plane[1] + x2       + (r0 / 2 - 1) * output->CX / 2;
	const UINT8* s2 = output->Plane[2] + x2       + (r0 / 2 - 1) * output->CX / 2;

	__m64 Y0, Y1, UV0, UV1;
	__m64 P0, P1;

	for (; pb < end; pb += frame->Pitch * 2, s0 -= output->CX * 2, s1 -= output->CX / 2, s2 -= output->CX / 2) {
		UINT8* pb0 = pb;
		UINT8* pb1 = pb + frame->Pitch;
		UINT8* pe0 = pb + frame->CX * 2;

		const UINT8* y0 = s0;
		const UINT8* y1 = s0 - output->CX;
		const UINT8* u  = s1;
		const UINT8* v  = s2;

		for (; pb0 < pe0; pb0 += 8 * 2, pb1 += 8 * 2, y0 += 8, y1 += 8, u += 4, v += 4) {
			/* 8 bytes from each of the two Plane[0] rows. */
			Y0 = *((const __m64*)y0);
			Y1 = *((const __m64*)y1);

			/* UV0 = u0 v0 u1 v1 u2 v2 u3 v3 */
			UV0 = _mm_unpacklo_pi8(
				_mm_cvtsi32_si64(*((const UINT32*)u)),
				_mm_cvtsi32_si64(*((const UINT32*)v)));

			/* Move the upper half (u2 v2 u3 v3) into the low 4 bytes. */
			UV1 = _mm_unpackhi_pi32(UV0, UV0);

			/* First row: y0 u0 y1 v0 y2 u1 y3 v1 */
			P0 = _mm_unpacklo_pi8(Y0, UV0);

			Y0 = _mm_unpackhi_pi32(Y0, Y0);

			/* First row: y4 u2 y5 v2 y6 u3 y7 v3 */
			P1 = _mm_unpacklo_pi8(Y0, UV1);

			*((__m64*)(pb0 + 0)) = P0;
			*((__m64*)(pb0 + 8)) = P1;

			/* Second row: same UV0 / UV1 bytes, its own Plane[0] bytes. */
			P0 = _mm_unpacklo_pi8(Y1, UV0);

			Y1 = _mm_unpackhi_pi32(Y1, Y1);

			P1 = _mm_unpacklo_pi8(Y1, UV1);

			*((__m64*)(pb1 + 0)) = P0;
			*((__m64*)(pb1 + 8)) = P1;
		}
	}

	/* Leave MMX state so that later floating-point code works. */
	_mm_empty();
}

/* */

