/*************************************************************************************************/
/*!
   	@file		pp_sse_texture_weight16.h
	@author 	Fanzo
*/
/*************************************************************************************************/
#pragma		once

///////////////////////////////////////////////////////////////////////////////////////////////////
//include files
//=================================================================================================
// const
//=================================================================================================
///////////////////////////////
// b


///////////////////////////////
// w













///////////////////////////////
// dw
























///////////////////////////////
// ps













//=================================================================================================
// convert
//=================================================================================================
///////////////////////////////////////////////////////////
// 

///////////////////////////////////////////////////////////
//dw 




///////////////////////////////////////////////////////////
//w






///////////////////////////////////////////////////////////
//b






//=================================================================================================
// cmpadd
//=================================================================================================









//=================================================================================================
// cmpsub
//=================================================================================================









//=================================================================================================
// cmpset
//=================================================================================================









//=================================================================================================
// not
//=================================================================================================

//=================================================================================================
// cmpadd , cmpsub
//=================================================================================================




//=================================================================================================
// set
//=================================================================================================









//=================================================================================================
// set
//=================================================================================================









//=================================================================================================
// rotate
//=================================================================================================
////////////////////////
//ps






////////////////////////
//dw






////////////////////////
//w








////////////////////////
//b
















//=================================================================================================
// fill
//=================================================================================================
////////////////////////
//ps




////////////////////////
//dw




////////////////////////
//w










////////////////////////
//b


















//=================================================================================================
// Load
//=================================================================================================




//=================================================================================================
// Load
//=================================================================================================
//=================================================================================================
// loop
//=================================================================================================
///////////////////////////////////////////////////////////////////////////////////////////////////
// Loop



///////////////////////////////////////////////////////////////////////////////////////////////////
// LoopCount

/*=================================================================================================
[ pixelformat ]
0 <= a <= 255   0 <= r <= 255   0 <= g <= 255   0 <= b <= 255  
0 <= aa <= 255   0 <= ar <= 255   0 <= ag <= 255   0 <= ab <= 255
rgb			byte	[-,r,g,b]
rgba		byte	[a,r,g,b]
a			byte	[a,-,-,-]
wrgb		word	[-,r,g,b]
wrgba		word	[a,r,g,b]
wa			word	[a,-,-,-]
drgb		dword	[-,r,g,b]
drgba		dword	[a,r,g,b]
da			dword	[a,-,-,-]
frgb		float	[-,r,g,b]
frgba		float	[a,r,g,b]
fa			float	[a,-,-,-]

rgb2		[ - , - , rgb,rgb]
rgb4		[rgb,rgb,rgb,rgb]
rgba2		[-,-,rgba,rgba]
rgba4		[rgba,rgba,rgba,rgba]
wrgb2		[wrgb,wrgb]
wrgba2		[wrgba,wrgba]
=================================================================================================*/

//=================================================================================================
// misc
//=================================================================================================
///////////////////////////////////////////////////////////////////////////////////////////////////
//sc_color




//=================================================================================================
// color to
//=================================================================================================
// wrgb

// wrgba

// wa

// frgb

// frgba

// fa

//=================================================================================================
// misc
//=================================================================================================
///////////////////////////////////////////////////////////////////////////////////////////////////
// a to aaaa

///////////////////////////////////////////////////////////////////////////////////////////////////
// a to aaaa


//=================================================================================================
// convert
//=================================================================================================
///////////////////////////////////////////////////////////////////////////////////////////////////
//rgb_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//rgba_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//wrgb_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//wrgba_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//drgb_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//drgba_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//frgb_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//frgba_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//a_to

// rgb

// rgba

// a

// wrgb

// wrgba

// wa

// drgb

// drgba

// da

// frgb

// frgba

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//fa_to

// a

// wa

// da

// fa

///////////////////////////////////////////////////////////////////////////////////////////////////
//wa_to

// a

// wa

// da

// fa

//=================================================================================================
// mul alpha
//=================================================================================================
// wrgb

// wrgba

// wa

// frgb

// frgba

// fa

//=================================================================================================
// convert pack
//=================================================================================================
///////////////////////////////////////////////////////////////////////////////////////////////////
//

///////////////////////////////////////////////////////////////
// rgb

// rgb->rgb2

// rgb2->rgb

// rgb2->rgb4

// rgb4->rgb2

// rgb2->wrgb2

// wrgb2->rgb2

///////////////////////////////////////////////////////////////
// rgba

// rgba->rgba2

// rgba2->rgba

// rgba2->rgba4

// rgba4->rgba2

// rgba2->wrgba2

// wrgba2->rgba2

// wrgba->wrgba2

// wrgba2->wrgba

//=================================================================================================
// load var
//=================================================================================================

// rgb

// rgba

// frgb

// frgba

// wrgb

// wrgba

// drgb

// drgba

//=================================================================================================
// load addr
//=================================================================================================

// rgb

// rgba

// a

//=================================================================================================
// load addr index
//=================================================================================================

// rgb

// rgba

// a


//=================================================================================================
// load addr index
//=================================================================================================

// rgb

// rgba

//=================================================================================================
// load addr byteindex
//=================================================================================================

// rgb

// rgba

//=================================================================================================
// store var
//=================================================================================================

// rgb

// rgba

// frgb

// frgba

// wrgb

// wrgba

// drgb

// drgba

//=================================================================================================
// store addr
//=================================================================================================

// rgb

// rgba

// a

//=================================================================================================
// store addr index
//=================================================================================================

// rgb

// rgba

//=================================================================================================
// add pixel size
//=================================================================================================

// rgb

// rgba

// a

// rgb

// rgba

// rgba

//=================================================================================================
// sub pixel size
//=================================================================================================

// rgb

// rgba

// a

// rgb

// rgba

// rgba

//=================================================================================================
// pixoffset_to_byteoffset
//=================================================================================================


//=================================================================================================
// load addr color16
//=================================================================================================
/*
	destformat	= rgb , rgba , rgbaaa , wrgb , wrgba , wrgbaaa , drgb , drgba , drgbaaa
	srcformat	= rgb , rgba , rgbaaa
	src_pitchbyte_dw4_var	= pitch , pitch , pitch , pitch
	color_var	= color_var_00 , color_var_10 , color_var_20 , color_var_30 , ... , color_var_33
	alpha_var	= alpha_var_00 , alpha_var_10 , alpha_var_20 , alpha_var_30 , ... , alpha_var_33
	x_dw4_xmm	= x3 : x2 : x1 : x0
	y_dw4_xmm	= y3 : y2 : y1 : y0
	w_dw4_xmm	= w : w : w : w
*/

//=================================================================================================
// add pixel
//=================================================================================================
// wrgb

// wrgba

// wa

//=================================================================================================
// shift pixel
//=================================================================================================
// wrgb

// wrgba

// wa

//=================================================================================================
// gradation linear w
//=================================================================================================
//-------------------------------------------------------------------------------------------------

//-------------------------------------------------------------------------------------------------

//-------------------------------------------------------------------------------------------------

//=================================================================================================
// gradation linear f
//=================================================================================================
//-------------------------------------------------------------------------------------------------

//-------------------------------------------------------------------------------------------------

//-------------------------------------------------------------------------------------------------
/*=================================================================================================
[ uvformat ]
fuv		[ 0 , u , 0 , v ]
duv		[ 0 , u , 0 , v ]
wuv		[ 0 , 0 , 0 , u , 0 , 0 , v ]

=================================================================================================*/

//=================================================================================================
// convert
//=================================================================================================
//-------------------------------------------------------------------------------------------------
//fuv to wuv with repeat

//-------------------------------------------------------------------------------------------------
//fuv to wuv with clamp

//-------------------------------------------------------------------------------------------------
//dxy to offset

//-------------------------------------------------------------------------------------------------
// duv to dxy repeat

//-------------------------------------------------------------------------------------------------
// duv to dxy clamp

//-------------------------------------------------------------------------------------------------
// dxy to wxy2 repeat

//-------------------------------------------------------------------------------------------------
// dxy to wxy2 clamp

//-------------------------------------------------------------------------------------------------
// dxy to wxy4 repeat

//-------------------------------------------------------------------------------------------------
// dxy to wxy4 clamp

#pragma pack( push , 8 )		//set align

namespace icubic
{
//=================================================================================================
// weight16
//=================================================================================================
//=================================================================================================

//=================================================================================================

//=================================================================================================

//=================================================================================================
// texture weight16
//=================================================================================================

//=================================================================================================
/*!
	Draws one horizontal span of `len` pixels by sampling `src` through a
	separable 4x4 (16-tap) weighted filter with "repeat" (wrap-around)
	addressing, and writes one 4-byte pixel per step to `dest`.

	For every destination pixel the texture coordinate is scaled to the source
	size, shifted by half a texel, and split into an integer texel position and
	an 8-bit fraction d (0..255).  The 4x4 neighbourhood at offsets -1..+2 is
	fetched (wrapped into the image), converted to float, reduced horizontally
	per row and then vertically with

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	and finally scaled back to bytes with saturation.

	@param	dest			destination span; 4 bytes are stored per pixel, `len` pixels in total.
	@param	len				number of pixels to produce; nothing is written when len <= 0.
	@param	ssu , ssv		texture coordinate at the start of the span.
	@param	ttu , ttv		texture coordinate at the end of the span; the per-pixel step is
							( tt - ss ) / len and sampling starts half a step in.
	@param	src				base address of the source image; each tap reads one dword.
	@param	src_pitchbyte	byte distance between source rows.  NOTE: the row offset is built
							with pmaddwd against a zero-extended word, so only the low
							(signed) 16 bits of the pitch take part in the product.
	@param	src_w , src_h	source size in texels, used for scaling and wrap-around.
	@param	b_color			not used by this repeat variant.
	@param	weighttbl1		weights of the two inner taps; indexed with d and 256-d, i.e. 0..256.
	@param	weighttbl2		weights of the two outer taps; indexed with d and 256-d, i.e. 0..256.

	NOTE(review): the lane comments below assume that the sse_set_* helpers take
	their arguments from the highest lane down to the lowest (so "0 , u , 0 , v"
	puts v in dword 0 and u in dword 2) -- confirm against their definition.
	Inside the _asm block, bracket indices such as c[1*16] or offx.m128_i32[1*4]
	are byte offsets (MSVC inline assembler rule), hence the explicit *16 / *4.
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgb_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );

	// the border colour is irrelevant for repeat addressing
	(void)b_color;
	
	// per-pixel step; sampling starts at the centre of the first step
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions of the sample position
	__m128	offx;		// byte offsets of the 4 tap columns
	__m128	offy;		// byte offsets of the 4 tap rows
	__m128	c[16];		// the 4x4 taps as floats , row major ( c[row*4+col] )
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

			// uv -> 16.16 , keep the 16 fraction bits , scale by w/h to x.8 fixed point
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		// shift by half a texel ( 128 / 256 )
		psubd		xmm4 , const_dw_0_128_0_128
		// low 8 bits = fraction d , kept in dxy
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		// broadcast the integer position into 4 words each and add the tap offsets -1 , 0 , 1 , 2
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// repeat : subtract w/h (twice) while the coordinate is > w-1/h-1
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// repeat : add w/h (twice) while the coordinate is not > -1 , i.e. negative
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			// low 4 words * pitch -> row byte offsets , high 4 words * pixel size -> column byte offsets
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
			// each tap : read one dword , widen byte -> word -> dword , to float , scale by 1/255
			// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

			// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

			// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

			// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// horizontal pass , row 0 -> xmm0 ( d = dword 2 of dxy )
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 : the outer weight belongs to the 4th tap of the row , not to c3 again
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// horizontal pass , row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 : 4th tap of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// horizontal pass , row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 : 4th tap of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// horizontal pass , row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 : 4th tap of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4 ( d = dword 0 of dxy )
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 : 4th row result ( xmm3 )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// back to 0..255 bytes ; the two packs saturate out-of-range values
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		// next destination pixel , next texture coordinate
		add		eax , 4
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Renders one span of `len` output pixels by sampling a 4x4 texel
			neighbourhood (16 taps) per pixel and blending it with two weight tables.

	Texture coordinates run linearly from (ssu,ssv) to (ttu,ttv); each pixel is
	sampled at the centre of its step (start + step/2). Only the 16 fractional
	bits of each coordinate are kept, so coordinates wrap (repeat addressing).
	Each tap reads 4 bytes at src + y * src_pitchbyte + x * PixelSize_rgba().

	@param	dest			output buffer; 4 bytes are stored per pixel.
	@param	len				number of pixels to draw; nothing is drawn when len <= 0.
	@param	ssu , ssv		texture coordinate at the start of the span.
	@param	ttu , ttv		texture coordinate at the end of the span.
	@param	src				first byte of the source texels.
	@param	src_pitchbyte	byte distance between source rows. NOTE(review): it is fed
							to pmaddwd, which multiplies signed 16-bit words, so only
							values that fit in 15 bits are handled - confirm with callers.
	@param	src_w , src_h	source size in texels, used for scaling and wrapping.
	@param	b_color			only used to build sc_color_wrgba, which the loop of this
							variant never reads.
	@param	weighttbl1		weights for the two inner taps; indexed with d and 256-d
							where d is 0..255, so entries 0..256 are read.
	@param	weighttbl2		weights for the two outer taps; indexed like weighttbl1.

	Per axis the blend is
		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
	where c1..c4 are the taps at offsets -1, 0, +1, +2 from the integer coordinate
	(assuming sse_set_w lists words from highest to lowest - TODO confirm).
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgb_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constants shared by the generated sampler family; not all of them are used here.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per-pixel coordinate step and the centre-of-step start position.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the column blend, dword 0 the row blend
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, index = row * 4 + column
	// Inside the asm block a bracket is a plain byte offset, so c[n*16] is the
	// n-th __m128 of c and v.m128_i32[n*4] is the n-th dword of v.
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv				// xmm7 = running texture coordinate
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> 16 fractional bits (wraps) -> texel position with 8 fractional bits
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128		// shift by half a texel (128 / 256)
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF	// fractional part, 0..255
		psrld		xmm4 , 8						// integer part
		movaps		dxy , xmm5

		// broadcast the integer coordinates and add the four tap offsets
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// wrap: subtract the size from anything above size-1 (two passes)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// wrap: add the size to anything negative (two passes);
		// pcmpgtw + pandn with all ones yields the "not greater than -1" mask
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : 16 taps, each 4 bytes -> 4 floats scaled by 1/255
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight : blend the four columns of each row (xmm0..xmm3), then the four rows
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 : fourth column of the row, not the third one again
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// rows -> xmm4, using the fraction held in dword 0 of dxy
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 : fourth row (xmm3), not the third one again
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// back to bytes: scale by 255, convert, saturate to 0..255 and store 4 bytes
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv				// step to the next pixel's coordinate
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Renders one span of `len` output pixels from a one-byte-per-tap source,
			tinting every tap with b_color and filtering a 4x4 neighbourhood (16 taps)
			with two weight tables.

	Texture coordinates run linearly from (ssu,ssv) to (ttu,ttv); each pixel is
	sampled at the centre of its step (start + step/2). Only the 16 fractional
	bits of each coordinate are kept, so coordinates wrap (repeat addressing).
	Each tap reads a single byte at src + y * src_pitchbyte + x * PixelSize_a();
	that byte is replicated into four 16-bit lanes, multiplied by sc_color_wrgba
	and shifted right by 8 before it is converted to float.

	@param	dest			output buffer; 4 bytes are stored per pixel.
	@param	len				number of pixels to draw; nothing is drawn when len <= 0.
	@param	ssu , ssv		texture coordinate at the start of the span.
	@param	ttu , ttv		texture coordinate at the end of the span.
	@param	src				first byte of the source texels.
	@param	src_pitchbyte	byte distance between source rows. NOTE(review): it is fed
							to pmaddwd, which multiplies signed 16-bit words, so only
							values that fit in 15 bits are handled - confirm with callers.
	@param	src_w , src_h	source size in texels, used for scaling and wrapping.
	@param	b_color			colour applied to every tap; r, g and b are pre-multiplied
							by a ( (x * a) >> 8 ) when sc_color_wrgba is built.
	@param	weighttbl1		weights for the two inner taps; indexed with d and 256-d
							where d is 0..255, so entries 0..256 are read.
	@param	weighttbl2		weights for the two outer taps; indexed like weighttbl1.

	Per axis the blend is
		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
	where c1..c4 are the taps at offsets -1, 0, +1, +2 from the integer coordinate
	(assuming sse_set_w lists words from highest to lowest - TODO confirm).
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgb_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constants shared by the generated sampler family; not all of them are used here.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// Tint multiplier: four zero words plus b*a>>8 , g*a>>8 , r*a>>8 , a.
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per-pixel coordinate step and the centre-of-step start position.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the column blend, dword 0 the row blend
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, index = row * 4 + column
	// Inside the asm block a bracket is a plain byte offset, so c[n*16] is the
	// n-th __m128 of c and v.m128_i32[n*4] is the n-th dword of v.
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv				// xmm7 = running texture coordinate
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> 16 fractional bits (wraps) -> texel position with 8 fractional bits
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128		// shift by half a texel (128 / 256)
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF	// fractional part, 0..255
		psrld		xmm4 , 8						// integer part
		movaps		dxy , xmm5

		// broadcast the integer coordinates and add the four tap offsets
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// wrap: subtract the size from anything above size-1 (two passes)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// wrap: add the size to anything negative (two passes);
		// pcmpgtw + pandn with all ones yields the "not greater than -1" mask
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : 16 taps. Each tap reads one byte into cl (the upper bytes of ecx
		// still hold address bits; they end up in lanes that sc_color_wrgba
		// multiplies by zero), spreads it over four words, applies the tint,
		// and converts the result to floats scaled by 1/255.
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight : blend the four columns of each row (xmm0..xmm3), then the four rows
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 : fourth column of the row, not the third one again
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 : fourth column of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// rows -> xmm4, using the fraction held in dword 0 of dxy
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 : fourth row (xmm3), not the third one again
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// back to bytes: scale by 255, convert, saturate to 0..255 and store 4 bytes
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv				// step to the next pixel's coordinate
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Draws one span of @a len pixels by sampling a 4x4 texel neighbourhood
			of @a src per output pixel and combining it with two weight tables.

	For every output pixel the texture coordinate is advanced by
	( tt - ss ) / len, starting half a step after ( ssu , ssv ).  Only the low
	16 bits of the coordinate scaled by 65536 are kept, and the resulting texel
	indices are wrapped back into [0 , src_w) / [0 , src_h) (repeat addressing).
	Each of the four rows is filtered horizontally and the four row results are
	then filtered vertically with

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	where d is the 8-bit sub-texel fraction.

	@param	dest			output; one 4-byte pixel is stored per iteration
	@param	len				number of pixels to write (nothing is written when <= 0)
	@param	ssu , ssv		texture coordinate at the start of the span
	@param	ttu , ttv		texture coordinate at the end of the span
	@param	src				source texels, addressed as src + y*src_pitchbyte + x*PixelSize_rgb()
	@param	src_pitchbyte	byte distance between two source rows
	@param	src_w , src_h	source size in texels
	@param	b_color			not referenced by the assembly of this variant
	@param	weighttbl1		weight table for the two inner taps; read at indices d and 256-d
	@param	weighttbl2		weight table for the two outer taps; read at indices d and 256-d

	@note	d lies in [0 , 255], so both tables are read at indices 0..256.
	@note	MSVC inline assembly: inside the _asm block "[k]" is a byte offset,
			so c[k*16] is the k-th __m128 of c and m128_i32[k*4] the k-th dword.
	@note	The fourth tap of every sum used to reuse the third sample
			( c[2] , c[6] , c[10] , c[14] and xmm2 ), which left c[3] , c[7] ,
			c[11] , c[15] and xmm3 unread.  It now uses the fourth sample, as
			the formula above requires.
	@note	NOTE(review): every texel is fetched with a 4-byte movd; if
			PixelSize_rgb() is smaller than 4 the fetch of the last texel reads
			past that texel - confirm the source buffer tolerates this.
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgba_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constant vectors.  Several of them (and sc_color_wrgba / dwhm1 below) are
	// not referenced by the assembly of this variant.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per-pixel coordinate step and the start coordinate (sampled half a step in).
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-texel fractions: dword 2 = x , dword 0 = y
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 texels as floats scaled by 1/255 , row major
	_asm
	{
		// eax = write cursor , xmm7 = current (u,v)
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) -> fixed point: keep the low 16 bits of uv*65536 , scale by the
		// texture size , drop to 8 fractional bits and shift by half a texel (128).
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		// low 8 bits = weight table index , remaining bits = integer texel index
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast the integer index over four words per axis and add the
		// per-tap offsets from const_w_2_1_0_m1_2_1_0_m1
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// repeat wrap: subtract the size while index > size-1 (two passes) ...
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// ... and add the size while index <= -1 (two passes)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		// low four words -> row byte offsets , high four words -> column byte offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load: c[row*4+col] = bytes of texel( offx[col] , offy[row] ) as floats * (1/255)
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight
		// horizontal pass , row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// horizontal pass , row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// horizontal pass , row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// horizontal pass , row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 (fourth row result)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// back to bytes: scale by 255 , convert , pack with saturation
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		// force the byte selected by const_b_scala_0xFF to 0xFF
		// (presumably the alpha byte of the output pixel - confirm lane order)
		por			xmm4 , const_b_scala_0xFF
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Draws one span of @a len pixels by sampling a 4x4 texel neighbourhood
			of @a src per output pixel and combining it with two weight tables.
			All four bytes of every source texel are filtered and written.

	For every output pixel the texture coordinate is advanced by
	( tt - ss ) / len, starting half a step after ( ssu , ssv ).  Only the low
	16 bits of the coordinate scaled by 65536 are kept, and the resulting texel
	indices are wrapped back into [0 , src_w) / [0 , src_h) (repeat addressing).
	Each of the four rows is filtered horizontally and the four row results are
	then filtered vertically with

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	where d is the 8-bit sub-texel fraction.

	@param	dest			output; one 4-byte pixel is stored per iteration
	@param	len				number of pixels to write (nothing is written when <= 0)
	@param	ssu , ssv		texture coordinate at the start of the span
	@param	ttu , ttv		texture coordinate at the end of the span
	@param	src				source texels, addressed as src + y*src_pitchbyte + x*PixelSize_rgba()
	@param	src_pitchbyte	byte distance between two source rows
	@param	src_w , src_h	source size in texels
	@param	b_color			not referenced by the assembly of this variant
	@param	weighttbl1		weight table for the two inner taps; read at indices d and 256-d
	@param	weighttbl2		weight table for the two outer taps; read at indices d and 256-d

	@note	d lies in [0 , 255], so both tables are read at indices 0..256.
	@note	MSVC inline assembly: inside the _asm block "[k]" is a byte offset,
			so c[k*16] is the k-th __m128 of c and m128_i32[k*4] the k-th dword.
	@note	The fourth tap of every sum used to reuse the third sample
			( c[2] , c[6] , c[10] , c[14] and xmm2 ), which left c[3] , c[7] ,
			c[11] , c[15] and xmm3 unread.  It now uses the fourth sample, as
			the formula above requires.
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgba_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constant vectors.  Several of them (and sc_color_wrgba / dwhm1 below) are
	// not referenced by the assembly of this variant.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per-pixel coordinate step and the start coordinate (sampled half a step in).
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-texel fractions: dword 2 = x , dword 0 = y
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 texels as floats scaled by 1/255 , row major
	_asm
	{
		// eax = write cursor , xmm7 = current (u,v)
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) -> fixed point: keep the low 16 bits of uv*65536 , scale by the
		// texture size , drop to 8 fractional bits and shift by half a texel (128).
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		// low 8 bits = weight table index , remaining bits = integer texel index
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast the integer index over four words per axis and add the
		// per-tap offsets from const_w_2_1_0_m1_2_1_0_m1
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// repeat wrap: subtract the size while index > size-1 (two passes) ...
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// ... and add the size while index <= -1 (two passes)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		// low four words -> row byte offsets , high four words -> column byte offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load: c[row*4+col] = bytes of texel( offx[col] , offy[row] ) as floats * (1/255)
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight
		// horizontal pass , row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// horizontal pass , row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// horizontal pass , row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// horizontal pass , row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 (fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 (fourth row result)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// back to bytes: scale by 255 , convert , pack with saturation
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
cb_inline
void pp_sse_texture_weight16f_m_repeat_rgba_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;
	__m128	offx;
	__m128	offy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
			
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Writes one span of 8-bit values: the 4x4 weighted ("weight16") filter
			result for a source format that supplies no per-texel value.

	All 16 filter taps are the low lane of const_f_scala_1 (presumably 1.0f), so
	the texels never influence the result. The source image is therefore not
	read at all; only the 8-bit sub-texel fractions of (u,v), which select the
	filter weights, are computed per pixel.

	Per axis, with d the 8-bit fraction and t0..t3 the four taps:
		t0*weighttbl2[d] + t1*weighttbl1[d] + t2*weighttbl1[256-d] + t3*weighttbl2[256-d]
	The x pass is evaluated for each of the four rows, then the y pass combines
	the four row results. The sum is scaled by 255, converted to integer and
	saturated to a byte.

	@param	dest			Output buffer; one byte per pixel, len bytes in total.
	@param	len				Number of pixels. Nothing is written when len <= 0.
	@param	ssu				u at the start of the span.
	@param	ssv				v at the start of the span.
	@param	ttu				u at the end of the span.
	@param	ttv				v at the end of the span.
	@param	src				Not read by this variant (same parameter list as the
							sibling samplers).
	@param	src_pitchbyte	Not used by this variant.
	@param	src_w			Source width in texels; scales the wrapped fraction of u.
	@param	src_h			Source height in texels; scales the wrapped fraction of v.
	@param	b_color			Not used by this variant.
	@param	weighttbl1		Weights of the two inner taps. Indexed with d and 256-d
							(d in 0..255), so it must hold at least 257 floats.
	@param	weighttbl2		Weights of the two outer taps; same indexing as weighttbl1.

	NOTE(review): assumes sse_set_ps / sse_set_dw order their arguments like
	_mm_set_ps / _mm_set_epi32 (last argument in lane 0) - confirm.
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_a_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	
	// per-pixel step; sampling starts half a step into the span
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dxy;		// 8-bit weight indices: dword 2 is used for the x pass, dword 0 for the y pass
	__m128	c[16];		// the 4x4 taps, row-major ( c[row*4 + column] )
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv

		// every tap is the same constant, so the tap array is filled once,
		// outside the loop, and no texel is fetched
		movaps	xmm4 , const_f_scala_1
		movaps	c[0*16] , xmm4
		movaps	c[1*16] , xmm4
		movaps	c[2*16] , xmm4
		movaps	c[3*16] , xmm4
		movaps	c[4*16] , xmm4
		movaps	c[5*16] , xmm4
		movaps	c[6*16] , xmm4
		movaps	c[7*16] , xmm4
		movaps	c[8*16] , xmm4
		movaps	c[9*16] , xmm4
		movaps	c[10*16] , xmm4
		movaps	c[11*16] , xmm4
		movaps	c[12*16] , xmm4
		movaps	c[13*16] , xmm4
		movaps	c[14*16] , xmm4
		movaps	c[15*16] , xmm4

	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) -> 16-bit fraction -> texel position with 8 fraction bits,
		// moved back by half a texel; only the low 8 bits (the weight index) are kept
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		pand		xmm4 , const_dw_0_0xFF_0_0xFF		
		movaps		dxy , xmm4

		// weight
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

		// ---- x pass , row 0 : c[0..3] -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// ---- x pass , row 1 : c[4..7] -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// ---- x pass , row 2 : c[8..11] -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// ---- x pass , row 3 : c[12..15] -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// ---- y pass : rows xmm0..xmm3 -> xmm4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		// scale by 255 , convert , saturate to a byte and store it
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Writes one span of 8-bit values sampled from a 4-byte-per-texel source
			with a 4x4 weighted ("weight16") filter and wrap-around addressing.

	For every output pixel:
	  1. (u,v) is reduced to its 16-bit fraction, scaled by the source size and
	     moved back by half a texel. The low 8 bits become the weight index d,
	     the remaining bits the integer texel coordinate.
	  2. The tap offsets -1, 0, +1, +2 are added on both axes and each coordinate
	     is wrapped into 0..size-1 (two subtract passes, two add passes).
	  3. The 16 texels are fetched and converted to float / 255. Only the lowest
	     lane, i.e. byte 0 of each texel, takes part in the filter (presumably
	     the alpha byte - confirm against the pixel layout).
	  4. Per axis, with t0..t3 the four taps:
	       t0*weighttbl2[d] + t1*weighttbl1[d] + t2*weighttbl1[256-d] + t3*weighttbl2[256-d]
	     The x pass runs for each of the four rows, then the y pass combines the
	     four row results. Each tap reads its own sample; in particular the
	     fourth tap reads c[3] / c[7] / c[11] / c[15] and row 3 (xmm3).
	  5. The result is scaled by 255, converted to integer and saturated to a byte.

	@param	dest			Output buffer; one byte per pixel, len bytes in total.
	@param	len				Number of pixels. Nothing is written when len <= 0.
	@param	ssu				u at the start of the span.
	@param	ssv				v at the start of the span.
	@param	ttu				u at the end of the span.
	@param	ttv				v at the end of the span.
	@param	src				First byte of the source image.
	@param	src_pitchbyte	Byte distance between source rows. The row offset is
							formed with pmaddwd, a 16-bit multiply, so the value
							must lie in 0..32767.
	@param	src_w			Source width in texels.
	@param	src_h			Source height in texels.
	@param	b_color			Not used by this variant.
	@param	weighttbl1		Weights of the two inner taps. Indexed with d and 256-d
							(d in 0..255), so it must hold at least 257 floats.
	@param	weighttbl2		Weights of the two outer taps; same indexing as weighttbl1.

	NOTE(review): assumes sse_set_ps / sse_set_dw / sse_set_w order their
	arguments like the _mm_set_* intrinsics (last argument in lane 0) - confirm.
*/
cb_inline
void pp_sse_texture_weight16f_m_repeat_a_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	
	// per-pixel step; sampling starts half a step into the span
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit weight indices: dword 2 is used for the x pass, dword 0 for the y pass
	__m128	offx;		// byte offsets of the four tap columns inside a row
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps, row-major ( c[row*4 + column] )
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) -> 16-bit fraction -> texel position with 8 fraction bits,
		// moved back by half a texel
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128

		// low 8 bits -> weight index , remaining bits -> integer texel coordinate
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast each coordinate over its four words and add the tap offsets -1..+2
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// wrap , upper side : subtract the size where coordinate > size-1 ( two passes )
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5

		// wrap , lower side : add the size where coordinate is negative ( two passes )
		// pandn with all ones inverts the ( coordinate > -1 ) mask
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		// low four words * pitch -> row offsets , high four words * pixel size -> column offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : fetch the 16 texels , widen the bytes to dwords , convert to float / 255

		// ---- row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// ---- row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// ---- row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// ---- row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// only the lowest lane ( byte 0 of each texel ) is filtered

		// ---- x pass , row 0 : c[0..3] -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// ---- x pass , row 1 : c[4..7] -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// ---- x pass , row 2 : c[8..11] -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// ---- x pass , row 3 : c[12..15] -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// ---- y pass : rows xmm0..xmm3 -> xmm4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		// scale by 255 , convert , saturate to a byte and store it
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
cb_inline
void pp_sse_texture_weight16f_m_repeat_a_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;
	__m128	offx;
	__m128	offy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
			
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}

//=================================================================================================
/*!
	@brief	Fills one span of `len` output pixels by sampling `src` through a
			4x4 (16 tap) separable weighted filter with clamped addressing.

	For every output pixel the (u,v) coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv) and sampled at the centre of each step.
	The coordinate is scaled by 65536, clamped to [0,0xFFFF] and multiplied by
	(src_w-1 , src_h-1), which yields an integer texel position plus an 8 bit
	fraction d (0..255) per axis.  The 4x4 texels at offsets -1..+2 around that
	position (each clamped to [0,size-1]) are blended as

		row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	first along x for each of the four rows, then along y over the four row
	results.

	@param	dest			Output buffer; 4 bytes are stored per pixel and the pointer advances by 4.
	@param	len				Number of pixels to produce (decremented in place; nothing is written when <= 0).
	@param	ssu , ssv		Texture coordinate at the start of the span.
	@param	ttu , ttv		Texture coordinate at the end of the span.
	@param	src				Source texture base address.
	@param	src_pitchbyte	Source row stride in bytes.  pmaddwd only uses the low signed
							16 bits of this value, so it must be below 32768.
	@param	src_w , src_h	Source size in texels.
	@param	b_color			Only feeds sc_color_wrgba, which this variant never reads.
	@param	weighttbl1		Weights for the two inner taps; indexed with d and 256-d,
							so entries 0..256 must be readable.
	@param	weighttbl2		Weights for the two outer taps; same index range.

	NOTE(review): every texel is fetched with a 4 byte movd while the x step is
	PixelSize_rgb(); if that size is smaller than 4 the load touches bytes of
	the following texel (and past the buffer for the very last one) - confirm
	against the pixel format definition.
*/
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgb_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// The declarations below are identical to the sibling variants of this routine.
	// Not referenced by the asm in this variant: const_dw_0xFFFFFFFF , const_w_m1 ,
	// const_dw_0_0xFFFF_0_0xFFFF , const_b_scala_0xFF , const_f_scala_1 ,
	// const_dw_0_128_0_128 , sc_color_wrgba , dwh , wh.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per pixel coordinate step; the first sample sits half a step into the span.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8 bit fractions : dword 2 = x fraction , dword 0 = y fraction
	__m128	offx;		// byte offsets of the 4 sampled columns
	__m128	offy;		// byte offsets of the 4 sampled rows
	__m128	c[16];		// the 4x4 texels as floats in 0..1 , row major ( c[row*4+col] )
	// Inside the asm block an index on a C symbol is a plain byte offset,
	// hence c[n*16] addresses element n and m128_i32[n*4] addresses lane n.
	_asm
	{
		mov		eax , dest				// eax = running output pointer
		movaps	xmm7 , suv				// xmm7 = current (u,v)
	len_loop_start:
		dec		len
		jl		len_loop_end

		// ---- coordinate -> texel position + fraction ----
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF	// clamp to [0,0xFFFF]
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1			// * (size-1) on dword lanes 0 and 2
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8				// integer texel position
		movaps		dxy , xmm5				// 8 bit fraction used as weight table index
		pshufhw		xmm4 , xmm4 , 0x00		// broadcast x into words 4..7
		pshuflw		xmm4 , xmm4 , 0x00		// broadcast y into words 0..3
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1	// neighbours -1 , 0 , +1 , +2
		pminsw		xmm4 , whm1				// clamp to size-1
		pmaxsw		xmm4 , const_w_0		// clamp to 0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch			// row index * pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte			// column index * bytes per texel
		movaps		offx , xmm4

		// ---- load 4x4 texels , bytes -> floats scaled by 1/255 ----
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- horizontal pass : one weighted sum per row ----
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4   ( "1-d" is index 256-d )

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0			// broadcast the weight to all 4 channels
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4 : fourth column ( the third column must not be re-used here )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- vertical pass : blend the four row results with the y fraction -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4 : fourth row result
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- floats -> 4 saturated bytes , store , advance ----
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv				// next (u,v)
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Fills one span of `len` output pixels by sampling `src` through a
			4x4 (16 tap) separable weighted filter with clamped addressing.
			The source column step is PixelSize_rgba().

	For every output pixel the (u,v) coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv) and sampled at the centre of each step.
	The coordinate is scaled by 65536, clamped to [0,0xFFFF] and multiplied by
	(src_w-1 , src_h-1), which yields an integer texel position plus an 8 bit
	fraction d (0..255) per axis.  The 4x4 texels at offsets -1..+2 around that
	position (each clamped to [0,size-1]) are blended as

		row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	first along x for each of the four rows, then along y over the four row
	results.

	@param	dest			Output buffer; 4 bytes are stored per pixel and the pointer advances by 4.
	@param	len				Number of pixels to produce (decremented in place; nothing is written when <= 0).
	@param	ssu , ssv		Texture coordinate at the start of the span.
	@param	ttu , ttv		Texture coordinate at the end of the span.
	@param	src				Source texture base address; each texel is fetched as 4 bytes.
	@param	src_pitchbyte	Source row stride in bytes.  pmaddwd only uses the low signed
							16 bits of this value, so it must be below 32768.
	@param	src_w , src_h	Source size in texels.
	@param	b_color			Only feeds sc_color_wrgba, which this variant never reads.
	@param	weighttbl1		Weights for the two inner taps; indexed with d and 256-d,
							so entries 0..256 must be readable.
	@param	weighttbl2		Weights for the two outer taps; same index range.
*/
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgb_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// The declarations below are identical to the sibling variants of this routine.
	// Not referenced by the asm in this variant: const_dw_0xFFFFFFFF , const_w_m1 ,
	// const_dw_0_0xFFFF_0_0xFFFF , const_b_scala_0xFF , const_f_scala_1 ,
	// const_dw_0_128_0_128 , sc_color_wrgba , dwh , wh.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// Per pixel coordinate step; the first sample sits half a step into the span.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8 bit fractions : dword 2 = x fraction , dword 0 = y fraction
	__m128	offx;		// byte offsets of the 4 sampled columns
	__m128	offy;		// byte offsets of the 4 sampled rows
	__m128	c[16];		// the 4x4 texels as floats in 0..1 , row major ( c[row*4+col] )
	// Inside the asm block an index on a C symbol is a plain byte offset,
	// hence c[n*16] addresses element n and m128_i32[n*4] addresses lane n.
	_asm
	{
		mov		eax , dest				// eax = running output pointer
		movaps	xmm7 , suv				// xmm7 = current (u,v)
	len_loop_start:
		dec		len
		jl		len_loop_end

		// ---- coordinate -> texel position + fraction ----
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF	// clamp to [0,0xFFFF]
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1			// * (size-1) on dword lanes 0 and 2
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8				// integer texel position
		movaps		dxy , xmm5				// 8 bit fraction used as weight table index
		pshufhw		xmm4 , xmm4 , 0x00		// broadcast x into words 4..7
		pshuflw		xmm4 , xmm4 , 0x00		// broadcast y into words 0..3
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1	// neighbours -1 , 0 , +1 , +2
		pminsw		xmm4 , whm1				// clamp to size-1
		pmaxsw		xmm4 , const_w_0		// clamp to 0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch			// row index * pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte			// column index * bytes per texel
		movaps		offx , xmm4

		// ---- load 4x4 texels , bytes -> floats scaled by 1/255 ----
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- horizontal pass : one weighted sum per row ----
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4   ( "1-d" is index 256-d )

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0			// broadcast the weight to all 4 channels
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4 : fourth column ( the third column must not be re-used here )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- vertical pass : blend the four row results with the y fraction -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4 : fourth row result
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- floats -> 4 saturated bytes , store , advance ----
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv				// next (u,v)
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Draws one span of pixels by sampling a single-byte-per-texel source through a
			4x4 table-weighted filter with clamp addressing, tinted by b_color.

	For each of the `len` output pixels the texture coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv) and evaluated at the centre of the step. The coordinate is
	scaled by 65536, clamped to [0,65535], multiplied by (size-1) and split into an integer
	texel index and an 8-bit fraction d. Sixteen texels around that index are fetched
	(indices clamped to [0,size-1]), filtered horizontally and then vertically as

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	and the result is scaled by 255, saturated to bytes and stored as 4 bytes at dest.

	@param	dest			output span; 4 bytes are written per pixel.
	@param	len				number of pixels to draw; nothing is written when len <= 0.
	@param	ssu,ssv			texture coordinate at the start of the span.
	@param	ttu,ttv			texture coordinate at the end of the span.
	@param	src				source image base address.
	@param	src_pitchbyte	source row stride in bytes. It is multiplied with pmaddwd, so it
							has to fit in a signed 16-bit value.
	@param	src_w,src_h		source size in texels.
	@param	b_color			tint; r,g,b are pre-multiplied by a (>>8) and applied to the
							byte fetched from each texel.
	@param	weighttbl1		float weight table, read at indices d and 256-d (0..256).
	@param	weighttbl2		float weight table, read at indices d and 256-d (0..256).

	@note	MSVC inline assembly does not scale array subscripts, so c[k*16] and
			m128_i32[k*4] are byte offsets addressing element k.
	@note	The name suggests an rgb destination and an alpha-only source; the exact pixel
			layouts depend on PixelSize_a()/sse_set_* which are defined elsewhere - confirm.
*/
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgb_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// tint as 16-bit words: colour channels pre-multiplied by alpha, plus alpha itself
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step; sampling starts half a step in (pixel centre)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: element 2 drives the horizontal pass, element 0 the vertical pass
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// fetched texels as floats, c[row*4 + column]
	_asm
	{
		mov		eax , dest				// eax : write cursor
		movaps	xmm7 , suv				// xmm7: running texture coordinate
	len_loop_start:
		dec		len
		jl		len_loop_end

		// ---- coordinate -> integer texel index + 8-bit fraction
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000				// coordinate * 65536
		maxps		xmm4 , const_f_0					// clamp to >= 0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF	// clamp to <= 65535
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1						// * (size-1): 16 fractional bits
		psrld		xmm4 , 8							// keep 8 fractional bits
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		// d = fraction, 0..255
		psrld		xmm4 , 8							// integer texel index
		movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00					// broadcast element-2 index to words 4..7
		pshuflw		xmm4 , xmm4 , 0x00					// broadcast element-0 index to words 0..3
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1	// four taps per axis
		pminsw		xmm4 , whm1							// clamp addressing: tap <= size-1
		pmaxsw		xmm4 , const_w_0					//                   tap >= 0

		// ---- offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) as byte offsets
		// (tap order follows const_w_2_1_0_m1_2_1_0_m1; presumably -1,0,+1,+2 from
		//  element 0 upward - confirm sse_set_w argument order)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch						// row index * pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte						// column index * texel size
		movaps		offx , xmm4

		// ---- load 4x4 texels.
		// For each texel: fetch one byte, replicate it into the low four words, multiply by
		// the tint words, >>8, widen to dwords, convert to float and scale by 1/255.
		// The upper bytes of ecx still hold address bits, but they land in words that are
		// multiplied by the zero half of sc_color_wrgba.

		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight: horizontal pass, one accumulator per row (xmm0..xmm3)
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy element 2
		// Each of the four taps must use its own texel: c4 is c[row*4+3].

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- weight: vertical pass over the four row results, d = dxy element 0
		// c=tbl2[d]*row0 + tbl1[d]*row1 + tbl1[256-d]*row2 + tbl2[256-d]*row3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- back to bytes: *255, round to int, saturate (signed word, then unsigned byte)
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv				// advance texture coordinate
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Draws one span of pixels by sampling a source image through a 4x4
			table-weighted filter with clamp addressing, forcing one output byte to 0xFF.

	For each of the `len` output pixels the texture coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv) and evaluated at the centre of the step. The coordinate is
	scaled by 65536, clamped to [0,65535], multiplied by (size-1) and split into an integer
	texel index and an 8-bit fraction d. Sixteen texels around that index are fetched
	(indices clamped to [0,size-1]), filtered horizontally and then vertically as

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	and the result is scaled by 255, saturated to bytes, OR-ed with const_b_scala_0xFF and
	stored as 4 bytes at dest.

	@param	dest			output span; 4 bytes are written per pixel.
	@param	len				number of pixels to draw; nothing is written when len <= 0.
	@param	ssu,ssv			texture coordinate at the start of the span.
	@param	ttu,ttv			texture coordinate at the end of the span.
	@param	src				source image base address; texel stride is PixelSize_rgb().
	@param	src_pitchbyte	source row stride in bytes. It is multiplied with pmaddwd, so it
							has to fit in a signed 16-bit value.
	@param	src_w,src_h		source size in texels.
	@param	b_color			only used to build sc_color_wrgba, which the loop of this
							variant never reads; the tint is not applied here.
	@param	weighttbl1		float weight table, read at indices d and 256-d (0..256).
	@param	weighttbl2		float weight table, read at indices d and 256-d (0..256).

	@note	MSVC inline assembly does not scale array subscripts, so c[k*16] and
			m128_i32[k*4] are byte offsets addressing element k.
	@note	NOTE(review): every texel is fetched with a 4-byte movd. If PixelSize_rgb() is
			smaller than 4 the fetch reaches past the texel (and past the image for the
			last texel) - confirm the source buffer is padded accordingly.
*/
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgba_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step; sampling starts half a step in (pixel centre)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: element 2 drives the horizontal pass, element 0 the vertical pass
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// fetched texels as floats, c[row*4 + column]
	_asm
	{
		mov		eax , dest				// eax : write cursor
		movaps	xmm7 , suv				// xmm7: running texture coordinate
	len_loop_start:
		dec		len
		jl		len_loop_end

		// ---- coordinate -> integer texel index + 8-bit fraction
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000				// coordinate * 65536
		maxps		xmm4 , const_f_0					// clamp to >= 0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF	// clamp to <= 65535
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1						// * (size-1): 16 fractional bits
		psrld		xmm4 , 8							// keep 8 fractional bits
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		// d = fraction, 0..255
		psrld		xmm4 , 8							// integer texel index
		movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00					// broadcast element-2 index to words 4..7
		pshuflw		xmm4 , xmm4 , 0x00					// broadcast element-0 index to words 0..3
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1	// four taps per axis
		pminsw		xmm4 , whm1							// clamp addressing: tap <= size-1
		pmaxsw		xmm4 , const_w_0					//                   tap >= 0

		// ---- offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) as byte offsets
		// (tap order follows const_w_2_1_0_m1_2_1_0_m1; presumably -1,0,+1,+2 from
		//  element 0 upward - confirm sse_set_w argument order)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch						// row index * pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte						// column index * texel size
		movaps		offx , xmm4

		// ---- load 4x4 texels.
		// For each texel: fetch 4 bytes, widen bytes -> words -> dwords, convert to float
		// and scale by 1/255.

		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight: horizontal pass, one accumulator per row (xmm0..xmm3)
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy element 2
		// Each of the four taps must use its own texel: c4 is c[row*4+3].

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- weight: vertical pass over the four row results, d = dxy element 0
		// c=tbl2[d]*row0 + tbl1[d]*row1 + tbl1[256-d]*row2 + tbl2[256-d]*row3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- back to bytes: *255, round to int, saturate (signed word, then unsigned byte)
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		por			xmm4 , const_b_scala_0xFF	// force one byte to 0xFF (presumably alpha - confirm sse_set_ub order)
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv				// advance texture coordinate
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Draws one span of pixels by sampling a 4-byte-per-fetch source image through a
			4x4 table-weighted filter with clamp addressing.

	For each of the `len` output pixels the texture coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv) and evaluated at the centre of the step. The coordinate is
	scaled by 65536, clamped to [0,65535], multiplied by (size-1) and split into an integer
	texel index and an 8-bit fraction d. Sixteen texels around that index are fetched
	(indices clamped to [0,size-1]), filtered horizontally and then vertically as

		c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	and the result is scaled by 255, saturated to bytes and stored as 4 bytes at dest.

	@param	dest			output span; 4 bytes are written per pixel.
	@param	len				number of pixels to draw; nothing is written when len <= 0.
	@param	ssu,ssv			texture coordinate at the start of the span.
	@param	ttu,ttv			texture coordinate at the end of the span.
	@param	src				source image base address; texel stride is PixelSize_rgba().
	@param	src_pitchbyte	source row stride in bytes. It is multiplied with pmaddwd, so it
							has to fit in a signed 16-bit value.
	@param	src_w,src_h		source size in texels.
	@param	b_color			only used to build sc_color_wrgba, which the loop of this
							variant never reads; the tint is not applied here.
	@param	weighttbl1		float weight table, read at indices d and 256-d (0..256).
	@param	weighttbl2		float weight table, read at indices d and 256-d (0..256).

	@note	MSVC inline assembly does not scale array subscripts, so c[k*16] and
			m128_i32[k*4] are byte offsets addressing element k.
*/
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgba_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step; sampling starts half a step in (pixel centre)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: element 2 drives the horizontal pass, element 0 the vertical pass
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// fetched texels as floats, c[row*4 + column]
	_asm
	{
		mov		eax , dest				// eax : write cursor
		movaps	xmm7 , suv				// xmm7: running texture coordinate
	len_loop_start:
		dec		len
		jl		len_loop_end

		// ---- coordinate -> integer texel index + 8-bit fraction
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000				// coordinate * 65536
		maxps		xmm4 , const_f_0					// clamp to >= 0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF	// clamp to <= 65535
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1						// * (size-1): 16 fractional bits
		psrld		xmm4 , 8							// keep 8 fractional bits
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		// d = fraction, 0..255
		psrld		xmm4 , 8							// integer texel index
		movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00					// broadcast element-2 index to words 4..7
		pshuflw		xmm4 , xmm4 , 0x00					// broadcast element-0 index to words 0..3
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1	// four taps per axis
		pminsw		xmm4 , whm1							// clamp addressing: tap <= size-1
		pmaxsw		xmm4 , const_w_0					//                   tap >= 0

		// ---- offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) as byte offsets
		// (tap order follows const_w_2_1_0_m1_2_1_0_m1; presumably -1,0,+1,+2 from
		//  element 0 upward - confirm sse_set_w argument order)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch						// row index * pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte						// column index * texel size
		movaps		offx , xmm4

		// ---- load 4x4 texels.
		// For each texel: fetch 4 bytes, widen bytes -> words -> dwords, convert to float
		// and scale by 1/255.

		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight: horizontal pass, one accumulator per row (xmm0..xmm3)
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy element 2
		// Each of the four taps must use its own texel: c4 is c[row*4+3].

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- weight: vertical pass over the four row results, d = dxy element 0
		// c=tbl2[d]*row0 + tbl1[d]*row1 + tbl1[256-d]*row2 + tbl2[256-d]*row3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- back to bytes: *255, round to int, saturate (signed word, then unsigned byte)
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv				// advance texture coordinate
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//-------------------------------------------------------------------------------------------------
// pp_sse_texture_weight16f_m_clamp_rgba_a
//
// Renders one span of `len` output pixels (4 bytes each) into `dest`. Every output pixel is a
// 4x4-tap weighted sample of the source texture. Each tap reads a single source byte, replicates
// it to four channels and scales the channels by `b_color` (the r, g and b factors are first
// multiplied by b_color.a and shifted right by 8, see sc_color_wrgba).
//
//	dest			: output buffer, advanced by 4 bytes per pixel
//	len				: number of pixels to write; nothing is written when len <= 0
//	ssu , ssv		: texture coordinate at the start of the span
//	ttu , ttv		: texture coordinate at the end of the span; the per-pixel step is
//					  ( tt - ss ) / len and the first sample is taken half a step in
//	src				: base address of the source texels
//	src_pitchbyte	: byte distance between source rows; the row offset is built with pmaddwd,
//					  so only the low 16 bits (as a signed word) take part in the multiply
//	src_w , src_h	: source size in texels; tap coordinates are clamped to 0 .. size-1
//	b_color			: colour applied to every tap
//	weighttbl1/2	: float weight tables indexed with the 8-bit fraction d (0..255) and with
//					  256-d, so both need at least 257 entries. weighttbl2 weights taps 1 and 4
//					  of a row/column, weighttbl1 weights taps 2 and 3.
//
// Coordinates are clamped to 0 .. 0xFFFF/0x10000 before being scaled by (size-1).
// The prologue of constants is shared with the sibling variants; not every entry is referenced
// by this variant.
// NOTE(review): which of u/v lands in which SSE lane depends on the argument order of the
// sse_set_* helpers, which are defined elsewhere.
//
// Fix: the fourth tap of every row now reads c[3] / c[7] / c[11] / c[15] and the fourth row
// result (xmm3) is now used by the final pass. Previously the third tap / third row was weighted
// twice and the fourth one was computed but ignored, which contradicted the formula
//	c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
//
// MSVC 32-bit inline assembly; uses eax, ecx, edx and xmm0-xmm7.
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_clamp_rgba_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// per-channel colour factors as 16-bit words; r, g and b are pre-multiplied by alpha
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step and start coordinate (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as float colours, c[row*4 + column]
	// NOTE: inside the _asm block an index on a C++ symbol is a BYTE offset,
	// hence c[k*16] and m128_i32[k*4].
	_asm
	{
		mov			eax , dest
		movaps		xmm7 , suv				// xmm7 = running texture coordinate
	len_loop_start:
		dec			len
		jl			len_loop_end

		//-----------------------------------------------------------------------------------
		// coordinate -> clamped fixed point -> integer texel + 8-bit fraction
		//-----------------------------------------------------------------------------------
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1			// scale by (size-1)
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8				// integer texel coordinate
		movaps		dxy , xmm5				// fraction 0..255
		// broadcast each integer coordinate to four words, add the tap offsets, clamp
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch			// row index * pitch (16-bit signed multiply)
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte			// column index * pixel size
		movaps		offx , xmm4

		//-----------------------------------------------------------------------------------
		// load the 16 taps
		// Only cl is replaced by the texel byte; the address bits left in the rest of ecx
		// end up in the upper words, which the final punpcklwd drops.
		//-----------------------------------------------------------------------------------
		// c[0] : row 0 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		// c[1] : row 0 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		// c[2] : row 0 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		// c[3] : row 0 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// c[4] : row 1 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		// c[5] : row 1 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		// c[6] : row 1 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		// c[7] : row 1 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// c[8] : row 2 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		// c[9] : row 2 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		// c[10] : row 2 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		// c[11] : row 2 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// c[12] : row 3 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		// c[13] : row 3 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		// c[14] : row 3 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		// c[15] : row 3 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		//-----------------------------------------------------------------------------------
		// weight
		// c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// The four row passes use the fraction in dxy dword 2, the final pass dxy dword 0.
		//-----------------------------------------------------------------------------------
		// ---- row 0 -> xmm0 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4 (fixed: was c[2*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// ---- row 1 -> xmm1 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4 (fixed: was c[6*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// ---- row 2 -> xmm2 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4 (fixed: was c[10*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// ---- row 3 -> xmm3 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4 (fixed: was c[14*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- combine the four rows -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4 (fixed: was xmm2)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		//-----------------------------------------------------------------------------------
		// float colour -> 4 saturated bytes, store, advance
		//-----------------------------------------------------------------------------------
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//-------------------------------------------------------------------------------------------------
// pp_sse_texture_weight16f_m_clamp_a_rgb
//
// Renders one span of `len` output bytes into `dest`. The value of every tap is the constant
// const_f_scala_1 (presumably 1.0f in the lowest lane - the lane order of sse_set_ps is defined
// elsewhere), so the result depends only on the weight tables and the coordinate fractions; the
// source texels are never used.
//
//	dest			: output buffer, advanced by 1 byte per pixel
//	len				: number of pixels to write; nothing is written when len <= 0
//	ssu , ssv		: texture coordinate at the start of the span
//	ttu , ttv		: texture coordinate at the end of the span; the per-pixel step is
//					  ( tt - ss ) / len and the first sample is taken half a step in
//	src				: not read by this variant (kept for a signature parallel to the siblings)
//	src_pitchbyte	: not used by the sampling loop of this variant
//	src_w , src_h	: source size in texels; only used to scale the coordinate fractions
//	b_color			: only feeds sc_color_wrgba, which the loop does not reference
//	weighttbl1/2	: float weight tables indexed with the 8-bit fraction d (0..255) and with
//					  256-d, so both need at least 257 entries
//
// Changes against the previous revision:
//	* Each tap used to execute a 4-byte `movd` from the source and then immediately overwrite
//	  the register with const_f_scala_1. The loads were dead, and a 4-byte read on a source
//	  addressed with PixelSize_rgb() strides can touch bytes past the last texel. The loads
//	  and the offset arithmetic that only fed them are removed.
//	* The tap array is loop-invariant and is now filled once before the loop.
//	* The fourth tap of each row reads c[3] / c[7] / c[11] / c[15] and the final pass uses the
//	  fourth row (xmm3), matching the formula below. All taps are equal in this variant, so
//	  this does not change the output.
//
// The prologue of constants is shared with the sibling variants; not every entry is referenced
// by this variant.
// MSVC 32-bit inline assembly; uses eax, ecx, edx and xmm0-xmm4, xmm6, xmm7.
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_clamp_a_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step and start coordinate (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	c[16];		// the 4x4 taps, c[row*4 + column]; all equal to const_f_scala_1 here
	(void)src;			// the source texels do not contribute to the result of this variant
	// NOTE: inside the _asm block an index on a C++ symbol is a BYTE offset,
	// hence c[k*16] and m128_i32[k*4].
	_asm
	{
		mov			eax , dest
		movaps		xmm7 , suv				// xmm7 = running texture coordinate

		//-----------------------------------------------------------------------------------
		// taps : constant for the whole span, filled once
		//-----------------------------------------------------------------------------------
		movaps		xmm4 , const_f_scala_1
		movaps		c[0*16] , xmm4
		movaps		c[1*16] , xmm4
		movaps		c[2*16] , xmm4
		movaps		c[3*16] , xmm4
		movaps		c[4*16] , xmm4
		movaps		c[5*16] , xmm4
		movaps		c[6*16] , xmm4
		movaps		c[7*16] , xmm4
		movaps		c[8*16] , xmm4
		movaps		c[9*16] , xmm4
		movaps		c[10*16] , xmm4
		movaps		c[11*16] , xmm4
		movaps		c[12*16] , xmm4
		movaps		c[13*16] , xmm4
		movaps		c[14*16] , xmm4
		movaps		c[15*16] , xmm4

	len_loop_start:
		dec			len
		jl			len_loop_end

		//-----------------------------------------------------------------------------------
		// coordinate -> clamped fixed point -> 8-bit fraction
		//-----------------------------------------------------------------------------------
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1			// scale by (size-1)
		psrld		xmm4 , 8
		pand		xmm4 , const_dw_0_0xFF_0_0xFF
		movaps		dxy , xmm4				// fraction 0..255

		//-----------------------------------------------------------------------------------
		// weight (scalar : only the lowest lane is processed)
		// c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		//-----------------------------------------------------------------------------------
		// ---- row 0 -> xmm0 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// ---- row 1 -> xmm1 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// ---- row 2 -> xmm2 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// ---- row 3 -> xmm3 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// ---- combine the four rows -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		//-----------------------------------------------------------------------------------
		// float -> saturated byte, store the lowest byte, advance
		//-----------------------------------------------------------------------------------
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		ecx , xmm4
		mov			[ eax ] , cl
		add			eax , 1
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//-------------------------------------------------------------------------------------------------
// pp_sse_texture_weight16f_m_clamp_a_rgba
//
// Renders one span of `len` output bytes into `dest`. Every output byte is a 4x4-tap weighted
// sample of the source texture. Each tap loads one dword from the source and converts its four
// bytes to floats scaled by 1/255; the weighting then runs on the lowest lane only (mulss/addss),
// i.e. on the lowest-addressed byte of each source dword.
//
//	dest			: output buffer, advanced by 1 byte per pixel
//	len				: number of pixels to write; nothing is written when len <= 0
//	ssu , ssv		: texture coordinate at the start of the span
//	ttu , ttv		: texture coordinate at the end of the span; the per-pixel step is
//					  ( tt - ss ) / len and the first sample is taken half a step in
//	src				: base address of the source texels
//	src_pitchbyte	: byte distance between source rows; the row offset is built with pmaddwd,
//					  so only the low 16 bits (as a signed word) take part in the multiply
//	src_w , src_h	: source size in texels; tap coordinates are clamped to 0 .. size-1
//	b_color			: only feeds sc_color_wrgba, which the loop does not reference
//	weighttbl1/2	: float weight tables indexed with the 8-bit fraction d (0..255) and with
//					  256-d, so both need at least 257 entries. weighttbl2 weights taps 1 and 4
//					  of a row/column, weighttbl1 weights taps 2 and 3.
//
// Coordinates are clamped to 0 .. 0xFFFF/0x10000 before being scaled by (size-1).
// The prologue of constants is shared with the sibling variants; not every entry is referenced
// by this variant.
// NOTE(review): which of u/v lands in which SSE lane depends on the argument order of the
// sse_set_* helpers, which are defined elsewhere.
//
// Fix: the fourth tap of every row now reads c[3] / c[7] / c[11] / c[15] and the fourth row
// result (xmm3) is now used by the final pass. Previously the third tap / third row was weighted
// twice and the fourth one was computed but ignored, which contradicted the formula
//	c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
//
// MSVC 32-bit inline assembly; uses eax, ecx, edx and xmm0-xmm7.
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_clamp_a_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step and start coordinate (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, c[row*4 + column]
	// NOTE: inside the _asm block an index on a C++ symbol is a BYTE offset,
	// hence c[k*16] and m128_i32[k*4].
	_asm
	{
		mov			eax , dest
		movaps		xmm7 , suv				// xmm7 = running texture coordinate
	len_loop_start:
		dec			len
		jl			len_loop_end

		//-----------------------------------------------------------------------------------
		// coordinate -> clamped fixed point -> integer texel + 8-bit fraction
		//-----------------------------------------------------------------------------------
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1			// scale by (size-1)
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8				// integer texel coordinate
		movaps		dxy , xmm5				// fraction 0..255
		// broadcast each integer coordinate to four words, add the tap offsets, clamp
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch			// row index * pitch (16-bit signed multiply)
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte			// column index * pixel size
		movaps		offx , xmm4

		//-----------------------------------------------------------------------------------
		// load the 16 taps : dword -> 4 bytes -> 4 floats in 0..1
		//-----------------------------------------------------------------------------------
		// c[0] : row 0 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		// c[1] : row 0 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		// c[2] : row 0 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		// c[3] : row 0 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// c[4] : row 1 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		// c[5] : row 1 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		// c[6] : row 1 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		// c[7] : row 1 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// c[8] : row 2 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		// c[9] : row 2 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		// c[10] : row 2 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		// c[11] : row 2 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// c[12] : row 3 , column 0
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		// c[13] : row 3 , column 1
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		// c[14] : row 3 , column 2
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		// c[15] : row 3 , column 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		//-----------------------------------------------------------------------------------
		// weight (scalar : only the lowest lane of every tap is processed)
		// c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
		// The four row passes use the fraction in dxy dword 2, the final pass dxy dword 0.
		//-----------------------------------------------------------------------------------
		// ---- row 0 -> xmm0 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6
		// c4 (fixed: was c[2*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// ---- row 1 -> xmm1 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6
		// c4 (fixed: was c[6*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// ---- row 2 -> xmm2 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6
		// c4 (fixed: was c[10*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// ---- row 3 -> xmm3 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6
		// c4 (fixed: was c[14*16])
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// ---- combine the four rows -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
		// c4 (fixed: was xmm2)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		//-----------------------------------------------------------------------------------
		// float -> saturated byte, store the lowest byte, advance
		//-----------------------------------------------------------------------------------
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		ecx , xmm4
		mov			[ eax ] , cl
		add			eax , 1
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Draws one span of `len` single-byte pixels, sampling a single-byte-per-sample
// source with a 4x4 (16 tap) weighted filter and clamped addressing.
//
//   dest            output span; one byte is written per pixel.
//   len             pixel count; the parameter itself is decremented as the loop
//                   counter, and nothing is drawn when it is <= 0.
//   ssu,ssv         texture coordinate at the start of the span.
//   ttu,ttv         texture coordinate at the end of the span. The per-pixel step
//                   is (tt - ss) / len and sampling starts half a step in.
//                   Coordinates are scaled by 65536 and clamped to [0,0xFFFF], so
//                   they are presumably normalized 0..1 -- TODO confirm with callers.
//   src             source image base address.
//   src_pitchbyte   source row pitch in bytes. It is applied through pmaddwd, so
//                   only a signed 16-bit pitch is multiplied correctly.
//   src_w,src_h     source size in pixels; sample positions are clamped to
//                   [0,src_w-1] x [0,src_h-1].
//   b_color         only used to build sc_color_wrgba, which the asm never reads.
//   weighttbl1/2    float weight tables indexed by the 8-bit fraction d (0..255)
//                   and by 256-d, i.e. entries 0..256 must be readable.
//
// Per axis the filter is
//   c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
// applied first along each of the four sampled rows, then across the four row
// results. The fourth term uses the fourth sample (c[3], c[7], c[11], c[15] and
// the fourth row result in xmm3); earlier revisions multiplied the third sample
// a second time and left the fourth one unread.
//
// NOTE: inside the _asm block, `c[n*16]` and `.m128_i32[n*4]` are byte offsets,
// i.e. they address element n.
cb_inline
void pp_sse_texture_weight16f_m_clamp_a_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	
	// per-pixel coordinate step; the first sample is taken half a step in
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 samples, row-major, scaled by 1/255
	_asm
	{
		// eax = write cursor , xmm7 = current (u,v)
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			// (u,v) -> 16.16 fixed point clamped to [0,0xFFFF], times (size-1)
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		// low 8 bits = filter fraction d , remaining bits = integer texel position
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		// broadcast each integer position to four words, add the tap offsets (-1..2), clamp to the image
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
			// Each sample: one source byte is read into cl; the upper bytes of ecx still
			// hold address bits, but only the lowest float lane is consumed below.
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6
		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6
		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6
		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6
		// combine the four row results (xmm0..xmm3) with the other axis' fraction -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6

		// c4		(fourth row result)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6
			
		// back to 0..255, saturate to a byte and store one pixel
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
			// advance (u,v) to the next pixel
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}

//=================================================================================================
// texture weight16 alpha
//=================================================================================================

//=================================================================================================
// Draws one span of `len` pixels, sampling the source with a 4x4 (16 tap)
// weighted filter and repeat (wrap-around) addressing, then scaling the
// filtered colour by alpha.a / 256.
//
//   dest            output span; each pixel is stored with a 4-byte movd and the
//                   cursor advances by 4 bytes.
//   len             pixel count; the parameter itself is decremented as the loop
//                   counter, and nothing is drawn when it is <= 0.
//   ssu,ssv         texture coordinate at the start of the span.
//   ttu,ttv         texture coordinate at the end of the span. The per-pixel step
//                   is (tt - ss) / len and sampling starts half a step in.
//                   Only the low 16 bits of coordinate*65536 are kept, so the
//                   coordinate wraps with period 1.0.
//   src             source image base address. Every sample is fetched with a
//                   4-byte load while the column stride is PixelSize_rgb().
//                   NOTE(review): if PixelSize_rgb() is 3 the load touches one
//                   byte beyond the pixel -- confirm the buffer allows that.
//   src_pitchbyte   source row pitch in bytes. It is applied through pmaddwd, so
//                   only a signed 16-bit pitch is multiplied correctly.
//   src_w,src_h     source size in pixels; tap positions outside the image are
//                   wrapped by adding/subtracting the size (up to twice).
//   b_color         only used to build sc_color_wrgba, which the asm never reads.
//   alpha           alpha.a / 256 multiplies all four output channels.
//   weighttbl1/2    float weight tables indexed by the 8-bit fraction d (0..255)
//                   and by 256-d, i.e. entries 0..256 must be readable.
//
// Per axis the filter is
//   c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
// applied first along each of the four sampled rows, then across the four row
// results. The fourth term uses the fourth sample (c[3], c[7], c[11], c[15] and
// the fourth row result in xmm3); earlier revisions multiplied the third sample
// a second time and left the fourth one unread.
//
// NOTE: inside the _asm block, `c[n*16]` and `.m128_i32[n*4]` are byte offsets,
// i.e. they address element n.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgb_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel coordinate step; the first sample is taken half a step in
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 samples, row-major, four channels each, scaled by 1/255
	_asm
	{
		// eax = write cursor , xmm7 = current (u,v)
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			// (u,v) -> 16-bit fraction (wraps), times size, minus half a texel (128 in .8 fixed point)
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		// low 8 bits = filter fraction d , remaining bits = integer texel position
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		// broadcast each integer position to four words and add the tap offsets (-1..2)
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// wrap: positions > size-1 get size subtracted (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// wrap: negative positions get size added (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
			// Each sample: 4 source bytes -> 4 floats scaled by 1/255
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// Each weight is loaded as a scalar and broadcast to all four lanes (shufps 0).
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6
		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6
		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6
		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6
		// combine the four row results (xmm0..xmm3) with the other axis' fraction -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		(fourth row result)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// apply alpha, back to 0..255, saturate to bytes and store 4 bytes
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
			// advance (u,v) to the next pixel
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Draws one span of `len` pixels, sampling the source with a 4x4 (16 tap)
// weighted filter and repeat (wrap-around) addressing, then scaling the
// filtered colour by alpha.a / 256. The source column stride is PixelSize_rgba().
//
//   dest            output span; each pixel is stored with a 4-byte movd and the
//                   cursor advances by 4 bytes.
//   len             pixel count; the parameter itself is decremented as the loop
//                   counter, and nothing is drawn when it is <= 0.
//   ssu,ssv         texture coordinate at the start of the span.
//   ttu,ttv         texture coordinate at the end of the span. The per-pixel step
//                   is (tt - ss) / len and sampling starts half a step in.
//                   Only the low 16 bits of coordinate*65536 are kept, so the
//                   coordinate wraps with period 1.0.
//   src             source image base address; every sample is a 4-byte load.
//   src_pitchbyte   source row pitch in bytes. It is applied through pmaddwd, so
//                   only a signed 16-bit pitch is multiplied correctly.
//   src_w,src_h     source size in pixels; tap positions outside the image are
//                   wrapped by adding/subtracting the size (up to twice).
//   b_color         only used to build sc_color_wrgba, which the asm never reads.
//   alpha           alpha.a / 256 multiplies all four output channels.
//   weighttbl1/2    float weight tables indexed by the 8-bit fraction d (0..255)
//                   and by 256-d, i.e. entries 0..256 must be readable.
//
// Per axis the filter is
//   c = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4
// applied first along each of the four sampled rows, then across the four row
// results. The fourth term uses the fourth sample (c[3], c[7], c[11], c[15] and
// the fourth row result in xmm3); earlier revisions multiplied the third sample
// a second time and left the fourth one unread.
//
// NOTE: inside the _asm block, `c[n*16]` and `.m128_i32[n*4]` are byte offsets,
// i.e. they address element n.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgb_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel coordinate step; the first sample is taken half a step in
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 drives the row passes, dword 0 the final pass
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 samples, row-major, four channels each, scaled by 1/255
	_asm
	{
		// eax = write cursor , xmm7 = current (u,v)
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			// (u,v) -> 16-bit fraction (wraps), times size, minus half a texel (128 in .8 fixed point)
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128
		// low 8 bits = filter fraction d , remaining bits = integer texel position
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		// broadcast each integer position to four words and add the tap offsets (-1..2)
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// wrap: positions > size-1 get size subtracted (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// wrap: negative positions get size added (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
			// Each sample: 4 source bytes -> 4 floats scaled by 1/255
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// Each weight is loaded as a scalar and broadcast to all four lanes (shufps 0).
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6
		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6
		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6
		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		(fourth sample of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6
		// combine the four row results (xmm0..xmm3) with the other axis' fraction -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		(fourth row result)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// apply alpha, back to 0..255, saturate to bytes and store 4 bytes
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
			// advance (u,v) to the next pixel
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Renders one scanline of `len` destination pixels by sampling a source whose pixel stride is
// PixelSize_a() (one byte is read per sample) through a 4x4 weighted filter with wrap-around
// ("repeat") addressing.
//
//   dest                    : output buffer; 4 bytes are stored per pixel.
//   len                     : number of pixels to produce; the loop body is skipped when len <= 0.
//   ssu , ssv / ttu , ttv   : start / end texture coordinates. The per-pixel step is
//                             (tt - ss) / len and sampling starts half a step in.
//                             Only the 16-bit fraction of each coordinate is used, which is what
//                             makes the addressing repeat.
//   src , src_pitchbyte     : source base address and row pitch in bytes.
//   src_w , src_h           : source size in pixels.
//   b_color                 : colour multiplied into every source sample (r,g,b pre-scaled by a).
//   alpha                   : overall opacity; the filtered result is scaled by alpha.a / 256.
//   weighttbl1 , weighttbl2 : filter weight tables, indexed with the 8-bit fraction d and with
//                             256 - d. NOTE(review): this implies at least 257 entries per
//                             table -- confirm against the table builder.
//
// Lane layout notes below assume the sse_set_* helpers take their arguments from the highest
// lane down to the lowest (like _mm_set_*) -- TODO confirm. Under that assumption lane 0 / the
// low words carry v / y and lane 2 / the high words carry u / x.
//
// Inside the _asm block an expression such as c[3*16] or dxy.m128_i32[2*4] is a plain byte
// offset, i.e. c[3*16] is the fourth __m128 of c and m128_i32[2*4] is the third dword.
//
// Several constants declared below (const_b_scala_0xFF , const_f_scala_1 ,
// const_f_0_0xFFFF_0_0xFFFF , const_f_0 , dwhm1) are not referenced by this variant.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgb_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// Blend colour as 16-bit words: b,g,r are pre-multiplied by b_color.a ( >> 8 ), a is kept as is.
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	// Global opacity broadcast to all four lanes, as alpha.a / 256.
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel texture coordinate step; sampling starts half a step in (suv).
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-pixel fractions, in dwords 0 and 2
	__m128	offx;		// byte offsets of the four sample columns
	__m128	offy;		// byte offsets of the four sample rows
	__m128	c[16];		// the 4x4 samples as floats, row-major: c[row*4 + col]
	_asm
	{
		mov			eax , dest				// eax : running destination pointer
		movaps		xmm7 , suv				// xmm7: running texture coordinate
	len_loop_start:
		dec			len
		jl			len_loop_end

		// ---- coordinate -> integer texel + 8-bit fraction ---------------------------------
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF	// keep the 16-bit fraction only (repeat)
		pmuludq		xmm4 , dwh							// scale by the source width / height
		psrld		xmm4 , 8							// now 8 fractional bits
		psubd		xmm4 , const_dw_0_128_0_128			// shift back by half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8							// integer texel index
		movaps		dxy , xmm5							// fractions for the weight lookup

		// ---- 4 neighbouring columns / rows (-1 , 0 , +1 , +2) wrapped into the source ------
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// coord > size-1  ->  coord -= size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// coord < 0  ->  coord += size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// ---- load the 16 samples ----------------------------------------------------------
		// Each sample: read one byte, replicate it into four words, multiply by the blend
		// colour ( >> 8 ), then convert to float and scale by 1/255. Only the low byte of ecx
		// is the sample; the address bits left in the upper bytes land in words that
		// sc_color_wrgba multiplies by zero.

		// row 0 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		// row 0 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		// row 0 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		// row 0 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		// row 1 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		// row 1 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		// row 1 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		// row 2 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		// row 2 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		// row 2 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		// row 3 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		// row 3 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		// row 3 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight -----------------------------------------------------------------------
		// Each row is reduced with the fraction stored in dxy dword 2; the four row results
		// (xmm0..xmm3) are then reduced with the fraction stored in dxy dword 0.
		// In every pass "1-d" is the table index 256 - d.

		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// combine the four rows -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 (fourth row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- apply opacity, convert to 8-bit with saturation and store --------------------
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv				// advance the texture coordinate
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Renders one scanline of `len` destination pixels by sampling a source whose pixel stride is
// PixelSize_rgb() through a 4x4 weighted filter with wrap-around ("repeat") addressing.
//
//   dest                    : output buffer; 4 bytes are stored per pixel.
//   len                     : number of pixels to produce; the loop body is skipped when len <= 0.
//   ssu , ssv / ttu , ttv   : start / end texture coordinates. The per-pixel step is
//                             (tt - ss) / len and sampling starts half a step in.
//                             Only the 16-bit fraction of each coordinate is used, which is what
//                             makes the addressing repeat.
//   src , src_pitchbyte     : source base address and row pitch in bytes.
//   src_w , src_h           : source size in pixels.
//   b_color                 : only used to build sc_color_wrgba, which the asm below never
//                             references -- this variant does not apply the blend colour.
//   alpha                   : overall opacity; the filtered result is scaled by alpha.a / 256.
//   weighttbl1 , weighttbl2 : filter weight tables, indexed with the 8-bit fraction d and with
//                             256 - d. NOTE(review): this implies at least 257 entries per
//                             table -- confirm against the table builder.
//
// Every sample is fetched with a 4-byte load regardless of PixelSize_rgb().
// NOTE(review): if PixelSize_rgb() is smaller than 4, the load touches bytes of the following
// pixel (and possibly past the end of the buffer for the last pixel) -- confirm.
//
// Lane layout notes below assume the sse_set_* helpers take their arguments from the highest
// lane down to the lowest (like _mm_set_*) -- TODO confirm. Under that assumption lane 0 / the
// low words carry v / y and lane 2 / the high words carry u / x.
//
// Inside the _asm block an expression such as c[3*16] or dxy.m128_i32[2*4] is a plain byte
// offset, i.e. c[3*16] is the fourth __m128 of c and m128_i32[2*4] is the third dword.
//
// Several locals declared below (sc_color_wrgba , const_f_scala_1 , const_f_0_0xFFFF_0_0xFFFF ,
// const_f_0 , dwhm1) are not referenced by this variant.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgba_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// Blend colour as 16-bit words (b,g,r pre-multiplied by a). Not referenced by the asm below.
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	// Global opacity broadcast to all four lanes, as alpha.a / 256.
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel texture coordinate step; sampling starts half a step in (suv).
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-pixel fractions, in dwords 0 and 2
	__m128	offx;		// byte offsets of the four sample columns
	__m128	offy;		// byte offsets of the four sample rows
	__m128	c[16];		// the 4x4 samples as floats, row-major: c[row*4 + col]
	_asm
	{
		mov			eax , dest				// eax : running destination pointer
		movaps		xmm7 , suv				// xmm7: running texture coordinate
	len_loop_start:
		dec			len
		jl			len_loop_end

		// ---- coordinate -> integer texel + 8-bit fraction ---------------------------------
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF	// keep the 16-bit fraction only (repeat)
		pmuludq		xmm4 , dwh							// scale by the source width / height
		psrld		xmm4 , 8							// now 8 fractional bits
		psubd		xmm4 , const_dw_0_128_0_128			// shift back by half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8							// integer texel index
		movaps		dxy , xmm5							// fractions for the weight lookup

		// ---- 4 neighbouring columns / rows (-1 , 0 , +1 , +2) wrapped into the source ------
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// coord > size-1  ->  coord -= size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// coord < 0  ->  coord += size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// ---- load the 16 samples ----------------------------------------------------------
		// Each sample: read 4 bytes, widen every byte to a dword, convert to float and
		// scale by 1/255.

		// row 0 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		// row 0 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		// row 0 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		// row 0 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		// row 1 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		// row 1 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		// row 1 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		// row 2 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		// row 2 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		// row 2 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3 , col 0
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		// row 3 , col 1
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		// row 3 , col 2
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		// row 3 , col 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight -----------------------------------------------------------------------
		// Each row is reduced with the fraction stored in dxy dword 2; the four row results
		// (xmm0..xmm3) are then reduced with the fraction stored in dxy dword 0.
		// In every pass "1-d" is the table index 256 - d.

		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 (fourth column of the row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// combine the four rows -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 (fourth row)
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// ---- apply opacity, convert to 8-bit with saturation and store --------------------
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		// Force one byte of the packed pixel to 0xFF (presumably the destination alpha
		// byte; which byte depends on sse_set_ub's argument order -- TODO confirm).
		por			xmm4 , const_b_scala_0xFF
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv				// advance the texture coordinate
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Renders one span of `len` RGBA pixels into `dest`, sampling an RGBA source
// through a 4x4 table-weighted filter with repeat (wrap-around) addressing and
// scaling the filtered colour by alpha.a / 256.
//
//	dest           output span; 4 bytes are written per pixel.
//	len            number of pixels; also the divisor of the per-pixel (u,v) step.
//	ssu,ssv        texture coordinate at the start of the span.
//	ttu,ttv        texture coordinate at the end of the span.
//	               Sampling happens at pixel centres (start + step/2 + i*step).
//	               Only the low 16 bits of coord*65536 are kept, so coordinates
//	               are normalised and wrap.
//	src            first byte of the source image.
//	src_pitchbyte  bytes per source row. pmaddwd multiplies signed 16-bit words,
//	               so this presumably has to stay below 32768 - NOTE(review): confirm.
//	src_w,src_h    source size in texels.
//	b_color        only feeds sc_color_wrgba, which the asm of this variant never reads.
//	alpha          alpha.a / 256 scales all four output channels.
//	weighttbl1/2   float weight tables indexed by d and by 256-d with d in 0..255,
//	               i.e. entries 0..256 are read.
//
// NOTE(review): lane comments below assume the sse_set_* helpers take their
// arguments highest lane first (like _mm_set_*), so lane 0 carries v / y and
// lane 2 carries u / x - confirm against the helper definitions.
//
// Fix relative to the previous revision: the fourth filter tap (c4) now uses
// the fourth texel of each row (c[3], c[7], c[11], c[15]) and the fourth row
// (xmm3) in the vertical pass. Previously the third tap was applied twice and
// the fourth texel column / row was loaded and filtered but never used.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgba_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel (u,v) step and the first sample position (half a step in)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// sub-texel fractions (0..255) in lanes 0 and 2
	__m128	offx;		// byte offsets of the four sample columns
	__m128	offy;		// byte offsets of the four sample rows
	__m128	c[16];		// the 4x4 texels as floats, row-major: c[row*4 + column]
	_asm
	{
		mov		eax , dest					// eax : running output pointer
		movaps	xmm7 , suv					// xmm7 : current (u,v), stepped by duv
	len_loop_start:
		dec		len
		jl		len_loop_end				// leave once len has dropped below zero

		// ---- fixed-point texel position ----
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF	// keep the 16 fraction bits of the coordinate
		pmuludq		xmm4 , dwh					// * texture size
		psrld		xmm4 , 8					// keep 8 sub-texel bits
		psubd		xmm4 , const_dw_0_128_0_128	// shift by half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8					// integer texel index
		movaps		dxy , xmm5					// 8-bit fractions used as weight table indices

		// ---- four neighbouring indices per axis, wrapped into the texture ----
		pshufhw		xmm4 , xmm4 , 0x00			// broadcast word 4 over the high four words
		pshuflw		xmm4 , xmm4 , 0x00			// broadcast word 0 over the low four words
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// index > size-1 : subtract size (done twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// index not > -1 (negative) : add size (done twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch				// low four words * pitch -> row byte offsets
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte				// high four words * pixel size -> column byte offsets
		movaps		offx , xmm4
			
		// ---- load : 16 texels, bytes widened to floats and scaled by 1/255 ----
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight : horizontal pass, one filtered colour per row (xmm0..xmm3) ----
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4   (1-d is index 256-d)

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- weight : vertical pass over the four row results -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4 : fourth row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// ---- scale by alpha, convert back to bytes with saturation, store ----
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv					// next sample position
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Renders one span of `len` RGBA pixels into `dest`, sampling a one-byte-per-
// texel source through a 4x4 table-weighted filter with repeat (wrap-around)
// addressing. Each source byte is multiplied by the four words of
// sc_color_wrgba (built from b_color) to form a colour, and the filtered
// result is scaled by alpha.a / 256.
//
//	dest           output span; 4 bytes are written per pixel.
//	len            number of pixels; also the divisor of the per-pixel (u,v) step.
//	ssu,ssv        texture coordinate at the start of the span.
//	ttu,ttv        texture coordinate at the end of the span.
//	               Sampling happens at pixel centres (start + step/2 + i*step).
//	               Only the low 16 bits of coord*65536 are kept, so coordinates
//	               are normalised and wrap.
//	src            first byte of the source image (one byte is read per texel,
//	               columns are PixelSize_a() bytes apart).
//	src_pitchbyte  bytes per source row. pmaddwd multiplies signed 16-bit words,
//	               so this presumably has to stay below 32768 - NOTE(review): confirm.
//	src_w,src_h    source size in texels.
//	b_color        colour applied to the source byte: words (b*a)>>8, (g*a)>>8,
//	               (r*a)>>8 and a are each multiplied with the byte, then >> 8.
//	alpha          alpha.a / 256 scales all four output channels.
//	weighttbl1/2   float weight tables indexed by d and by 256-d with d in 0..255,
//	               i.e. entries 0..256 are read.
//
// NOTE(review): lane comments below assume the sse_set_* helpers take their
// arguments highest lane first (like _mm_set_*), so lane 0 carries v / y and
// lane 2 carries u / x - confirm against the helper definitions.
//
// Fix relative to the previous revision: the fourth filter tap (c4) now uses
// the fourth texel of each row (c[3], c[7], c[11], c[15]) and the fourth row
// (xmm3) in the vertical pass. Previously the third tap was applied twice and
// the fourth texel column / row was loaded and filtered but never used.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_rgba_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel (u,v) step and the first sample position (half a step in)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// sub-texel fractions (0..255) in lanes 0 and 2
	__m128	offx;		// byte offsets of the four sample columns
	__m128	offy;		// byte offsets of the four sample rows
	__m128	c[16];		// the 4x4 texels as floats, row-major: c[row*4 + column]
	_asm
	{
		mov		eax , dest					// eax : running output pointer
		movaps	xmm7 , suv					// xmm7 : current (u,v), stepped by duv
	len_loop_start:
		dec		len
		jl		len_loop_end				// leave once len has dropped below zero

		// ---- fixed-point texel position ----
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF	// keep the 16 fraction bits of the coordinate
		pmuludq		xmm4 , dwh					// * texture size
		psrld		xmm4 , 8					// keep 8 sub-texel bits
		psubd		xmm4 , const_dw_0_128_0_128	// shift by half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8					// integer texel index
		movaps		dxy , xmm5					// 8-bit fractions used as weight table indices

		// ---- four neighbouring indices per axis, wrapped into the texture ----
		pshufhw		xmm4 , xmm4 , 0x00			// broadcast word 4 over the high four words
		pshuflw		xmm4 , xmm4 , 0x00			// broadcast word 0 over the low four words
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		// index > size-1 : subtract size (done twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		// index not > -1 (negative) : add size (done twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn	xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch				// low four words * pitch -> row byte offsets
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte				// high four words * pixel size -> column byte offsets
		movaps		offx , xmm4
			
		// ---- load : 16 texels ----
		// For each texel: cl receives the source byte (the upper bits of ecx
		// still hold address bits, but they land in words 4..7 which the later
		// punpcklwd drops). The byte is replicated into words 0..3, multiplied
		// by sc_color_wrgba, shifted right by 8, widened to floats and scaled
		// by 1/255.
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- weight : horizontal pass, one filtered colour per row (xmm0..xmm3) ----
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4   (1-d is index 256-d)

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c4 : fourth texel of the row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// ---- weight : vertical pass over the four row results -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
		// c4 : fourth row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// ---- scale by alpha, convert back to bytes with saturation, store ----
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		addps		xmm7 , duv					// next sample position
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Renders one span of `len` one-byte pixels into `dest` for an RGB source.
// Every filter tap is the constant const_f_scala_1 rather than a value read
// from the source, so the result depends only on the sub-texel position, the
// two weight tables and alpha.a.
//
//	dest           output span; 1 byte is written per pixel.
//	len            number of pixels; also the divisor of the per-pixel (u,v) step.
//	ssu,ssv        texture coordinate at the start of the span.
//	ttu,ttv        texture coordinate at the end of the span.
//	               Sampling happens at pixel centres (start + step/2 + i*step).
//	               Only the low 16 bits of coord*65536 are kept, so coordinates
//	               are normalised and wrap.
//	src            not read by this variant (kept for the common signature).
//	src_pitchbyte  not used by this variant (kept for the common signature).
//	src_w,src_h    source size in texels; they scale the coordinate before the
//	               8-bit sub-texel fraction is extracted.
//	b_color        not used by this variant (kept for the common signature).
//	alpha          alpha.a / 256 scales the output value.
//	weighttbl1/2   float weight tables indexed by d and by 256-d with d in 0..255,
//	               i.e. entries 0..256 are read.
//
// NOTE(review): lane comments below assume the sse_set_* helpers take their
// arguments highest lane first (like _mm_set_*), so lane 0 carries v / y and
// lane 2 carries u / x - confirm against the helper definitions.
//
// Changes relative to the previous revision (the stored bytes are unchanged):
//  - The 16 texel "loads" issued a 4-byte movd on a 3-byte-per-pixel source and
//    immediately overwrote the register with const_f_scala_1. Those reads could
//    run past the last source pixel and have been removed, together with the
//    wrap / offset computation and the c[] staging array that only served them.
//    The taps now multiply by const_f_scala_1 directly.
//  - The fourth tap of the vertical pass uses the fourth row (xmm3) instead of
//    reusing the third; all rows are equal here, so this is for agreement with
//    the documented formula.
//  - Locals that no instruction referenced any more were dropped.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_a_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// part of the common signature but irrelevant for a constant-tap variant
	(void)src;
	(void)src_pitchbyte;
	(void)b_color;

	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel (u,v) step and the first sample position (half a step in)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dxy;		// sub-texel fractions (0..255) in lanes 0 and 2
	_asm
	{
		mov		eax , dest					// eax : running output pointer
		movaps	xmm7 , suv					// xmm7 : current (u,v), stepped by duv
	len_loop_start:
		dec		len
		jl		len_loop_end				// leave once len has dropped below zero

		// ---- sub-texel fractions ----
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF	// keep the 16 fraction bits of the coordinate
		pmuludq		xmm4 , dwh					// * texture size
		psrld		xmm4 , 8					// keep 8 sub-texel bits
		psubd		xmm4 , const_dw_0_128_0_128	// shift by half a texel
		pand		xmm4 , const_dw_0_0xFF_0_0xFF
		movaps		dxy , xmm4					// 8-bit fractions used as weight table indices

		// ---- weight : horizontal pass, one scalar per row (xmm0..xmm3) ----
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4   (1-d is index 256-d)
		// every c1..c4 is const_f_scala_1 in this variant

		// row 0 -> xmm0
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , const_f_scala_1
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm0 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm0 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm0 , xmm6

		// row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , const_f_scala_1
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm1 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm1 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm1 , xmm6

		// row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , const_f_scala_1
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm2 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm2 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm2 , xmm6

		// row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , const_f_scala_1
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm3 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm3 , xmm6
		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , const_f_scala_1
		addss		xmm3 , xmm6

		// ---- weight : vertical pass over the four row results -> xmm4 ----
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6
		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6
		// c4 : fourth row
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6
			
		// ---- scale by alpha, convert to a saturated byte, store the low byte ----
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv					// next sample position
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Resamples one span of a texture into a 1-byte-per-pixel destination using a
// 4x4 table-weighted filter with repeat (wrap-around) addressing.
//
//	dest			: output; one byte is written per pixel
//	len				: number of destination pixels (the loop exits at once when len <= 0)
//	ssu,ssv/ttu,ttv	: texture coordinates at the start / end of the span; samples are
//					  taken at pixel centres (start + d/2 , then + d per pixel).
//					  Only the 16 fractional bits are kept, so coordinates wrap.
//	src				: base address of the source texels
//	src_pitchbyte	: bytes per source row. NOTE(review): the row offset is built with
//					  pmaddwd against a zero-extended word, so only the low (signed)
//					  16 bits of the pitch take part - confirm callers stay in range.
//	src_w , src_h	: source size in texels
//	b_color			: only feeds sc_color_wrgba, which the loop never reads
//	alpha			: the filtered value is scaled by alpha.a / 256
//	weighttbl1/2	: float tables indexed by the 8-bit sub-texel fraction d and by
//					  256 - d, i.e. entries 0..256 are read. tbl1 weights the two
//					  inner taps, tbl2 the two outer taps.
//
// Only lane 0 of each unpacked texel is filtered (scalar mulss/addss), i.e. the
// lowest byte of every source texel - presumably the alpha byte; confirm against
// the rgba pixel layout.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_a_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel step and first sample position (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-texel fractions (table indices)
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, row-major : c[row*4 + col]
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> 16.16 fixed point, keep the fraction (repeat), scale by the
		// texture size and drop to 8 fractional bits
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128			// minus half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast the integer coordinates and add the tap offsets -1 , 0 , +1 , +2
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// repeat : where coord > size-1 subtract size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5

		// repeat : where coord < 0 add size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) : coordinates -> byte offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// ---- load the 4x4 taps : bytes -> floats scaled by 1/255 ----
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- horizontal pass ----
		// row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy[2]
		// Each of the four terms must read its own tap; the c4 term uses the
		// fourth column (c[3], c[7], c[11], c[15]), not the third one again.

		// row 0 -> xmm0
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]						// c2
		addss		xmm0 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]						// c3
		addss		xmm0 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]						// c4
		addss		xmm0 , xmm6

		// row 1 -> xmm1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]						// c2
		addss		xmm1 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]						// c3
		addss		xmm1 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]						// c4
		addss		xmm1 , xmm6

		// row 2 -> xmm2
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]						// c2
		addss		xmm2 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]						// c3
		addss		xmm2 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]						// c4
		addss		xmm2 , xmm6

		// row 3 -> xmm3
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]						// c2
		addss		xmm3 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]						// c3
		addss		xmm3 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]						// c4
		addss		xmm3 , xmm6

		// ---- vertical pass : same formula over the four row results , d = dxy[0] ----
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0							// c1 = row 0
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1							// c2 = row 1
		addss		xmm4 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2							// c3 = row 2
		addss		xmm4 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3							// c4 = row 3
		addss		xmm4 , xmm6

		// scale by alpha , back to 0..255 , saturate and store the low byte
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		ecx , xmm4
		mov			[ eax ] , cl
		add			eax , 1
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
// Resamples one span of a 1-byte-per-texel source into a 1-byte-per-pixel
// destination using a 4x4 table-weighted filter with repeat (wrap-around)
// addressing.
//
//	dest			: output; one byte is written per pixel
//	len				: number of destination pixels (the loop exits at once when len <= 0)
//	ssu,ssv/ttu,ttv	: texture coordinates at the start / end of the span; samples are
//					  taken at pixel centres (start + d/2 , then + d per pixel).
//					  Only the 16 fractional bits are kept, so coordinates wrap.
//	src				: base address of the source texels
//	src_pitchbyte	: bytes per source row. NOTE(review): the row offset is built with
//					  pmaddwd against a zero-extended word, so only the low (signed)
//					  16 bits of the pitch take part - confirm callers stay in range.
//	src_w , src_h	: source size in texels
//	b_color			: only feeds sc_color_wrgba, which the loop never reads
//	alpha			: the filtered value is scaled by alpha.a / 256
//	weighttbl1/2	: float tables indexed by the 8-bit sub-texel fraction d and by
//					  256 - d, i.e. entries 0..256 are read. tbl1 weights the two
//					  inner taps, tbl2 the two outer taps.
cb_inline
void pp_sse_texture_weight16f_m_alpha_repeat_a_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel step and first sample position (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-texel fractions (table indices)
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, row-major : c[row*4 + col]
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> 16.16 fixed point, keep the fraction (repeat), scale by the
		// texture size and drop to 8 fractional bits
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		cvtps2dq	xmm4 , xmm4
		pand		xmm4 , const_dw_0_0xFFFF_0_0xFFFF
		pmuludq		xmm4 , dwh
		psrld		xmm4 , 8
		psubd		xmm4 , const_dw_0_128_0_128			// minus half a texel
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast the integer coordinates and add the tap offsets -1 , 0 , +1 , +2
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1

		// repeat : where coord > size-1 subtract size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , whm1
		pand		xmm5 , wh
		psubsw		xmm4 , xmm5

		// repeat : where coord < 0 add size (applied twice)
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5
		movaps		xmm5 , xmm4
		pcmpgtw		xmm5 , const_w_m1
		pandn		xmm5 , const_dw_0xFFFFFFFF
		pand		xmm5 , wh
		paddsw		xmm4 , xmm5

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) : coordinates -> byte offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// ---- load the 4x4 taps : byte -> float scaled by 1/255 ----
		// The texel byte replaces cl only, so the upper three bytes of ecx still
		// hold address bits; they land in lanes 1-3, which the scalar filter
		// below never reads.
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		mov			cl , byte ptr[ ecx ]
		movd		xmm4 , ecx
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- horizontal pass ----
		// row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy[2]
		// Each of the four terms must read its own tap; the c4 term uses the
		// fourth column (c[3], c[7], c[11], c[15]), not the third one again.

		// row 0 -> xmm0
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]						// c2
		addss		xmm0 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]						// c3
		addss		xmm0 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]						// c4
		addss		xmm0 , xmm6

		// row 1 -> xmm1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]						// c2
		addss		xmm1 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]						// c3
		addss		xmm1 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]						// c4
		addss		xmm1 , xmm6

		// row 2 -> xmm2
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]						// c2
		addss		xmm2 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]						// c3
		addss		xmm2 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]						// c4
		addss		xmm2 , xmm6

		// row 3 -> xmm3
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]						// c2
		addss		xmm3 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]						// c3
		addss		xmm3 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]						// c4
		addss		xmm3 , xmm6

		// ---- vertical pass : same formula over the four row results , d = dxy[0] ----
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0							// c1 = row 0
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1							// c2 = row 1
		addss		xmm4 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2							// c3 = row 2
		addss		xmm4 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3							// c4 = row 3
		addss		xmm4 , xmm6

		// scale by alpha , back to 0..255 , saturate and store the low byte
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		ecx , xmm4
		mov			[ eax ] , cl
		add			eax , 1
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}

//=================================================================================================
// Resamples one span of a texture into a 4-byte-per-pixel destination using a
// 4x4 table-weighted filter with clamp-to-edge addressing. All four byte lanes
// of every texel are filtered (packed mulps/addps).
//
//	dest			: output; one dword is written per pixel
//	len				: number of destination pixels (the loop exits at once when len <= 0)
//	ssu,ssv/ttu,ttv	: texture coordinates at the start / end of the span; samples are
//					  taken at pixel centres (start + d/2 , then + d per pixel).
//					  In 16.16 fixed point they are clamped to [0 , 0xFFFF] and then
//					  scaled by (size - 1).
//	src				: base address of the source texels
//	src_pitchbyte	: bytes per source row. NOTE(review): the row offset is built with
//					  pmaddwd against a zero-extended word, so only the low (signed)
//					  16 bits of the pitch take part - confirm callers stay in range.
//	src_w , src_h	: source size in texels
//	b_color			: only feeds sc_color_wrgba, which the loop never reads
//	alpha			: the filtered value is scaled by alpha.a / 256
//	weighttbl1/2	: float tables indexed by the 8-bit sub-texel fraction d and by
//					  256 - d, i.e. entries 0..256 are read. tbl1 weights the two
//					  inner taps, tbl2 the two outer taps.
//
// NOTE(review): a full dword is read per texel while the texel stride is
// PixelSize_rgb() - presumably that is 4; confirm, otherwise the last texel
// read can run past the end of the source.
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgb_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per-pixel step and first sample position (centre of the first pixel)
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit sub-texel fractions (table indices)
	__m128	offx;		// byte offsets of the four tap columns
	__m128	offy;		// byte offsets of the four tap rows
	__m128	c[16];		// the 4x4 taps as floats, row-major : c[row*4 + col]
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> 16.16 fixed point, clamp to [0 , 0xFFFF], scale by
		// (size - 1) and drop to 8 fractional bits
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5

		// broadcast the integer coordinates, add the tap offsets -1 , 0 , +1 , +2
		// and clamp every tap to [0 , size-1]
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) : coordinates -> byte offsets
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// ---- load the 4x4 taps : bytes -> floats scaled by 1/255 ----
		// row 0
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[0*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[1*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[2*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[0*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[1*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[2*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov			ecx , src
		add			ecx , offy.m128_i32[3*4]
		add			ecx , offx.m128_i32[3*4]
		movd		xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// ---- horizontal pass (weight broadcast to all four lanes) ----
		// row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4 , d = dxy[2]
		// Each of the four terms must read its own tap; the c4 term uses the
		// fourth column (c[3], c[7], c[11], c[15]), not the third one again.

		// row 0 -> xmm0
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps		xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]						// c2
		addps		xmm0 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]						// c3
		addps		xmm0 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]						// c4
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps		xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]						// c2
		addps		xmm1 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]						// c3
		addps		xmm1 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]						// c4
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps		xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]						// c2
		addps		xmm2 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]						// c3
		addps		xmm2 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]						// c4
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps		xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]						// c1
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]						// c2
		addps		xmm3 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]						// c3
		addps		xmm3 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]						// c4
		addps		xmm3 , xmm6

		// ---- vertical pass : same formula over the four row results , d = dxy[0] ----
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps		xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0							// c1 = row 0
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1							// c2 = row 1
		addps		xmm4 , xmm6
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2							// c3 = row 2
		addps		xmm4 , xmm6
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps		xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3							// c4 = row 3
		addps		xmm4 , xmm6

		// scale by alpha , back to 0..255 , saturate to bytes and store one dword
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd		[ eax ] , xmm4
		add			eax , 4
		addps		xmm7 , duv
		jmp			len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Samples a span of `len` pixels from a 4-byte-per-texel source with a
			4x4 weighted filter (clamp addressing) and stores them to `dest`.

	For every output pixel the texture coordinate is scaled by 65536, clamped to
	[0 , 65535], multiplied by (src_w-1) / (src_h-1) and split into an integer
	texel index and an 8-bit fraction d (0..255). The 4x4 neighbourhood around
	the texel (index offsets come from const_w_2_1_0_m1_2_1_0_m1, each clamped to
	[0 , size-1]) is loaded, converted to float and filtered separably:

		row    = tbl2[dx]*c1 + tbl1[dx]*c2 + tbl1[256-dx]*c3 + tbl2[256-dx]*c4
		result = tbl2[dy]*r1 + tbl1[dy]*r2 + tbl1[256-dy]*r3 + tbl2[256-dy]*r4

	The result is scaled by alpha.a/256, converted back to 8 bit per channel with
	saturation and written as one dword per pixel.

	@param dest				output span, 4 bytes are written per pixel
	@param len				number of pixels; nothing is written when len <= 0
	@param ssu,ssv			texture coordinate at the start of the span
	@param ttu,ttv			texture coordinate at the end of the span
	@param src				source texels, PixelSize_rgba() bytes apart in x
	@param src_pitchbyte	byte distance between source rows; it is multiplied
							with pmaddwd, so only its low signed 16 bits are used
	@param src_w,src_h		source size in texels
	@param b_color			not used by this variant
	@param alpha			global alpha multiplier (alpha.a)
	@param weighttbl1		weights for the two inner taps, indexed 0..256
	@param weighttbl2		weights for the two outer taps, indexed 0..256

	@note	Inside the _asm block a bracket is a byte displacement, so c[k*16] is
			the k-th __m128 of c and m128_i32[k*4] is the k-th dword.
	@note	NOTE(review): lane comments assume the sse_set_* helpers take their
			arguments highest lane first (like _mm_set_*) - confirm.
	@note	The fourth tap of every row and of the column pass used to re-use the
			third tap's sample (c[2],c[6],c[10],c[14] / xmm2); it now uses
			c[3],c[7],c[11],c[15] / xmm3 as the formula requires.
*/
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgb_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );

	// The source already carries its own colour; the blend colour is ignored here.
	(void)b_color;

	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel coordinate step; sampling starts half a step into the span.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 = horizontal , dword 0 = vertical
	__m128	offx;		// byte offsets of the four columns
	__m128	offy;		// byte offsets of the four rows
	__m128	c[16];		// 4x4 neighbourhood as float , c[row*4+col]
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> fixed point texel position
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		// keep the low 8 bits as filter fraction
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5
		// broadcast the integer index and build the four clamped neighbours
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		// each index word is paired with a zero word , so pmaddwd yields index * low word
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : each texel dword is widened byte -> word -> dword , converted to float and scaled by 1/255
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight : horizontal pass , one filtered value per row in xmm0..xmm3
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3 ( edx still points at weighttbl1 , index becomes 256-d )
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// weight : vertical pass over the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 ( fourth row result )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// apply the global alpha , return to 8 bit per channel with saturation and store
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		// advance the texture coordinate to the next pixel
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Samples a span of `len` pixels from a 1-byte-per-texel source with a
			4x4 weighted filter (clamp addressing), tints every sample with
			`b_color` and stores the result to `dest`.

	For every output pixel the texture coordinate is scaled by 65536, clamped to
	[0 , 65535], multiplied by (src_w-1) / (src_h-1) and split into an integer
	texel index and an 8-bit fraction d (0..255). Each of the 4x4 neighbouring
	source bytes (index offsets come from const_w_2_1_0_m1_2_1_0_m1, each clamped
	to [0 , size-1]) is replicated to four channels, multiplied by sc_color_wrgba
	and shifted right by 8, then converted to float and filtered separably:

		row    = tbl2[dx]*c1 + tbl1[dx]*c2 + tbl1[256-dx]*c3 + tbl2[256-dx]*c4
		result = tbl2[dy]*r1 + tbl1[dy]*r2 + tbl1[256-dy]*r3 + tbl2[256-dy]*r4

	The result is scaled by alpha.a/256, converted back to 8 bit per channel with
	saturation and written as one dword per pixel.

	@param dest				output span, 4 bytes are written per pixel
	@param len				number of pixels; nothing is written when len <= 0
	@param ssu,ssv			texture coordinate at the start of the span
	@param ttu,ttv			texture coordinate at the end of the span
	@param src				source texels, PixelSize_a() bytes apart in x
	@param src_pitchbyte	byte distance between source rows; it is multiplied
							with pmaddwd, so only its low signed 16 bits are used
	@param src_w,src_h		source size in texels
	@param b_color			tint; r,g,b are pre-multiplied by b_color.a (>>8)
	@param alpha			global alpha multiplier (alpha.a)
	@param weighttbl1		weights for the two inner taps, indexed 0..256
	@param weighttbl2		weights for the two outer taps, indexed 0..256

	@note	Inside the _asm block a bracket is a byte displacement, so c[k*16] is
			the k-th __m128 of c and m128_i32[k*4] is the k-th dword.
	@note	NOTE(review): lane comments assume the sse_set_* helpers take their
			arguments highest lane first (like _mm_set_*) - confirm.
	@note	The fourth tap of every row and of the column pass used to re-use the
			third tap's sample (c[2],c[6],c[10],c[14] / xmm2); it now uses
			c[3],c[7],c[11],c[15] / xmm3 as the formula requires.
*/
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgb_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	// Tint as 16-bit words: colour channels pre-multiplied by the colour's alpha, upper four words zero.
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel coordinate step; sampling starts half a step into the span.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8-bit fractions: dword 2 = horizontal , dword 0 = vertical
	__m128	offx;		// byte offsets of the four columns
	__m128	offy;		// byte offsets of the four rows
	__m128	c[16];		// 4x4 neighbourhood as float , c[row*4+col]
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// coordinate -> fixed point texel position
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		// keep the low 8 bits as filter fraction
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		movaps		dxy , xmm5
		// broadcast the integer index and build the four clamped neighbours
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
		// each index word is paired with a zero word , so pmaddwd yields index * low word
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : the source byte replaces the low byte of ecx ; two self-unpacks replicate it
		// over the low four bytes , which are widened to words and tinted ( the upper words
		// are zeroed by the zero half of sc_color_wrgba ) before the float conversion
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		packuswb	xmm4 , const_dw_0
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight : horizontal pass , one filtered value per row in xmm0..xmm3
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3 ( edx still points at weighttbl1 , index becomes 256-d )
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6

		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6

		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6

		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4 ( fourth column sample )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6

		// weight : vertical pass over the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4 ( fourth row result )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6

		// apply the global alpha , return to 8 bit per channel with saturation and store
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
		// advance the texture coordinate to the next pixel
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgba_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() , PixelSize_rgb() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;
	__m128	offx;
	__m128	offy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight			
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6
			
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		por			xmm4 , const_b_scala_0xFF
		movd	[ eax ] , xmm4
		add		eax , 4
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Renders one span of destination pixels from a 4-byte-per-pixel source texture using a
			4x4 (16 tap) table-weighted filter with clamped addressing and a global alpha scale.

	For each of the @a len output pixels the texture coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv), sampled at the centre of each step. The coordinate is scaled by
	65536, clamped to [0,65535] and multiplied by (src_w-1 , src_h-1), which yields an integer
	texel position plus an 8 bit fraction d. Sixteen texels around that position are loaded
	(indices clamped to [0,src_w-1] x [0,src_h-1]), converted to float in [0,1] and combined as

		row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	first along x for each of the four rows and then along y over the four row results.
	The result is multiplied by alpha.a / 256, scaled by 255, saturated to bytes and stored.

	@param	dest			Output buffer; 4 bytes are written per pixel.
	@param	len				Number of pixels to render. Nothing is written when len <= 0.
	@param	ssu,ssv			Texture coordinate at the start of the span.
	@param	ttu,ttv			Texture coordinate at the end of the span.
	@param	src				Top-left of the source texture (4 bytes are read per texel).
	@param	src_pitchbyte	Byte distance between source rows.
	@param	src_w,src_h		Source size in texels.
	@param	b_color			Only used to build sc_color_wrgba, which this variant's pixel math does not read.
	@param	alpha			alpha.a / 256 scales every output channel.
	@param	weighttbl1		Weight table for the two inner taps; indexed with d and 256-d, so indices 0..256 are read.
	@param	weighttbl2		Weight table for the two outer taps; indexed the same way.

	@note	Array subscripts inside the _asm block are byte offsets (c[k*16] is the k-th __m128,
			m128_i32[k*4] is the k-th 32 bit lane).
	@note	Lane comments below assume the sse_set_* helpers list lanes from highest to lowest
			(like _mm_set_ps) - TODO confirm against their definition.
	@note	The row offset is computed with pmaddwd, which multiplies signed 16 bit halves; it is
			only correct while src_pitchbyte fits in a signed 16 bit value.
	@note	The fourth tap of every pass reads the fourth sample (c[3],c[7],c[11],c[15] / xmm3).
			It used to re-read the third sample, which left those values unused.
*/
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgba_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constant pool shared by the generated variants; not every entry is referenced by this one.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel coordinate step; the start is offset by half a step so each pixel samples its centre.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8 bit fractions: lane 2 is used for the x pass, lane 0 for the y pass
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 texels as floats, row major: c[row*4 + column]
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			// coordinate -> 16 bit fixed point, clamped to [0,0xFFFF]
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
			// scale by (size-1); after the first shift the low 8 bits are the fraction,
			// after the second shift the lanes hold the integer texel index
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
			// broadcast the two indices over 4 words each, add the tap offsets and clamp to the texture
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load : each texel is widened byte -> word -> dword, converted to float and scaled by 1/255
			// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

			// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

			// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

			// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight : x pass over each row (xmm0..xmm3), then y pass over the row results (xmm4)
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3 (ecx becomes the mirrored index 256-d)
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6
		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6
		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6
		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6
		// y pass : combine the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// apply the global alpha, return to 0..255 and pack with saturation to 4 bytes
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
			// advance the texture coordinate to the next pixel
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
/*!
	@brief	Renders one span of destination pixels from a 1-byte-per-pixel source texture using a
			4x4 (16 tap) table-weighted filter with clamped addressing and a global alpha scale.

	For each of the @a len output pixels the texture coordinate is stepped linearly from
	(ssu,ssv) towards (ttu,ttv), sampled at the centre of each step. The coordinate is scaled by
	65536, clamped to [0,65535] and multiplied by (src_w-1 , src_h-1), which yields an integer
	texel position plus an 8 bit fraction d. Sixteen source bytes around that position are read
	(indices clamped to [0,src_w-1] x [0,src_h-1]). Each byte is replicated over four 16 bit
	lanes, multiplied by sc_color_wrgba (b_color with its colour channels pre-multiplied by
	b_color.a), shifted right by 8 and converted to float in [0,1]. The texels are combined as

		row = tbl2[d]*c1 + tbl1[d]*c2 + tbl1[256-d]*c3 + tbl2[256-d]*c4

	first along x for each of the four rows and then along y over the four row results.
	The result is multiplied by alpha.a / 256, scaled by 255, saturated to bytes and stored.

	@param	dest			Output buffer; 4 bytes are written per pixel.
	@param	len				Number of pixels to render. Nothing is written when len <= 0.
	@param	ssu,ssv			Texture coordinate at the start of the span.
	@param	ttu,ttv			Texture coordinate at the end of the span.
	@param	src				Top-left of the source texture (1 byte is read per texel).
	@param	src_pitchbyte	Byte distance between source rows.
	@param	src_w,src_h		Source size in texels.
	@param	b_color			Colour that every source byte is multiplied with.
	@param	alpha			alpha.a / 256 scales every output channel.
	@param	weighttbl1		Weight table for the two inner taps; indexed with d and 256-d, so indices 0..256 are read.
	@param	weighttbl2		Weight table for the two outer taps; indexed the same way.

	@note	Array subscripts inside the _asm block are byte offsets (c[k*16] is the k-th __m128,
			m128_i32[k*4] is the k-th 32 bit lane).
	@note	Lane comments below assume the sse_set_* helpers list lanes from highest to lowest
			(like _mm_set_ps) - TODO confirm against their definition.
	@note	The row offset is computed with pmaddwd, which multiplies signed 16 bit halves; it is
			only correct while src_pitchbyte fits in a signed 16 bit value.
	@note	The fourth tap of every pass reads the fourth sample (c[3],c[7],c[11],c[15] / xmm3).
			It used to re-read the third sample, which left those values unused.
*/
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_rgba_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// Constant pool shared by the generated variants; not every entry is referenced by this one.
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0xFFFFFFFF = sse_set_dw( 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF , 0xFFFFFFFF );
	static const __m128	const_w_m1 = sse_set_w( -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_dw_0_0xFFFF_0_0xFFFF = sse_set_dw( 0 , 0xFFFF , 0 , 0xFFFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_b_scala_0xFF = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0xFF );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );
	static const __m128	const_dw_0_128_0_128 = sse_set_dw( 0 , 128 , 0 , 128 );
	// Colour multiplier applied to every source byte; the colour channels are pre-multiplied by b_color.a.
	__m128	sc_color_wrgba = sse_set_uw( 0 , 0 , 0 , 0 , ((uint32)b_color.b * (uint32)b_color.a) >> 8 , ((uint32)b_color.g * (uint32)b_color.a) >> 8 , ((uint32)b_color.r * (uint32)b_color.a) >> 8 , b_color.a );
	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// Per-pixel coordinate step; the start is offset by half a step so each pixel samples its centre.
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwh			= sse_set_dw( 0 , src_w , 0 , src_h );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	wh			= sse_set_w( src_w , src_w , src_w , src_w , src_h , src_h , src_h , src_h );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;		// 8 bit fractions: lane 2 is used for the x pass, lane 0 for the y pass
	__m128	offx;		// byte offsets of the four sampled columns
	__m128	offy;		// byte offsets of the four sampled rows
	__m128	c[16];		// the 4x4 texels as floats, row major: c[row*4 + column]
	_asm
	{
		// eax = write cursor , xmm7 = running texture coordinate
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end
			// coordinate -> 16 bit fixed point, clamped to [0,0xFFFF]
			movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
			// scale by (size-1); after the first shift the low 8 bits are the fraction,
			// after the second shift the lanes hold the integer texel index
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF		
		psrld		xmm4 , 8
			movaps		dxy , xmm5
			// broadcast the two indices over 4 words each, add the tap offsets and clamp to the texture
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

			// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0)
			movaps		xmm5 , xmm4
			punpcklwd	xmm5 , const_w_0
			pmaddwd		xmm5 , pitch
			movaps		offy , xmm5
			punpckhwd	xmm4 , const_w_0
			pmaddwd		xmm4 , pixbyte
			movaps		offx , xmm4
			
			// load : the source byte is read into cl (the upper bytes of ecx still hold address bits,
			// but only the low four words survive the final punpcklwd), replicated to four words,
			// multiplied by the colour, >>8, widened to dwords, converted to float and scaled by 1/255
			// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[3*16] , xmm4

			// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[7*16] , xmm4

			// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[11*16] , xmm4

			// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		mov		cl , byte ptr[ ecx ]
		movd	xmm4 , ecx
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , xmm4
		punpcklbw	xmm4 , const_b_0
		pmullw		xmm4 , sc_color_wrgba
		psrlw		xmm4 , 8
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
			movaps		c[15*16] , xmm4

			// weight : x pass over each row (xmm0..xmm3), then y pass over the row results (xmm4)
		// row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		shufps	xmm0 , xmm0 , 0
		mulps		xmm0 , c[0*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[1*16]
		addps		xmm0 , xmm6

		// c3 (ecx becomes the mirrored index 256-d)
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[2*16]
		addps		xmm0 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[3*16]
		addps		xmm0 , xmm6
		// row 1 -> xmm1
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		shufps	xmm1 , xmm1 , 0
		mulps		xmm1 , c[4*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[5*16]
		addps		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[6*16]
		addps		xmm1 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[7*16]
		addps		xmm1 , xmm6
		// row 2 -> xmm2
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		shufps	xmm2 , xmm2 , 0
		mulps		xmm2 , c[8*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[9*16]
		addps		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[10*16]
		addps		xmm2 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[11*16]
		addps		xmm2 , xmm6
		// row 3 -> xmm3
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		shufps	xmm3 , xmm3 , 0
		mulps		xmm3 , c[12*16]
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[13*16]
		addps		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[14*16]
		addps		xmm3 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , c[15*16]
		addps		xmm3 , xmm6
		// y pass : combine the four row results -> xmm4
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		shufps	xmm4 , xmm4 , 0
		mulps		xmm4 , xmm0
		
		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm1
		addps		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm2
		addps		xmm4 , xmm6

		// c4		
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		shufps	xmm6 , xmm6 , 0
		mulps		xmm6 , xmm3
		addps		xmm4 , xmm6
			
		// apply the global alpha, return to 0..255 and pack with saturation to 4 bytes
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	[ eax ] , xmm4
		add		eax , 4
			// advance the texture coordinate to the next pixel
			addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//!	16 tap (4x4) weighted texture fetch of one span : rgb source -> 8 bit alpha destination ,
//!	clamped texture coordinates , result scaled by alpha.a.
//!	The source has no alpha channel , so every tap contributes the constant 1.0 and the
//!	source pixels are never read; only the filter weights shape the output.
//!	@param	dest			output buffer , one byte is written per sample ( len bytes in total )
//!	@param	len				number of samples , nothing is written when len <= 0
//!	@param	ssu , ssv		texture coordinate at the start of the span
//!	@param	ttu , ttv		texture coordinate at the end of the span
//!	@param	src				not read by this variant
//!	@param	src_pitchbyte	not used by this variant
//!	@param	src_w , src_h	source size in texels ( used for the sub texel fraction )
//!	@param	b_color			not used by this variant
//!	@param	alpha			alpha.a / 256 scales every output value
//!	@param	weighttbl1		weights of the two inner taps , read at index d and 256-d ( d = 8 bit fraction )
//!	@param	weighttbl2		weights of the two outer taps , read at index d and 256-d
//!	@retval			---
//!	@note	both weight tables are read at indices 0..256
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_a_rgb
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_scala_1 = sse_set_ps( 0.0f , 0.0f , 0.0f , 1.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );

	// parameters kept for the common function signature
	(void)src;
	(void)src_pitchbyte;
	(void)b_color;

	// empty span : avoids the division by zero below , the loop would not run anyway
	if( len <= 0 )
		return;

	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per sample step , the first sample sits half a step into the span
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	dxy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv

		// the 16 taps are the constant alpha 1.0 , fill them once
		// ( inside _asm the [] index is a byte offset , hence n*16 )
		movaps		xmm4 , const_f_scala_1
		movaps		c[0*16] , xmm4
		movaps		c[1*16] , xmm4
		movaps		c[2*16] , xmm4
		movaps		c[3*16] , xmm4
		movaps		c[4*16] , xmm4
		movaps		c[5*16] , xmm4
		movaps		c[6*16] , xmm4
		movaps		c[7*16] , xmm4
		movaps		c[8*16] , xmm4
		movaps		c[9*16] , xmm4
		movaps		c[10*16] , xmm4
		movaps		c[11*16] , xmm4
		movaps		c[12*16] , xmm4
		movaps		c[13*16] , xmm4
		movaps		c[14*16] , xmm4
		movaps		c[15*16] , xmm4

	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) * 65536 clamped to [0,0xFFFF] , scaled by (w-1,h-1);
		// after >> 8 the low 8 bits are the sub texel fraction d
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		pand		xmm4 , const_dw_0_0xFF_0_0xFF
		// dxy : element 2 = x fraction , element 0 = y fraction
		movaps		dxy , xmm4

		// weight , horizontal pass of row 0 -> xmm0
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6

		// c4 ( fourth texel of the row )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// horizontal pass of row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// horizontal pass of row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// horizontal pass of row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6

		// c4 ( fourth row )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		// scale by alpha , convert to an integer saturated to 0..255 , store one byte
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//!	16 tap (4x4) weighted texture fetch of one span : rgba source -> 8 bit alpha destination ,
//!	texel coordinates clamped to the texture edge , result scaled by alpha.a.
//!	The first byte of every fetched source pixel is the channel that is filtered.
//!	@param	dest			output buffer , one byte is written per sample ( len bytes in total )
//!	@param	len				number of samples , nothing is written when len <= 0
//!	@param	ssu , ssv		texture coordinate at the start of the span
//!	@param	ttu , ttv		texture coordinate at the end of the span
//!	@param	src				first byte of the source texture
//!	@param	src_pitchbyte	byte distance between source rows ( multiplied with pmaddwd , so only
//!							its low 16 bits , taken as signed , take part )
//!	@param	src_w , src_h	source size in texels
//!	@param	b_color			not used by this variant
//!	@param	alpha			alpha.a / 256 scales every output value
//!	@param	weighttbl1		weights of the two inner taps , read at index d and 256-d ( d = 8 bit fraction )
//!	@param	weighttbl2		weights of the two outer taps , read at index d and 256-d
//!	@retval			---
//!	@note	both weight tables are read at indices 0..256
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_a_rgba
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_b_0 = sse_set_ub( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );

	// parameter kept for the common function signature
	(void)b_color;

	// empty span : avoids the division by zero below , the loop would not run anyway
	if( len <= 0 )
		return;

	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per sample step , the first sample sits half a step into the span
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() , PixelSize_rgba() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;
	__m128	offx;
	__m128	offy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) * 65536 clamped to [0,0xFFFF] , scaled by (w-1,h-1);
		// after the first >> 8 the low 8 bits are the sub texel fraction d ,
		// after the second >> 8 the integer texel position remains
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		// dxy : element 2 = x fraction , element 0 = y fraction
		movaps		dxy , xmm5

		// broadcast x to the high four words and y to the low four words ,
		// add -1,0,+1,+2 and clamp to [0,w-1] / [0,h-1]
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) : byte offsets of the four columns / rows
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : 16 texels , bytes -> floats scaled by 1/255 , c[ row*4 + column ]
		// ( inside _asm the [] index is a byte offset , hence n*4 and n*16 )
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movd	xmm4 , dword ptr[ ecx ]
		punpcklbw	xmm4 , const_b_0
		punpcklwd	xmm4 , const_w_0
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight , horizontal pass of row 0 -> xmm0 ( only the lowest float is filtered )
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6

		// c4 : fourth texel of the row ( was multiplied with c3 a second time )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// horizontal pass of row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// horizontal pass of row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// horizontal pass of row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6

		// c4 : fourth row ( was multiplied with the row 2 result a second time )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		// scale by alpha , convert to an integer saturated to 0..255 , store one byte
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}
//=================================================================================================
//!	16 tap (4x4) weighted texture fetch of one span : 8 bit alpha source -> 8 bit alpha destination ,
//!	texel coordinates clamped to the texture edge , result scaled by alpha.a.
//!	@param	dest			output buffer , one byte is written per sample ( len bytes in total )
//!	@param	len				number of samples , nothing is written when len <= 0
//!	@param	ssu , ssv		texture coordinate at the start of the span
//!	@param	ttu , ttv		texture coordinate at the end of the span
//!	@param	src				first byte of the source texture , one byte is read per texel
//!	@param	src_pitchbyte	byte distance between source rows ( multiplied with pmaddwd , so only
//!							its low 16 bits , taken as signed , take part )
//!	@param	src_w , src_h	source size in texels
//!	@param	b_color			not used by this variant
//!	@param	alpha			alpha.a / 256 scales every output value
//!	@param	weighttbl1		weights of the two inner taps , read at index d and 256-d ( d = 8 bit fraction )
//!	@param	weighttbl2		weights of the two outer taps , read at index d and 256-d
//!	@retval			---
//!	@note	both weight tables are read at indices 0..256
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m_alpha_clamp_a_a
		(
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	static const __m128	const_dw_0 = sse_set_dw( 0 , 0 , 0 , 0 );
	static const __m128	const_f_255 = sse_set_ps( 255.0f , 255.0f , 255.0f , 255.0f );
	static const __m128	const_f_r255 = sse_set_ps( 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f , 1.0f / 255.0f );
	static const __m128	const_w_0 = sse_set_w( 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
	static const __m128	const_w_2_1_0_m1_2_1_0_m1 = sse_set_w( 2 , 1 , 0 , -1 , 2 , 1 , 0 , -1 );
	static const __m128	const_dw_0_0xFF_0_0xFF = sse_set_dw( 0 , 0xFF , 0 , 0xFF );
	static const __m128	const_f_0x10000 = sse_set_ps( 65536.0f , 65536.0f , 65536.0f , 65536.0f );
	static const __m128	const_f_0_0xFFFF_0_0xFFFF = sse_set_ps( 0.0f ,  (float)0xFFFF ,  0.0f ,  (float)0xFFFF );
	static const __m128	const_f_0 = sse_set_ps( 0.0f , 0.0f , 0.0f , 0.0f );

	// parameter kept for the common function signature
	(void)b_color;

	// empty span : avoids the division by zero below , the loop would not run anyway
	if( len <= 0 )
		return;

	__m128	sc_alpha_faaaa = sse_set_ps1( (float)alpha.a * ( 1.0f / 256.0f ) );
	
	// per sample step , the first sample sits half a step into the span
	float	du	= ( ttu - ssu ) / (float)len;
	float	dv	= ( ttv - ssv ) / (float)len;
	__m128	duv			= sse_set_ps( 0 , du , 0 , dv );
	__m128	suv			= sse_set_ps( 0 , ssu + du / 2.0f  , 0 , ssv + dv / 2.0f );
	__m128	dwhm1		= sse_set_dw( 0 , src_w - 1 , 0 , src_h - 1 );
	__m128	whm1		= sse_set_w( src_w-1 , src_w-1 , src_w-1 , src_w-1 , src_h-1 , src_h-1 , src_h-1 , src_h-1 );
	__m128	pixbyte		= sse_set_dw( PixelSize_a() , PixelSize_a() , PixelSize_a() , PixelSize_a() );
	__m128	pitch		= sse_set_dw( src_pitchbyte , src_pitchbyte , src_pitchbyte , src_pitchbyte );
	__m128	dxy;
	__m128	offx;
	__m128	offy;
	__m128	c[16];
	_asm
	{
		mov		eax , dest
		movaps	xmm7 , suv
	len_loop_start:
		dec		len
		jl		len_loop_end

		// (u,v) * 65536 clamped to [0,0xFFFF] , scaled by (w-1,h-1);
		// after the first >> 8 the low 8 bits are the sub texel fraction d ,
		// after the second >> 8 the integer texel position remains
		movaps		xmm4 , xmm7
		mulps		xmm4 , const_f_0x10000
		maxps		xmm4 , const_f_0
		minps		xmm4 , const_f_0_0xFFFF_0_0xFFFF
		cvtps2dq	xmm4 , xmm4
		pmuludq		xmm4 , dwhm1
		psrld		xmm4 , 8
		movaps		xmm5 , xmm4
		pand		xmm5 , const_dw_0_0xFF_0_0xFF
		psrld		xmm4 , 8
		// dxy : element 2 = x fraction , element 0 = y fraction
		movaps		dxy , xmm5

		// broadcast x to the high four words and y to the low four words ,
		// add -1,0,+1,+2 and clamp to [0,w-1] / [0,h-1]
		pshufhw		xmm4 , xmm4 , 0x00
		pshuflw		xmm4 , xmm4 , 0x00
		paddw		xmm4 , const_w_2_1_0_m1_2_1_0_m1
		pminsw		xmm4 , whm1
		pmaxsw		xmm4 , const_w_0

		// offx(x3,x2,x1,x0) , offy(y3,y2,y1,y0) : byte offsets of the four columns / rows
		movaps		xmm5 , xmm4
		punpcklwd	xmm5 , const_w_0
		pmaddwd		xmm5 , pitch
		movaps		offy , xmm5
		punpckhwd	xmm4 , const_w_0
		pmaddwd		xmm4 , pixbyte
		movaps		offx , xmm4

		// load : 16 texels , one byte each , zero extended -> float scaled by 1/255 ,
		// c[ row*4 + column ] ( inside _asm the [] index is a byte offset , hence n*4 and n*16 )
		// row 0
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[0*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[0*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[1*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[1*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[2*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[2*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[0*4]
		add		ecx , offx.m128_i32[3*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[3*16] , xmm4

		// row 1
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[0*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[4*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[1*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[5*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[2*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[6*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[1*4]
		add		ecx , offx.m128_i32[3*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[7*16] , xmm4

		// row 2
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[0*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[8*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[1*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[9*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[2*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[10*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[2*4]
		add		ecx , offx.m128_i32[3*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[11*16] , xmm4

		// row 3
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[0*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[12*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[1*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[13*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[2*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[14*16] , xmm4
		mov		ecx , src
		add		ecx , offy.m128_i32[3*4]
		add		ecx , offx.m128_i32[3*4]
		movzx	ecx , byte ptr[ ecx ]
		movd	xmm4 , ecx
		cvtdq2ps	xmm4 , xmm4
		mulps		xmm4 , const_f_r255
		movaps		c[15*16] , xmm4

		// weight , horizontal pass of row 0 -> xmm0 ( only the lowest float is filtered )
		// c=tbl2[d]*c1 + tbl1[d]*c2 + tbl1[1-d]*c3 + tbl2[1-d]*c4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm0 , [ edx + ecx * 4 ]
		mulss		xmm0 , c[0*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[1*16]
		addss		xmm0 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[2*16]
		addss		xmm0 , xmm6

		// c4 : fourth texel of the row ( was multiplied with c3 a second time )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[3*16]
		addss		xmm0 , xmm6

		// horizontal pass of row 1 -> xmm1
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm1 , [ edx + ecx * 4 ]
		mulss		xmm1 , c[4*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[5*16]
		addss		xmm1 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[6*16]
		addss		xmm1 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[7*16]
		addss		xmm1 , xmm6

		// horizontal pass of row 2 -> xmm2
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm2 , [ edx + ecx * 4 ]
		mulss		xmm2 , c[8*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[9*16]
		addss		xmm2 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[10*16]
		addss		xmm2 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[11*16]
		addss		xmm2 , xmm6

		// horizontal pass of row 3 -> xmm3
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[2*4]
		movd		xmm3 , [ edx + ecx * 4 ]
		mulss		xmm3 , c[12*16]

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[13*16]
		addss		xmm3 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[2*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[14*16]
		addss		xmm3 , xmm6

		// c4
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , c[15*16]
		addss		xmm3 , xmm6

		// vertical pass over the four row results -> xmm4
		// c1
		mov			edx , weighttbl2
		mov			ecx , dxy.m128_i32[0*4]
		movd		xmm4 , [ edx + ecx * 4 ]
		mulss		xmm4 , xmm0

		// c2
		mov			edx , weighttbl1
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm1
		addss		xmm4 , xmm6

		// c3
		mov			ecx , 256
		sub			ecx , dxy.m128_i32[0*4]
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm2
		addss		xmm4 , xmm6

		// c4 : fourth row ( was multiplied with the row 2 result a second time )
		mov			edx , weighttbl2
		movd		xmm6 , [ edx + ecx * 4 ]
		mulss		xmm6 , xmm3
		addss		xmm4 , xmm6

		// scale by alpha , convert to an integer saturated to 0..255 , store one byte
		mulps		xmm4 , sc_alpha_faaaa
		mulps		xmm4 , const_f_255
		cvtps2dq	xmm4 , xmm4
		packssdw	xmm4 , const_dw_0
		packuswb	xmm4 , const_w_0
		movd	ecx , xmm4
		mov		[ eax ] , cl
		add		eax , 1
		addps		xmm7 , duv
		jmp		len_loop_start
	len_loop_end:
	}
}

//=================================================================================================
//!	texture weight16f ( kernel dispatcher )
//!
//!	Looks up one kernel in a static table indexed by [ wrap ][ destformat ][ srcformat ]
//!	and forwards every argument to it unchanged. When alpha.a == 256 the kernel variant
//!	that takes no pp_alpha is called; for any other value the "_alpha_" variant is called
//!	and alpha is passed along.
//!
//!	@param	destformat		second table index ( rows are listed in the order rgb , rgba , a )
//!	@param	dest			forwarded to the kernel
//!	@param	len				forwarded to the kernel
//!	@param	ssu , ssv		forwarded to the kernel
//!	@param	ttu , ttv		forwarded to the kernel
//!	@param	srcformat		third table index ( columns are listed in the order rgb , rgba , a )
//!	@param	src				forwarded to the kernel
//!	@param	src_pitchbyte	forwarded to the kernel
//!	@param	src_w , src_h	forwarded to the kernel
//!	@param	b_color			forwarded to the kernel
//!	@param	alpha			alpha.a selects the table ; forwarded only to the "_alpha_" kernels
//!	@param	wrap			first table index ( listed in the order repeat , clamp )
//!	@param	weighttbl1		forwarded to the kernel
//!	@param	weighttbl2		forwarded to the kernel
//!	@retval			---
//!
//!	@note	NOTE(review): both tables are declared [2][4][4] but only a 3 x 3 grid is listed
//!			per wrap mode, so the remaining slots are null. Presumably the enums never
//!			produce index 3 here -- confirm against pp_format.
//-------------------------------------------------------------------------------------------------
cb_inline
void pp_sse_texture_weight16f_m
		(
		pp_format		destformat , 
		void*			dest , 
		int32			len , 
		float			ssu , 
		float			ssv , 
		float			ttu , 
		float			ttv , 
		pp_format		srcformat , 
		const void*		src , 
		int32			src_pitchbyte , 
		int16			src_w , 
		int16			src_h , 
		const pp_color&	b_color , 
		const pp_alpha&	alpha , 
		pp_wraptype		wrap , 
		const float*	weighttbl1 , 
		const float*	weighttbl2
		)
{
	// kernel signatures : the alpha variant receives one extra argument ( const pp_alpha& )
	typedef void (*kernel_plain)( void* , int32 , float , float , float , float , const void* , int32 , int16 , int16 , const pp_color& , const float* , const float* );
	typedef void (*kernel_alpha)( void* , int32 , float , float , float , float , const void* , int32 , int16 , int16 , const pp_color& , const pp_alpha& , const float* , const float* );

	// kernels without an alpha argument : [ wrap ][ destformat ][ srcformat ]
	static const kernel_plain	plain_kernels[2][4][4] = 
	{
		{	// repeat
			{ pp_sse_texture_weight16f_m_repeat_rgb_rgb  , pp_sse_texture_weight16f_m_repeat_rgb_rgba  , pp_sse_texture_weight16f_m_repeat_rgb_a  } , 
			{ pp_sse_texture_weight16f_m_repeat_rgba_rgb , pp_sse_texture_weight16f_m_repeat_rgba_rgba , pp_sse_texture_weight16f_m_repeat_rgba_a } , 
			{ pp_sse_texture_weight16f_m_repeat_a_rgb    , pp_sse_texture_weight16f_m_repeat_a_rgba    , pp_sse_texture_weight16f_m_repeat_a_a    } 
		} , 
		{	// clamp
			{ pp_sse_texture_weight16f_m_clamp_rgb_rgb  , pp_sse_texture_weight16f_m_clamp_rgb_rgba  , pp_sse_texture_weight16f_m_clamp_rgb_a  } , 
			{ pp_sse_texture_weight16f_m_clamp_rgba_rgb , pp_sse_texture_weight16f_m_clamp_rgba_rgba , pp_sse_texture_weight16f_m_clamp_rgba_a } , 
			{ pp_sse_texture_weight16f_m_clamp_a_rgb    , pp_sse_texture_weight16f_m_clamp_a_rgba    , pp_sse_texture_weight16f_m_clamp_a_a    } 
		} 
	};

	// kernels taking an alpha argument : same index layout as above
	static const kernel_alpha	alpha_kernels[2][4][4] = 
	{
		{	// repeat
			{ pp_sse_texture_weight16f_m_alpha_repeat_rgb_rgb  , pp_sse_texture_weight16f_m_alpha_repeat_rgb_rgba  , pp_sse_texture_weight16f_m_alpha_repeat_rgb_a  } , 
			{ pp_sse_texture_weight16f_m_alpha_repeat_rgba_rgb , pp_sse_texture_weight16f_m_alpha_repeat_rgba_rgba , pp_sse_texture_weight16f_m_alpha_repeat_rgba_a } , 
			{ pp_sse_texture_weight16f_m_alpha_repeat_a_rgb    , pp_sse_texture_weight16f_m_alpha_repeat_a_rgba    , pp_sse_texture_weight16f_m_alpha_repeat_a_a    } 
		} , 
		{	// clamp
			{ pp_sse_texture_weight16f_m_alpha_clamp_rgb_rgb  , pp_sse_texture_weight16f_m_alpha_clamp_rgb_rgba  , pp_sse_texture_weight16f_m_alpha_clamp_rgb_a  } , 
			{ pp_sse_texture_weight16f_m_alpha_clamp_rgba_rgb , pp_sse_texture_weight16f_m_alpha_clamp_rgba_rgba , pp_sse_texture_weight16f_m_alpha_clamp_rgba_a } , 
			{ pp_sse_texture_weight16f_m_alpha_clamp_a_rgb    , pp_sse_texture_weight16f_m_alpha_clamp_a_rgba    , pp_sse_texture_weight16f_m_alpha_clamp_a_a    } 
		} 
	};

	// any alpha.a other than 256 goes through the alpha-aware kernel
	if( alpha.a != 256 )
	{
		const kernel_alpha	kernel = alpha_kernels[ wrap ][ destformat ][ srcformat ];
		kernel( dest , len , ssu , ssv , ttu , ttv , src , src_pitchbyte , src_w , src_h , b_color , alpha , weighttbl1 , weighttbl2 );
		return;
	}

	// alpha.a == 256 : the kernel variant without an alpha argument is used
	const kernel_plain	kernel = plain_kernels[ wrap ][ destformat ][ srcformat ];
	kernel( dest , len , ssu , ssv , ttu , ttv , src , src_pitchbyte , src_w , src_h , b_color , weighttbl1 , weighttbl2 );
}

};	//namespace

//using namespace icubic;		

#pragma pack( pop )			//release align
