/*************************************************************************************************/
/*!
   	@file		pp_sse.h
	@author 	Fanzo
*/
/*************************************************************************************************/
#pragma		once

///////////////////////////////////////////////////////////////////////////////////////////////////
//include files
#include <stddef.h>
#include <xmmintrin.h>



#pragma pack( push , 8 )		//set align

namespace icubic
{

///////////////////////////////////////////////////////////////////////////////////////////////////
// preprocessor define

///////////////////////////////////////////////////////////////////////////////////////////////////
// type define

///////////////////////////////////////////////////////////////////////////////////////////////////
// classes define

///////////////////////////////////////////////////////////////////////////////////////////////////
// global variable define

///////////////////////////////////////////////////////////////////////////////////////////////////
// global functions define

// MMX aliases: short project-style names that expand directly to the _mm_set_pi16 /
// _mm_set_pi32 intrinsics (same arguments, same result).
#define		mmx_set_pi16		_mm_set_pi16
#define		mmx_set_pi32		_mm_set_pi32


// sse_elem_ps( var , p ) : float lvalue for element `p` of the __m128 lvalue `var`.
// Elements are numbered in sse_set_ps argument order: p = 0 is the first sse_set_ps argument
// (float index 3 in memory), p = 3 is the last one (float index 0, the lane used by *_ss ops).
// `var` must be addressable; `p` may be any integer expression in the range 0..3.
#define		sse_elem_ps( var , p )		(*( ( (float*)&(var) ) + ( 3 - (p) ) ))
// sse_addr_align16( addr ) : value of the pointer expression `addr` rounded up to the next
// multiple of 16, as a size_t. The computation is done at full pointer width, so the result
// can be cast back to a pointer on 64-bit targets as well.
#define		sse_addr_align16( addr )	( ( (size_t)(addr) + 15 ) & ~(size_t)15 )

// arithmetic
// Each name below is a plain alias for the _mm_* intrinsic it expands to. Following Intel's
// naming, the *_ss forms operate on the lowest float only and the *_ps forms on all four.
#define		sse_add_ss			_mm_add_ss
#define		sse_add_ps			_mm_add_ps
#define		sse_sub_ss			_mm_sub_ss
#define		sse_sub_ps			_mm_sub_ps
#define		sse_mul_ss			_mm_mul_ss
#define		sse_mul_ps			_mm_mul_ps
#define		sse_div_ss			_mm_div_ss
#define		sse_div_ps			_mm_div_ps
#define		sse_sqrt_ss			_mm_sqrt_ss
#define		sse_sqrt_ps			_mm_sqrt_ps
// rsqrt is the hardware reciprocal-square-root approximation, not an exact 1/sqrt.
#define		sse_rsqrt_ss		_mm_rsqrt_ss
#define		sse_rsqrt_ps		_mm_rsqrt_ps

// logic (bitwise operations on the raw 128 bits)
#define		sse_and_ps			_mm_and_ps
#define		sse_andnot_ps		_mm_andnot_ps
#define		sse_or_ps			_mm_or_ps
#define		sse_xor_ps			_mm_xor_ps

// compare (each result lane is an all-ones or all-zeros bit mask)
#define		sse_cmplt_ss		_mm_cmplt_ss
#define		sse_cmplt_ps		_mm_cmplt_ps
#define		sse_cmpgt_ss		_mm_cmpgt_ss
#define		sse_cmpgt_ps		_mm_cmpgt_ps


// set / shuffle
// sse_set_ps takes its arguments from the highest lane down to the lowest.
#define		sse_setzero_ps		_mm_setzero_ps
#define		sse_set_ps			_mm_set_ps
#define		sse_set_ps1			_mm_set_ps1
#define		sse_shuffle_ps		_mm_shuffle_ps

// system (read / write the MXCSR control and status register)
#define		sse_getcsr			_mm_getcsr
#define		sse_setcsr			_mm_setcsr

// shift (lane rotations)
// Lanes are pictured in sse_set_ps order, highest lane on the left. "left N" rotates every
// float N positions towards the highest lane (wrapping around), "right N" towards the lowest.
// A rotation by 3 one way equals a rotation by 1 the other way, and the two rotations by 2
// are the same swap of the upper and lower halves.
// The argument is expanded twice, so pass an expression without side effects.
#define		sse_shift_left1_ps( a )			_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 2 , 1 , 0 , 3 ) )
#define		sse_shift_right1_ps( a )		_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 0 , 3 , 2 , 1 ) )
#define		sse_shift_left2_ps( a )			_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 1 , 0 , 3 , 2 ) )
#define		sse_shift_right2_ps( a )		_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 1 , 0 , 3 , 2 ) )
#define		sse_shift_left3_ps( a )			_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 0 , 3 , 2 , 1 ) )
#define		sse_shift_right3_ps( a )		_mm_shuffle_ps( (a) , (a) , _MM_SHUFFLE( 2 , 1 , 0 , 3 ) )

//=================================================================================================
//!	sse_set_b : pack sixteen signed bytes into one 128-bit value.
//!	Arguments run from the highest byte down to the lowest: v0 is stored in byte 15 and
//!	v15 in byte 0.
//!	(icubic::int8 is declared outside this file; presumably a signed 8-bit integer.)
//!	@param[in]	v0..v15		bytes to store ( v0 -> byte 15 , ... , v15 -> byte 0 )
//!	@retval			the sixteen bytes packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_b
		(
		icubic::int8		v0 , 
		icubic::int8		v1 , 
		icubic::int8		v2 , 
		icubic::int8		v3 , 
		icubic::int8		v4 , 
		icubic::int8		v5 , 
		icubic::int8		v6 , 
		icubic::int8		v7 , 
		icubic::int8		v8 , 
		icubic::int8		v9 , 
		icubic::int8		v10 , 
		icubic::int8		v11 , 
		icubic::int8		v12 , 
		icubic::int8		v13 , 
		icubic::int8		v14 , 
		icubic::int8		v15
		)
{
	// Store through the signed byte view (m128_i8) so the signed arguments are written without
	// a signed-to-unsigned conversion, the same way sse_set_w stores through m128_i16.
	// Every one of the 16 bytes is assigned, so r needs no separate initialisation.
	__m128	r;
	r.m128_i8[ 15 ]	= v0;
	r.m128_i8[ 14 ]	= v1;
	r.m128_i8[ 13 ]	= v2;
	r.m128_i8[ 12 ]	= v3;
	r.m128_i8[ 11 ]	= v4;
	r.m128_i8[ 10 ]	= v5;
	r.m128_i8[ 9 ]	= v6;
	r.m128_i8[ 8 ]	= v7;
	r.m128_i8[ 7 ]	= v8;
	r.m128_i8[ 6 ]	= v9;
	r.m128_i8[ 5 ]	= v10;
	r.m128_i8[ 4 ]	= v11;
	r.m128_i8[ 3 ]	= v12;
	r.m128_i8[ 2 ]	= v13;
	r.m128_i8[ 1 ]	= v14;
	r.m128_i8[ 0 ]	= v15;
	return r;
}
//=================================================================================================
//!	sse_set_ub : pack sixteen unsigned bytes into one 128-bit value.
//!	Arguments run from the highest byte down to the lowest: v0 is stored in byte 15 and
//!	v15 in byte 0.
//!	(icubic::uint8 is declared outside this file; presumably an unsigned 8-bit integer.)
//!	@param[in]	v0..v15		bytes to store ( v0 -> byte 15 , ... , v15 -> byte 0 )
//!	@retval			the sixteen bytes packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_ub
		(
		icubic::uint8		v0 , 
		icubic::uint8		v1 , 
		icubic::uint8		v2 , 
		icubic::uint8		v3 , 
		icubic::uint8		v4 , 
		icubic::uint8		v5 , 
		icubic::uint8		v6 , 
		icubic::uint8		v7 , 
		icubic::uint8		v8 , 
		icubic::uint8		v9 , 
		icubic::uint8		v10 , 
		icubic::uint8		v11 , 
		icubic::uint8		v12 , 
		icubic::uint8		v13 , 
		icubic::uint8		v14 , 
		icubic::uint8		v15
		)
{
	// Store through the unsigned byte view (m128_u8) so values above 127 are written without
	// an unsigned-to-signed conversion, the same way sse_set_uw stores through m128_u16.
	// Every one of the 16 bytes is assigned, so r needs no separate initialisation.
	__m128	r;
	r.m128_u8[ 15 ]	= v0;
	r.m128_u8[ 14 ]	= v1;
	r.m128_u8[ 13 ]	= v2;
	r.m128_u8[ 12 ]	= v3;
	r.m128_u8[ 11 ]	= v4;
	r.m128_u8[ 10 ]	= v5;
	r.m128_u8[ 9 ]	= v6;
	r.m128_u8[ 8 ]	= v7;
	r.m128_u8[ 7 ]	= v8;
	r.m128_u8[ 6 ]	= v9;
	r.m128_u8[ 5 ]	= v10;
	r.m128_u8[ 4 ]	= v11;
	r.m128_u8[ 3 ]	= v12;
	r.m128_u8[ 2 ]	= v13;
	r.m128_u8[ 1 ]	= v14;
	r.m128_u8[ 0 ]	= v15;
	return r;
}
//=================================================================================================
//!	sse_set_w : pack eight signed 16-bit words into one 128-bit value.
//!	Arguments run from the highest word down to the lowest ( v0 -> word 7 , v7 -> word 0 ).
//!	@param[in]	v0..v7		words to store
//!	@retval			the eight words packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_w
		(
		icubic::int16		v0 , 
		icubic::int16		v1 , 
		icubic::int16		v2 , 
		icubic::int16		v3 , 
		icubic::int16		v4 , 
		icubic::int16		v5 , 
		icubic::int16		v6 , 
		icubic::int16		v7
		)
{
	__m128	packed;

	// lower half : words 0..3 receive the last four arguments
	packed.m128_i16[ 0 ] = v7;
	packed.m128_i16[ 1 ] = v6;
	packed.m128_i16[ 2 ] = v5;
	packed.m128_i16[ 3 ] = v4;

	// upper half : words 4..7 receive the first four arguments
	packed.m128_i16[ 4 ] = v3;
	packed.m128_i16[ 5 ] = v2;
	packed.m128_i16[ 6 ] = v1;
	packed.m128_i16[ 7 ] = v0;

	return packed;
}
//=================================================================================================
//!	sse_set_uw : pack eight unsigned 16-bit words into one 128-bit value.
//!	Arguments run from the highest word down to the lowest ( v0 -> word 7 , v7 -> word 0 ).
//!	@param[in]	v0..v7		words to store
//!	@retval			the eight words packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_uw
		(
		icubic::uint16		v0 , 
		icubic::uint16		v1 , 
		icubic::uint16		v2 , 
		icubic::uint16		v3 , 
		icubic::uint16		v4 , 
		icubic::uint16		v5 , 
		icubic::uint16		v6 , 
		icubic::uint16		v7
		)
{
	__m128	packed;

	// lower half : words 0..3 receive the last four arguments
	packed.m128_u16[ 0 ] = v7;
	packed.m128_u16[ 1 ] = v6;
	packed.m128_u16[ 2 ] = v5;
	packed.m128_u16[ 3 ] = v4;

	// upper half : words 4..7 receive the first four arguments
	packed.m128_u16[ 4 ] = v3;
	packed.m128_u16[ 5 ] = v2;
	packed.m128_u16[ 6 ] = v1;
	packed.m128_u16[ 7 ] = v0;

	return packed;
}
//=================================================================================================
//!	sse_set_dw : pack four signed 32-bit values into one 128-bit value.
//!	Arguments run from the highest dword down to the lowest ( v0 -> dword 3 , v3 -> dword 0 ).
//!	@param[in]	v0..v3		dwords to store
//!	@retval			the four dwords packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_dw
		(
		icubic::int32		v0 , 
		icubic::int32		v1 , 
		icubic::int32		v2 , 
		icubic::int32		v3
		)
{
	__m128	packed;

	// written in argument order, i.e. from the highest dword down
	packed.m128_i32[ 3 ] = v0;
	packed.m128_i32[ 2 ] = v1;
	packed.m128_i32[ 1 ] = v2;
	packed.m128_i32[ 0 ] = v3;

	return packed;
}
//=================================================================================================
//!	sse_set_udw : pack four unsigned 32-bit values into one 128-bit value.
//!	Arguments run from the highest dword down to the lowest ( v0 -> dword 3 , v3 -> dword 0 ).
//!	@param[in]	v0..v3		dwords to store
//!	@retval			the four dwords packed into a __m128
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_set_udw
		(
		icubic::uint32		v0 , 
		icubic::uint32		v1 , 
		icubic::uint32		v2 , 
		icubic::uint32		v3
		)
{
	__m128	packed;

	// written in argument order, i.e. from the highest dword down
	packed.m128_u32[ 3 ] = v0;
	packed.m128_u32[ 2 ] = v1;
	packed.m128_u32[ 1 ] = v2;
	packed.m128_u32[ 0 ] = v3;

	return packed;
}
//=================================================================================================
//!	sse_elem_pppp_ps : horizontal sum  e0 + e1 + e2 + e3
//!	( e0..e3 are the elements in sse_set_ps argument order, e3 being the lowest lane )
//!	@param[in]	a		four packed floats
//!	@retval			the sum, accumulated as ((e3 + e0) + e1) + e2
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_pppp_ps
		(
		__m128	a
		)
{
	// Rotating left by 1, 2 and 3 brings e0, e1 and e2 into the lowest lane in turn.
	const __m128	e0_low = sse_shift_left1_ps( a );
	const __m128	e1_low = sse_shift_left2_ps( a );
	const __m128	e2_low = sse_shift_left3_ps( a );

	// Only the lowest lane is accumulated; it starts out holding e3.
	__m128	sum = sse_add_ss( a , e0_low );
	sum = sse_add_ss( sum , e1_low );
	sum = sse_add_ss( sum , e2_low );

	return sse_elem_ps( sum , 3 );
}
//=================================================================================================
//!	sse_elem_ppmm_ps : signed horizontal sum  e0 + e1 - e2 - e3
//!	( e0..e3 are the elements in sse_set_ps argument order, e3 being the lowest lane )
//!	@param[in]	a		four packed floats
//!	@retval			the result, accumulated as (((0 - e3) + e0) + e1) - e2
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_ppmm_ps
		(
		__m128	a
		)
{
	// Rotating left by 1, 2 and 3 brings e0, e1 and e2 into the lowest lane in turn.
	const __m128	e0_low = sse_shift_left1_ps( a );
	const __m128	e1_low = sse_shift_left2_ps( a );
	const __m128	e2_low = sse_shift_left3_ps( a );

	// Start from 0 - e3 in the lowest lane, then apply the remaining terms with their signs.
	__m128	acc = sse_sub_ss( sse_setzero_ps() , a );
	acc = sse_add_ss( acc , e0_low );
	acc = sse_add_ss( acc , e1_low );
	acc = sse_sub_ss( acc , e2_low );

	return sse_elem_ps( acc , 3 );
}
//=================================================================================================
//!	sse_elem_pmpm_ps : alternating horizontal sum  e0 - e1 + e2 - e3
//!	( e0..e3 are the elements in sse_set_ps argument order, e3 being the lowest lane )
//!	@param[in]	a		four packed floats
//!	@retval			the result, accumulated as (((0 - e3) + e2) - e1) + e0
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_pmpm_ps
		(
		__m128	a
		)
{
	// Rotating right by 1, 2 and 3 brings e2, e1 and e0 into the lowest lane in turn.
	const __m128	e2_low = sse_shift_right1_ps( a );
	const __m128	e1_low = sse_shift_right2_ps( a );
	const __m128	e0_low = sse_shift_right3_ps( a );

	// Start from 0 - e3 in the lowest lane, then apply the remaining terms with their signs.
	__m128	acc = sse_sub_ss( sse_setzero_ps() , a );
	acc = sse_add_ss( acc , e2_low );
	acc = sse_sub_ss( acc , e1_low );
	acc = sse_add_ss( acc , e0_low );

	return sse_elem_ps( acc , 3 );
}
//=================================================================================================
//!	sse_elem_ppp_ps : partial horizontal sum  e0 + e1 + e2  ( e3 , the lowest lane , is ignored )
//!	( e0..e3 are the elements in sse_set_ps argument order )
//!	@param[in]	a		four packed floats
//!	@retval			the sum, accumulated as (e2 + e1) + e0
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_ppp_ps
		(
		__m128	a
		)
{
	// Rotating right by 1, 2 and 3 brings e2, e1 and e0 into the lowest lane in turn.
	const __m128	e2_low = sse_shift_right1_ps( a );
	const __m128	e1_low = sse_shift_right2_ps( a );
	const __m128	e0_low = sse_shift_right3_ps( a );

	__m128	sum = sse_add_ss( e2_low , e1_low );
	sum = sse_add_ss( sum , e0_low );

	return sse_elem_ps( sum , 3 );
}
//=================================================================================================
//!	sse_elem_pp_ps : e0 + e1  ( sum of the first two elements in sse_set_ps argument order )
//!	@param[in]	a		four packed floats ; e2 and e3 are ignored
//!	@retval			e1 + e0
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_pp_ps
		(
		__m128	a
		)
{
	// Bring e1 and e0 into the lowest lane of two separate registers, then add that lane.
	const __m128	e1_low = sse_shift_right2_ps( a );
	const __m128	e0_low = sse_shift_right3_ps( a );
	__m128	sum = sse_add_ss( e1_low , e0_low );

	return sse_elem_ps( sum , 3 );
}
//=================================================================================================
//!	sse_elem_pm_ps : e0 - e1  ( difference of the first two elements in sse_set_ps argument order )
//!	@param[in]	a		four packed floats ; e2 and e3 are ignored
//!	@retval			e0 - e1
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_elem_pm_ps
		(
		__m128	a
		)
{
	// Bring e0 and e1 into the lowest lane of two separate registers, then subtract that lane.
	const __m128	e0_low = sse_shift_right3_ps( a );
	const __m128	e1_low = sse_shift_right2_ps( a );
	__m128	diff = sse_sub_ss( e0_low , e1_low );

	return sse_elem_ps( diff , 3 );
}
//=================================================================================================
//!	sse_calc_raw_column_elem3 : three-term dot product, one element of a 3x3 matrix product.
//!	Multiplies the two inputs lane by lane and sums e0 + e1 + e2 of the products; the e3
//!	lane of both inputs does not affect the result.
//!	@param[in]	raw			first operand ( presumably a matrix row )
//!	@param[in]	column		second operand ( a matrix column )
//!	@retval			raw.e0*column.e0 + raw.e1*column.e1 + raw.e2*column.e2
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_calc_raw_column_elem3
		(
		__m128	raw , 
		__m128	column 
		)
{
	return sse_elem_ppp_ps( sse_mul_ps( raw , column ) );
}
//=================================================================================================
//!	sse_calc_raw_column_elem3 : three-term dot product with the column given as scalars.
//!	The three scalars are packed as e0..e2 ( e3 = 0 ), multiplied lane by lane with raw, and
//!	e0 + e1 + e2 of the products is returned.
//!	@param[in]	raw			first operand ( presumably a matrix row )
//!	@param[in]	column0		column value paired with raw.e0
//!	@param[in]	column1		column value paired with raw.e1
//!	@param[in]	column2		column value paired with raw.e2
//!	@retval			raw.e0*column0 + raw.e1*column1 + raw.e2*column2
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_calc_raw_column_elem3
		(
		__m128	raw , 
		float	column0 , 
		float	column1 , 
		float	column2
		)
{
	const __m128	packed_column	= sse_set_ps( column0 , column1 , column2 , 0.0f );
	const __m128	products		= sse_mul_ps( raw , packed_column );
	return sse_elem_ppp_ps( products );
}
//=================================================================================================
//!	sse_abs_ps : per-lane absolute value.
//!	Lanes that compare greater than zero are kept as they are; every other lane is replaced
//!	by 0 - value.
//!	@param[in]	a		four packed floats
//!	@retval			four packed absolute values
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_abs_ps
		(
		__m128	a
		)
{
	// All-ones mask in the lanes where a > 0, all-zeros in the others.
	const __m128	positive_lanes = sse_cmpgt_ps( a , sse_setzero_ps() );

	// Split a into the part that is already positive and the part that must be negated;
	// each lane is zero in exactly one of the two.
	const __m128	kept		= sse_and_ps( positive_lanes , a );
	const __m128	to_negate	= sse_andnot_ps( positive_lanes , a );

	return sse_sub_ps( kept , to_negate );
}
//=================================================================================================
//!	sse_sqrt_f : square root of a single float, computed with the scalar SSE sqrt.
//!	@param[in]	v		input value
//!	@retval			square root of v
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_sqrt_f
		(
		float	v
		)
{
	// _mm_set_ss places v in the lowest lane and zeroes the other three, so the register
	// handed to the scalar sqrt is fully initialised ( no indeterminate lanes are read ).
	__m128	t = _mm_set_ss( v );
	t = sse_sqrt_ss( t );
	return sse_elem_ps( t , 3 );
}
//=================================================================================================
//!	sse_length_f : length of the 2D vector ( x , y ) , i.e. sqrt( x*x + y*y ).
//!	@param[in]	x		first component
//!	@param[in]	y		second component
//!	@retval			the vector length
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_length_f
		(
		float	x , 
		float	y
		)
{
	// x and y occupy e0 and e1 ; the two unused lanes are zero.
	const __m128	xy		= sse_set_ps( x , y , 0.0f , 0.0f );
	const __m128	squared	= sse_mul_ps( xy , xy );

	return sse_sqrt_f( sse_elem_pp_ps( squared ) );
}
//=================================================================================================
//!	sse_length_ps : four 2D lengths built from neighbouring elements of s.
//!	e0 = length( s.e0 , s.e1 )
//!	e1 = length( s.e1 , s.e2 )
//!	e2 = length( s.e2 , s.e3 )
//!	e3 = length( s.e3 , s.e0 )
//!	@param[in]	s		four packed floats
//!	@retval			the four lengths, packed
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_length_ps
		(
		__m128	s
		)
{
	const __m128	squared		= sse_mul_ps( s , s );
	// Rotate by one so that every lane lines up with the square of its neighbour.
	const __m128	neighbour	= sse_shift_left1_ps( squared );

	return sse_sqrt_ps( sse_add_ps( squared , neighbour ) );
}
//=================================================================================================
//!	sse_rlength_ps : four reciprocal 2D lengths built from neighbouring elements of s.
//!	e0 = 1/length( s.e0 , s.e1 )
//!	e1 = 1/length( s.e1 , s.e2 )
//!	e2 = 1/length( s.e2 , s.e3 )
//!	e3 = 1/length( s.e3 , s.e0 )
//!	The reciprocal square root comes from sse_rsqrt_ps, the hardware approximation.
//!	@param[in]	s		four packed floats
//!	@retval			the four reciprocal lengths, packed
//-------------------------------------------------------------------------------------------------
cb_inline
__m128 sse_rlength_ps
		(
		__m128	s
		)
{
	const __m128	squared		= sse_mul_ps( s , s );
	// Rotate by one so that every lane lines up with the square of its neighbour.
	const __m128	neighbour	= sse_shift_left1_ps( squared );

	return sse_rsqrt_ps( sse_add_ps( squared , neighbour ) );
}
//=================================================================================================
//!	sse_inner_f : 2D inner ( dot ) product  x0*x1 + y0*y1
//!	@param[in]	x0 , y0		first vector
//!	@param[in]	x1 , y1		second vector
//!	@retval			the inner product
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_inner_f
		(
		float	x0 , 
		float	y0 , 
		float	x1 , 
		float	y1
		)
{
	// Both vectors are packed as ( x , y , 0 , 0 ) so the lane-wise product holds
	// x0*x1 in e0 and y0*y1 in e1.
	const __m128	lhs			= sse_set_ps( x0 , y0 , 0.0f , 0.0f );
	const __m128	rhs			= sse_set_ps( x1 , y1 , 0.0f , 0.0f );
	const __m128	products	= sse_mul_ps( lhs , rhs );

	return sse_elem_pp_ps( products );
}
//=================================================================================================
//!	sse_outer_f : 2D outer ( cross ) product  x0*y1 - y0*x1
//!	@param[in]	x0 , y0		first vector
//!	@param[in]	x1 , y1		second vector
//!	@retval			the scalar cross product
//-------------------------------------------------------------------------------------------------
cb_inline
float sse_outer_f
		(
		float	x0 , 
		float	y0 , 
		float	x1 , 
		float	y1
		)
{
	// The second vector is packed with its components swapped, so the lane-wise product
	// holds x0*y1 in e0 and y0*x1 in e1.
	const __m128	lhs			= sse_set_ps( x0 , y0 , 0.0f , 0.0f );
	const __m128	rhs_swapped	= sse_set_ps( y1 , x1 , 0.0f , 0.0f );
	const __m128	products	= sse_mul_ps( lhs , rhs_swapped );

	return sse_elem_pm_ps( products );
}

};	//namespace

#include	"../cpp/pp_cpp.h"
#include	"pp_sse_blender_onezero.h"
#include	"pp_sse_blender_overlap.h"
#include	"pp_sse_downsample.h"
#include	"pp_sse_oversample.h"
#include	"pp_sse_gradation_linear.h"
#include	"pp_sse_gradation_radial.h"
#include	"pp_sse_texture_nearest.h"
#include	"pp_sse_texture_weight4.h"
#include	"pp_sse_texture_weight16.h"

//using namespace icubic;		

#pragma pack( pop )			//release align
