//	Roast+ License

//	SIMD

#ifndef __SFJP_OPENMGL_roast_vecpro_HPP__
#define __SFJP_OPENMGL_roast_vecpro_HPP__

#include "roast/simd_lambda/simd_core.hpp"
#include "roast/std/parallelable.hpp"
#include <string>
#include <memory.h>

namespace roast{

	/*
	 *	Message-carrying error object declared alongside the vecpro classes.
	 *	Nothing in the visible part of this file throws it.
	 *	The "exeption" spelling is the public type name and is kept as-is.
	 */
	class vecpro_exeption
	{
	protected:
		std::string m_msg;	//	Owned copy of the message text.
	public:
		//	msg: NUL-terminated message text. A null pointer is stored as an
		//	empty message instead of being handed to std::string (undefined behaviour).
		vecpro_exeption(const char* msg) : m_msg(msg ? msg : "") {}

		//	Returns the stored message as a C string. The pointer stays valid
		//	while this object is alive and its message is not modified.
		//	const-qualified so it can be called on an object caught by const reference.
		const char* get() const { return m_msg.c_str(); }
	};

	//////////////////////////////////////////////////

	/*
	 *	vecpro_float - a vector of _SIZE floats kept in consecutive SIMD registers.
	 *
	 *	Template parameters:
	 *	  _SIMD_Ty    : one of the ROAST_PARALLELABLE_TYPE_* values (roast/std/parallelable.hpp).
	 *	                Only the SSE range [ROAST_PARALLELABLE_TYPE_SIMD_SSE,
	 *	                ROAST_PARALLELABLE_TYPE_SIMD_SSE_END] is implemented; for any
	 *	                other value every operation is a no-op.
	 *	  _SIMD_CLASS : backend providing static movaps() / mulps().
	 *	  _MAX_COUNT  : not referenced by the code in this class.
	 *	  _SIZE       : number of floats in the vector.
	 *
	 *	Register k (0..7) corresponds to elements [4*k, 4*k+3]. A vector that
	 *	starts at element index s uses the registers from s/4 up to the one
	 *	covering element s+_SIZE-1, capped at register 7.
	 *
	 *	NOTE: the start index must be one of 0, 4, 8, ..., 28. Any other value
	 *	matches no switch label, so nothing is loaded, stored or multiplied.
	 *
	 *	NOTE(review): movaps/mulps are assumed to mirror the SSE instructions of
	 *	the same name (aligned 4-float move, packed multiply), which would require
	 *	16-byte aligned buffers - confirm against roast/simd_lambda/simd_core.hpp.
	 *
	 *	The switches below fall through on purpose: execution enters at the first
	 *	register of the vector and continues until the end of the vector is reached.
	 *	Register numbers are kept as literals in case the backend needs constants.
	 */
	template <int _SIMD_Ty, typename _SIMD_CLASS, int _MAX_COUNT, int _SIZE>
	class vecpro_float
	{
	private:
		typedef float _Ty;
		typedef vecpro_float _ThisTy;

		//	Index of the first SIMD register used by this vector
		//	(= start element index / 4, set by load_aligned()).
		int m_start_index;

	public:
		//	Constructor/Destructor
		//	The vector initially refers to register 0 until load_aligned() is called.
		vecpro_float() : m_start_index(0) {
		}
		virtual ~vecpro_float(){}

		/////////////////////////////////////////////////////////

		//	Short alias of load_aligned().
		inline _ThisTy& load_al(const _Ty *p_floats, int start_index){
			return load_aligned( p_floats, start_index );
		}

		//	Loads _SIZE floats from p_floats into the registers that begin at
		//	element index start_index, and remembers the first register.
		//	  p_floats    : source buffer; p_floats[0] goes to the first register.
		//	  start_index : first element index (0, 4, ..., 28).
		//	Returns *this.
		inline _ThisTy& load_aligned(const _Ty *p_floats, int start_index)
		{
			m_start_index = start_index/4;

			//	SSE family
			if ( _SIMD_Ty >= ROAST_PARALLELABLE_TYPE_SIMD_SSE &&
				 _SIMD_Ty <= ROAST_PARALLELABLE_TYPE_SIMD_SSE_END )
			{
				//	One past the last element index occupied by this vector.
				const int end_index = _SIZE + start_index;

				//	Register k reads from p_floats + (4*k - start_index); the offset
				//	is never negative on any reachable path, so no pointer before
				//	the buffer is ever formed.
				switch(start_index)
				{
				case 0:
					_SIMD_CLASS::movaps(0, p_floats + (0 - start_index));
					//	fall through
				case 4:
					if ( end_index <= 4 )
						break;
					_SIMD_CLASS::movaps(1, p_floats + (4 - start_index));
					//	fall through
				case 8:
					if ( end_index <= 8 )
						break;
					_SIMD_CLASS::movaps(2, p_floats + (8 - start_index));
					//	fall through
				case 12:
					if ( end_index <= 12 )
						break;
					_SIMD_CLASS::movaps(3, p_floats + (12 - start_index));
					//	fall through
				case 16:
					if ( end_index <= 16 )
						break;
					_SIMD_CLASS::movaps(4, p_floats + (16 - start_index));
					//	fall through
				case 20:
					if ( end_index <= 20 )
						break;
					_SIMD_CLASS::movaps(5, p_floats + (20 - start_index));
					//	fall through
				case 24:
					if ( end_index <= 24 )
						break;
					_SIMD_CLASS::movaps(6, p_floats + (24 - start_index));
					//	fall through
				case 28:
					if ( end_index <= 28 )
						break;
					_SIMD_CLASS::movaps(7, p_floats + (28 - start_index));
				}
			}

			return *this;
		}


		//	Short alias of get_aligned().
		inline _Ty* get_al(_Ty *p_floats){
			return get_aligned( p_floats );
		}

		//	Stores the vector into p_floats (see store_aligned) and returns p_floats.
		inline _Ty* get_aligned(_Ty *p_floats){
			store_aligned( p_floats );
			return p_floats;
		}

		//	Short alias of store_aligned().
		inline _ThisTy& store_al(_Ty *p_floats){
			return store_aligned( p_floats );
		}

		//	Writes the registers of this vector back to p_floats; the first
		//	register (recorded by load_aligned) goes to p_floats[0..3].
		//	  p_floats : destination buffer for _SIZE floats.
		//	Returns *this.
		inline _ThisTy& store_aligned(_Ty *p_floats)
		{
			//	SSE family
			if ( _SIMD_Ty >= ROAST_PARALLELABLE_TYPE_SIMD_SSE &&
				 _SIMD_Ty <= ROAST_PARALLELABLE_TYPE_SIMD_SSE_END )
			{
				//	Recover the element index from the stored register index.
				const int start_index = m_start_index*4;
				const int end_index = _SIZE + start_index;

				//	Register k writes to p_floats + (4*k - start_index).
				switch(start_index)
				{
				case 0:
					_SIMD_CLASS::movaps(p_floats + (0 - start_index), 0);
					//	fall through
				case 4:
					if ( end_index <= 4 )
						break;
					_SIMD_CLASS::movaps(p_floats + (4 - start_index), 1);
					//	fall through
				case 8:
					if ( end_index <= 8 )
						break;
					_SIMD_CLASS::movaps(p_floats + (8 - start_index), 2);
					//	fall through
				case 12:
					if ( end_index <= 12 )
						break;
					_SIMD_CLASS::movaps(p_floats + (12 - start_index), 3);
					//	fall through
				case 16:
					if ( end_index <= 16 )
						break;
					_SIMD_CLASS::movaps(p_floats + (16 - start_index), 4);
					//	fall through
				case 20:
					if ( end_index <= 20 )
						break;
					_SIMD_CLASS::movaps(p_floats + (20 - start_index), 5);
					//	fall through
				case 24:
					if ( end_index <= 24 )
						break;
					_SIMD_CLASS::movaps(p_floats + (24 - start_index), 6);
					//	fall through
				case 28:
					if ( end_index <= 28 )
						break;
					_SIMD_CLASS::movaps(p_floats + (28 - start_index), 7);
				}
			}

			return *this;
		}


		//	Element-wise multiply: the i-th register of this vector is multiplied
		//	by the i-th register of `from`, via _SIMD_CLASS::mulps(dst, src).
		//	Returns *this.
		inline _ThisTy& operator *= (const _ThisTy& from)
		{
			//	SSE family
			if ( _SIMD_Ty >= ROAST_PARALLELABLE_TYPE_SIMD_SSE &&
				 _SIMD_Ty <= ROAST_PARALLELABLE_TYPE_SIMD_SSE_END )
			{
				//	The switch labels are element indices, so convert the stored
				//	register index back before dispatching.
				const int start_index = m_start_index*4;
				const int end_index = _SIZE + start_index;

				//	Distance from a destination register to the matching source
				//	register: dst register k pairs with src register k + src_shift.
				const int src_shift = from.m_start_index - m_start_index;

				switch(start_index)
				{
				case 0:
					_SIMD_CLASS::mulps(0, 0 + src_shift);
					//	fall through
				case 4:
					if ( end_index <= 4 )
						break;
					_SIMD_CLASS::mulps(1, 1 + src_shift);
					//	fall through
				case 8:
					if ( end_index <= 8 )
						break;
					_SIMD_CLASS::mulps(2, 2 + src_shift);
					//	fall through
				case 12:
					if ( end_index <= 12 )
						break;
					_SIMD_CLASS::mulps(3, 3 + src_shift);
					//	fall through
				case 16:
					if ( end_index <= 16 )
						break;
					_SIMD_CLASS::mulps(4, 4 + src_shift);
					//	fall through
				case 20:
					if ( end_index <= 20 )
						break;
					_SIMD_CLASS::mulps(5, 5 + src_shift);
					//	fall through
				case 24:
					if ( end_index <= 24 )
						break;
					_SIMD_CLASS::mulps(6, 6 + src_shift);
					//	fall through
				case 28:
					if ( end_index <= 28 )
						break;
					_SIMD_CLASS::mulps(7, 7 + src_shift);
				}
			}

			return *this;
		}
	};

	/*typedef vecpro<ROAST_PARALLELABLE_TYPE_SIMD_SSE, roast::simd::sse1> vecpro_sse;
	typedef vecpro<ROAST_PARALLELABLE_TYPE_SIMD_SSE2, roast::simd::sse2> vecpro_sse2;
	typedef vecpro<ROAST_PARALLELABLE_TYPE_SIMD_SSE3, roast::simd::sse3> vecpro_sse3;
	typedef vecpro<ROAST_PARALLELABLE_TYPE_SIMD_SSE4, roast::simd::sse4> vecpro_sse4;*/
	/*typedef vecpro<roast::simd::sse1, 32> vecpro_sse;
	typedef vecpro<roast::simd::sse2, 32> vecpro_sse2;
	typedef vecpro<roast::simd::sse3, 32> vecpro_sse3;
	typedef vecpro<roast::simd::sse4, 32> vecpro_sse4;*/
	//	vecpro_float bound to the SSE2 backend: _SIMD_Ty is
	//	ROAST_PARALLELABLE_TYPE_SIMD_SSE2, the backend class is roast::simd::sse2,
	//	_MAX_COUNT is 32, and only the element count _SIZE is left to the user.
	template <int _SIZE>
	class vecpro_float_sse2
		: public vecpro_float<
			ROAST_PARALLELABLE_TYPE_SIMD_SSE2,
			roast::simd::sse2,
			32,
			_SIZE>
	{
	};
}

#endif//__SFJP_OPENMGL_roast_vecpro_HPP__
