// Encoder.h
// (c) 2004-2005 exeal

#ifndef _ENCODER_H_
#define _ENCODER_H_

#include "../AscensionCommon.h"
#include <cassert>
#include <set>
#include <map>


namespace Ascension {
namespace Encodings {

///	Identifier of a character encoding, expressed as a Windows code page number.
///	Values outside the Windows-defined set are declared as CPEX_* constants.
typedef uint CodePage;

//	Code page identifiers used by this library: encodings that have no Windows
//	code page, or that the library implements by itself (translated from the
//	original comment). Values in the 70000 range are library-private numbers.
const CodePage
	CPEX_UNICODE_UTF16LE		= 1200,		///< UTF-16
	CPEX_UNICODE_UTF16BE		= 1201,		///< UTF-16 big endian
	CPEX_UNICODE_UTF32LE		= 12000,	///< UTF-32
	CPEX_UNICODE_UTF32BE		= 12001,	///< UTF-32 big endian
	CPEX_AUTODETECT				= 50001,	///< Auto-detect
	CPEX_JAPANESE_AUTODETECT	= 50932,	///< Japanese (auto-detect)
	CPEX_KOREAN_AUTODETECT		= 50949,	///< Korean (auto-detect)
	CPEX_AUTODETECT_SYSTEMLANG	= 70000,	///< Auto-detect (system language)
	CPEX_AUTODETECT_USERLANG	= 70001,	///< Auto-detect (user language)
	CPEX_UNICODE_AUTODETECT		= 70010,	///< Unicode (auto-detect)
	CPEX_UNICODE_UTF5			= 70011,	///< UTF-5
	CPEX_ARMENIAN_AUTODETECT	= 70020,	///< Armenian (auto-detect)
	CPEX_ARMENIAN_ARMSCII7		= 70021,	///< Armenian (ARMSCII-7)
	CPEX_ARMENIAN_ARMSCII8		= 70022,	///< Armenian (ARMSCII-8)
	CPEX_ARMENIAN_ARMSCII8A		= 70023,	///< Armenian (ARMSCII-8A)
	CPEX_VIETNAMESE_AUTODETECT	= 70030,	///< Vietnamese (auto-detect)
	CPEX_VIETNAMESE_TCVN		= 70031,	///< Vietnamese (TCVN)
	CPEX_VIETNAMESE_VISCII		= 70032,	///< Vietnamese (VISCII)
	CPEX_VIETNAMESE_VPS			= 70033,	///< Vietnamese (VPS)
	CPEX_JAPANESE_ISO2022JP		= 70040,	///< Japanese (ISO-2022-JP)
	CPEX_JAPANESE_SHIFTJIS		= 70041,	///< Japanese (Shift JIS)
	CPEX_JAPANESE_ISO2022JP1	= 70042,	///< Japanese (ISO-2022-JP-1)
	CPEX_JAPANESE_ISO2022JP2	= 70043,	///< Japanese (ISO-2022-JP-2)
	CPEX_JAPANESE_EUC			= 70044,	///< Japanese (EUC)
	CPEX_JAPANESE_ISO2022JP2004				= 70045,	///< Japanese (ISO-2022-JP-2004)
	CPEX_JAPANESE_ISO2022JP2004_STRICT		= 70046,	///< Japanese (ISO-2022-JP-2004-strict)
	CPEX_JAPANESE_ISO2022JP2004_COMPATIBLE	= 70047,	///< Japanese (ISO-2022-JP-2004-compatible)
	CPEX_JAPANESE_ISO2022JP3			= 70048,	///< Japanese (ISO-2022-JP-3)
	CPEX_JAPANESE_ISO2022JP3_STRICT		= 70049,	///< Japanese (ISO-2022-JP-3-strict)
	CPEX_JAPANESE_ISO2022JP3_COMPATIBLE	= 70050,	///< Japanese (ISO-2022-JP-3-compatible)
	CPEX_JAPANESE_SHIFTJIS2004	= 70051,	///< Japanese (Shift_JIS-2004)
	CPEX_JAPANESE_EUCJIS2004	= 70052,	///< Japanese (EUC-JIS-2004)
	CPEX_MULTILINGUAL_ISO2022_7BIT		= 70060,	///< Multilingual (ISO-2022, 7-bit)
	CPEX_MULTILINGUAL_ISO2022_7BITSS2	= 70061,	///< Multilingual (ISO-2022, 7-bit, SS2)
	CPEX_MULTILINGUAL_ISO2022_7BITSISO	= 70062,	///< Multilingual (ISO-2022, 7-bit, SI/SO)
	CPEX_MULTILINGUAL_ISO2022_8BITSS2	= 70063,	///< Multilingual (ISO-2022, 8-bit, SS2)
	CPEX_UNCATEGORIZED_BINARY	= 70070,	///< Binary
	CPEX_UNCATEGORIZED_NEXTSTEP	= 70071,	///< NEXTSTEP
	CPEX_UNCATEGORIZED_ATARIST	= 70072,	///< Atari ST/TT
	CPEX_THAI_TIS620	= 70080,	///< Thai (TIS 620-2533:1990)
	CPEX_LAO_MULELAO	= 70090,	///< Lao (MuleLao)
	CPEX_LAO_CP1133		= 70091,	///< Lao (ibm-1133)
	CPEX_IRISH_IS434	= 70100,	///< Irish (I.S. 434:1999)
	CPEX_TAMIL_TAB		= 70110,	///< Tamil (TAB)
	CPEX_TAMIL_TAM		= 70111,	///< Tamil (TAM)
	CPEX_TAMIL_TSCII	= 70112,	///< Tamil (TSCII 1.7)
	CPEX_HINDI_MACINTOSH	= 70115,	///< Hindi (Macintosh, Devanagari)
	CPEX_GUJARATI_MACINTOSH	= 70116,	///< Gujarati (Macintosh)
	CPEX_PANJABI_MACINTOSH	= 70117,	///< Panjabi (Macintosh, Gurmukhi)
	CPEX_CYRILLIC_MACINTOSH							= 10007,	///< Cyrillic (Macintosh)
	CPEX_CYRILLIC_KOI8R								= 20866,	///< Russian (KOI8-R)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS3				= 70120,	///< Russian (Russian support for DOS 3)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS4ACADEMIC		= 70121,	///< Russian (academic Russian support for DOS 4)
	CPEX_CYRILLIC_RUSSIANSUPPORTFORDOS3NONACADEMIC	= 70122,	///< Russian (non-academic Russian support; NOTE(review): identifier says DOS 3, original comment said DOS 4 -- confirm)
	CPEX_CYRILLIC_SOVIETKOI8BASIC					= 70123,	///< Russian (Soviet KOI-8 basic)
	CPEX_CYRILLIC_SOVIETKOI8ALTERNATIVE				= 70124,	///< Russian (Soviet KOI-8 alternative)
	CPEX_CYRILLIC_SOVIETKOI7						= 70125,	///< Russian (Soviet KOI-7)
	CPEX_CYRILLIC_ECMA								= 70126,	///< Cyrillic (ISO-IR-111, ECMA)
	CPEX_CYRILLIC_KOI8RU							= 70127,	///< Cyrillic (KOI8-RU)
	CPEX_CYRILLIC_KOI8UNIFIED						= 70128,	///< Cyrillic (KOI8 unified)
	CPEX_ISO8859_1	= 28591,	///< Western European (ISO-8859-1)
	CPEX_ISO8859_2	= 28592,	///< Central European (ISO-8859-2)
	CPEX_ISO8859_3	= 28593,	///< Southern European (ISO-8859-3)
	CPEX_ISO8859_4	= 28594,	///< Baltic (ISO-8859-4)
	CPEX_ISO8859_5	= 28595,	///< Cyrillic (ISO-8859-5)
	CPEX_ISO8859_6	= 28596,	///< Arabic (ISO-8859-6)
	CPEX_ISO8859_7	= 28597,	///< Greek (ISO-8859-7)
	CPEX_ISO8859_8	= 28598,	///< Hebrew (ISO-8859-8)
	CPEX_ISO8859_9	= 28599,	///< Turkish (ISO-8859-9)
	CPEX_ISO8859_10	= 28600,	///< Nordic (ISO-8859-10)
	CPEX_ISO8859_11	= 28601,	///< Thai (ISO-8859-11)
	CPEX_ISO8859_13	= 28603,	///< Baltic (ISO-8859-13)
	CPEX_ISO8859_14	= 28604,	///< Celtic (ISO-8859-14)
	CPEX_ISO8859_15	= 28605,	///< Western European (ISO-8859-15)
	CPEX_ISO8859_16	= 28606;	///< South-Eastern European (ISO-8859-16)

// Byte order marks (U+FEFF serialized in each Unicode encoding form).
// Each array is initialized from a string literal, so it carries one extra
// trailing NUL. The UTF-32 marks contain embedded zero bytes: compare them
// with an explicit length (2, 4 or 3 bytes), never with strlen().
const uchar	UTF16LE_BOM[] = "\xFF\xFE";			///< UTF-16 little endian BOM (FF FE)
const uchar	UTF16BE_BOM[] = "\xFE\xFF";			///< UTF-16 big endian BOM (FE FF)
const uchar	UTF32LE_BOM[] = "\xFF\xFE\x00\x00";	///< UTF-32 little endian BOM (FF FE 00 00)
const uchar	UTF32BE_BOM[] = "\x00\x00\xFE\xFF";	///< UTF-32 big endian BOM (00 00 FE FF)
const uchar	UTF8_BOM[] = "\xEF\xBB\xBF";		///< UTF-8 BOM (EF BB BF)


// CEncoder class definition
/////////////////////////////////////////////////////////////////////////////

// Default character written when a character cannot be converted to a native
// encoding (original note: it might be better to obtain this from the user language).
#define __NATIVE_DEFAULT_CHARACTER	'?'

// Characters that stand for "not mapped in the character set".
// __REPLACEMENT_CHARACTER is U+FFFD; __RPCH is a short alias for it;
// __NA is the native-side (byte) marker, value 0x00.
const wchar_t	__REPLACEMENT_CHARACTER = 0xFFFD;
const wchar_t	__RPCH					= __REPLACEMENT_CHARACTER;
const uchar		__NA					= 0x00;

template<typename Ch> void __SetDefaultChar(Ch& ch);
template<> inline void __SetDefaultChar(char& ch) {ch = __NATIVE_DEFAULT_CHARACTER;}
template<> inline void __SetDefaultChar(uchar& ch) {ch = __NATIVE_DEFAULT_CHARACTER;}
template<> inline void __SetDefaultChar(ushort& ch) {ch = __NATIVE_DEFAULT_CHARACTER;}
template<> inline void __SetDefaultChar(wchar_t& ch) {ch = __REPLACEMENT_CHARACTER;}
template<> inline void __SetDefaultChar(ulong& ch) {ch = __REPLACEMENT_CHARACTER;}

/**
 *	Handles one unconvertable character inside a conversion method.
 *	Requires a variable named pCallback (IUnconvertableCharCallback*) in the
 *	enclosing scope. When pCallback is null or its
 *	OnFoundUnconvertableCharacter() returns true, the default character is
 *	stored into @a lhs via __SetDefaultChar and pCallback is set to null, so the
 *	callback is not consulted again. Otherwise the ENCLOSING FUNCTION returns 0.
 *	Expands to a braced block, not a single expression statement.
 */
#define CONFIRM_ILLEGAL_CHAR(lhs)											\
	{																		\
		if(pCallback == 0 || pCallback->OnFoundUnconvertableCharacter()) {	\
			__SetDefaultChar(lhs);											\
			pCallback = 0;													\
		} else																\
			return 0;														\
	}

/// Parameter list of CEncoder::ConvertFromUnicode, for use in overriding
/// declarations/definitions. Default arguments appear only as comments here;
/// the real defaults are on the CEncoder declaration.
#define CFU_ARGLIST											\
	uchar* pszDest, std::size_t cchDest,					\
	const wchar_t* pwszSrc, std::size_t cchSrc /* = -1 */,	\
	IUnconvertableCharCallback* pCallback /* = 0 */

/// Parameter list of CEncoder::ConvertToUnicode, for use in overriding
/// declarations/definitions. Default arguments appear only as comments here;
/// the real defaults are on the CEncoder declaration.
#define CTU_ARGLIST										\
	wchar_t* pwszDest, std::size_t cchDest,				\
	const uchar* pszSrc, std::size_t cchSrc /* = -1 */,	\
	IUnconvertableCharCallback* pCallback /* = 0 */

/**
 *	Argument check for ConvertFromUnicode implementations (parameters named as
 *	in CFU_ARGLIST). Asserts that both buffers are non-null and, when cchSrc is
 *	the "-1" sentinel, replaces it with the length of the NUL-terminated source.
 *	Wrapped in do/while so the whole check is ONE statement (safe after an
 *	unbraced if/else); use as `CFU_CHECKARGS();`.
 */
#define CFU_CHECKARGS()								\
	do {											\
		assert(pszDest != 0 && pwszSrc != 0);		\
		if(cchSrc == static_cast<std::size_t>(-1))	\
			cchSrc = wcslen(pwszSrc);				\
	} while(false)

/**
 *	Argument check for ConvertToUnicode implementations (parameters named as
 *	in CTU_ARGLIST). Asserts that both buffers are non-null and, when cchSrc is
 *	the "-1" sentinel, replaces it with the byte length of the NUL-terminated
 *	source. Wrapped in do/while so the whole check is ONE statement (safe after
 *	an unbraced if/else); use as `CTU_CHECKARGS();`.
 */
#define CTU_CHECKARGS()												\
	do {															\
		assert(pwszDest != 0 && pszSrc != 0);						\
		if(cchSrc == static_cast<std::size_t>(-1))					\
			cchSrc = strlen(reinterpret_cast<const char*>(pszSrc));	\
	} while(false)

/// Callback used to decide how an unconvertable character is handled.
interface IUnconvertableCharCallback {
	/// Destructor.
	virtual ~IUnconvertableCharCallback() {}
	/**
	 *	Called when a character cannot be converted to Unicode while reading a
	 *	file, or cannot be converted to the native encoding while saving.
	 *	The return value decides how that character is treated.
	 *	This method is called at most once per conversion operation.
	 *	@retval true	replace the unconvertable character with the default character and continue
	 *	@retval false	abort the read/save immediately (the conversion method returns 0)
	 */
	virtual bool OnFoundUnconvertableCharacter() = 0;
};

/// Abstract encoder: converts between UTF-16 and one native encoding.
class CEncoder : public Manah::CNoncopyable {
	// constructors
protected:
	CEncoder() {}
public:
	virtual ~CEncoder() {}

	// methods
public:
	/**
	 *	Converts from UTF-16 to the native encoding.
	 *	@param pszDest		[out] destination buffer
	 *	@param cchDest		length of the destination buffer
	 *	@param pwszSrc		source string
	 *	@param cchSrc		number of characters in the source (default -1)
	 *	@param pCallback	callback for handling unconvertable characters; may be null
	 *	@return				number of characters after conversion
	 */
	virtual std::size_t ConvertFromUnicode(
							uchar* pszDest, std::size_t cchDest,
							const wchar_t* pwszSrc, std::size_t cchSrc = -1,
							IUnconvertableCharCallback* pCallback = 0) = 0;
	/**
	 *	Converts from the native encoding to UTF-16.
	 *	@param pwszDest		[out] destination buffer
	 *	@param cchDest		number of characters in the destination buffer
	 *	@param pszSrc		source string
	 *	@param cchSrc		number of characters in the source (default -1)
	 *	@param pCallBack	callback for handling unconvertable characters; may be null
	 *	@return				number of characters after conversion
	 */
	virtual std::size_t ConvertToUnicode(
							wchar_t* pwszDest, std::size_t cchDest,
							const uchar* pszSrc, std::size_t cchSrc = -1,
							IUnconvertableCharCallback* pCallBack = 0) = 0;
	/// Returns the maximum number of bytes needed to convert one UCS character to the native encoding.
	virtual uchar GetMaxNativeCharLength() const = 0;
	/// Returns the maximum length needed to convert one native byte to UCS (in UTF-16 units).
	virtual uchar GetMaxUcsCharLength() const = 0;
};


/// Factory/registry of encoders and code page detectors, accessed through GetInstance().
class CEncoderFactory {
	// data types
public:
	/// Function that produces an encoder instance.
	typedef CEncoder*(*EncoderProducer)();
	/// Detection function: takes a byte buffer and its length, reports through the
	/// two reference parameters (the DEFINE_DETECTOR macro in this header names
	/// them cpResult and cchConvertable).
	typedef void(*CodePageDetector)(const uchar*, std::size_t, CodePage&, std::size_t&);

	// methods (those without an inline definition in this header are defined elsewhere)
public:
	CEncoder*				CreateEncoder(CodePage cp);
	CodePage				DetectCodePage(const uchar* psz, std::size_t cch, CodePage cp);
	void					EnumCodePages(std::set<CodePage>& codePages) const;
	static CEncoderFactory&	GetInstance();
	CodePageDetector		GetUnicodeDetector() const;
	bool					IsCodePageForAutoDetection(CodePage cp) const;
	bool					IsCodePageForReadOnly(CodePage cp) const;
	bool					IsValidCodePage(CodePage cp) const;

	// registration; each returns whether the entry was newly inserted
	bool	RegisterCodePageForReadOnly(CodePage cp);
	bool	RegisterDetector(CodePage cp, CodePageDetector factoryMethod);
	bool	RegisterEncoder(CodePage cp, EncoderProducer factoryMethod);

	// data members
private:
	typedef std::map<CodePage, EncoderProducer>		EncoderMap;
	typedef std::map<CodePage, CodePageDetector>	DetectorMap;
	EncoderMap			m_registeredEncoders;	// code page -> encoder producer
	DetectorMap			m_registeredDetectors;	// code page -> detector function
	std::set<CodePage>	m_codePagesForReadOnly;	// code pages registered as read-only
};


/// Opens the unnamed namespace that wraps encoder definitions.
#define BEGIN_ENCODER_DEFINITION()	namespace {

/// Closes the unnamed namespace opened by BEGIN_ENCODER_DEFINITION().
#define END_ENCODER_DEFINITION()	}

/**
 *	Declares class CEncoder_<name> derived from CEncoder (private constructor,
 *	the four overrides, and a static Create() that returns `new CEncoder_<name>`),
 *	then defines `const bool b<name>` whose initializer registers Create with
 *	CEncoderFactory for code page @a cp. The constructor and the overrides are
 *	only declared here; the user of the macro must define them.
 */
#define DEFINE_ENCODER_CLASS_(cp, name)								\
	class CEncoder_##name : public CEncoder {						\
	private:														\
		CEncoder_##name();											\
	public:															\
		std::size_t	ConvertFromUnicode(CFU_ARGLIST);				\
		std::size_t	ConvertToUnicode(CTU_ARGLIST);					\
		uchar		GetMaxNativeCharLength() const;					\
		uchar		GetMaxUcsCharLength() const;					\
		static CEncoder*	Create() {return new CEncoder_##name;}	\
	};																\
	const bool b##name =											\
		CEncoderFactory::GetInstance().RegisterEncoder(cp, &CEncoder_##name::Create);

/**
 *	Same as DEFINE_ENCODER_CLASS_, and additionally defines an empty
 *	constructor, GetMaxNativeCharLength() returning @a cch and
 *	GetMaxUcsCharLength() returning @a ccp. The two Convert* methods still
 *	have to be defined by the user of the macro.
 */
#define DEFINE_ENCODER_CLASS(cp, name, cch, ccp)						\
	DEFINE_ENCODER_CLASS_(cp, name)										\
	CEncoder_##name::CEncoder_##name() {}								\
	uchar CEncoder_##name::GetMaxNativeCharLength() const {return cch;}	\
	uchar CEncoder_##name::GetMaxUcsCharLength() const {return ccp;}

/**
 *	Inside an unnamed namespace, declares the detector function
 *	DetectCodePage_<name> and defines `const bool b<name>` whose initializer
 *	registers that function with CEncoderFactory for code page @a cp.
 *	The function body must be defined by the user of the macro.
 */
#define DEFINE_DETECTOR(cp, name)														\
	namespace {																			\
		void DetectCodePage_##name(const uchar* psz,									\
			std::size_t cch, CodePage& cpResult, std::size_t& cchConvertable);			\
		const bool b##name =															\
			CEncoderFactory::GetInstance().RegisterDetector(cp, &DetectCodePage_##name);\
	}

/// Defines `const bool b<cp>` whose initializer registers @a cp as a read-only
/// code page. @a cp is token-pasted into the variable name, so it must be a
/// single token that forms a valid identifier suffix. No trailing semicolon is
/// included; the caller supplies it.
#define REGISTER_READONLY_CODEPAGE(cp)	\
	const bool	b##cp = CEncoderFactory::GetInstance().RegisterCodePageForReadOnly(cp)


// Windows ϊe[û܂܎gpGR[_
class CWindowsEncoder : public CEncoder {
private:
	CWindowsEncoder(CodePage cp) : m_codePage(cp) {
		if(!toBoolean(::IsValidCodePage(cp)))
			throw std::invalid_argument("Specified code page is not supported.");
	}
public:
	std::size_t ConvertFromUnicode(CFU_ARGLIST) {
		if(const int result = ::WideCharToMultiByte(m_codePage, 0,
				pwszSrc, cchSrc, reinterpret_cast<char*>(pszDest), cchDest, 0, 0))
			return result;
		return (pCallback == 0 || pCallback->OnFoundUnconvertableCharacter()) ?
			::WideCharToMultiByte(m_codePage, WC_DEFAULTCHAR, pwszSrc, cchSrc, reinterpret_cast<char*>(pszDest), cchDest, 0, 0) : 0;
	}
	std::size_t ConvertToUnicode(CTU_ARGLIST) {
		if(const int result = ::MultiByteToWideChar(m_codePage,
				MB_ERR_INVALID_CHARS, reinterpret_cast<const char*>(pszSrc), cchSrc, pwszDest, cchDest))
			return result;
		return (pCallback == 0 || pCallback->OnFoundUnconvertableCharacter()) ?
			::MultiByteToWideChar(m_codePage, 0, reinterpret_cast<const char*>(pszSrc), cchSrc, pwszDest, cchDest) : 0;
	}
	uchar GetMaxNativeCharLength() const {
		CPINFO	cpi;
		return toBoolean(::GetCPInfo(m_codePage, &cpi)) ? cpi.MaxCharSize : 0;
	}
	uchar GetMaxUcsCharLength() const {
		return 1;
	}
	friend class CEncoderFactory;
private:
	const CodePage	m_codePage;
};



/**
 *	Returns the sole factory instance.
 *	@return	reference to a function-local static CEncoderFactory
 */
inline CEncoderFactory& CEncoderFactory::GetInstance() {
	static CEncoderFactory	theFactory;
	return theFactory;
}

/**
 *	Returns the detector registered for CPEX_UNICODE_AUTODETECT.
 *	@return	the detector function, or null if none is registered
 */
inline CEncoderFactory::CodePageDetector CEncoderFactory::GetUnicodeDetector() const {
	const DetectorMap::const_iterator	found = m_registeredDetectors.find(CPEX_UNICODE_AUTODETECT);
	if(found == m_registeredDetectors.end())
		return 0;
	return found->second;
}

/**
 *	Tells whether @a cp is a code page used for auto-detection.
 *	@return	true if a detector is registered for @a cp
 */
inline bool CEncoderFactory::IsCodePageForAutoDetection(CodePage cp) const {
	return m_registeredDetectors.count(cp) != 0;
}

/**
 *	Tells whether @a cp supports only conversion from the native encoding to UCS.
 *	@return	true if @a cp was registered through RegisterCodePageForReadOnly
 */
inline bool CEncoderFactory::IsCodePageForReadOnly(CodePage cp) const {
	return m_codePagesForReadOnly.count(cp) != 0;
}

/**
 *	Tells whether @a cp is a usable code page.
 *	@return	true if Windows reports the code page as valid, or a detector or an
 *			encoder is registered for it
 */
inline bool CEncoderFactory::IsValidCodePage(CodePage cp) const {
	if(toBoolean(::IsValidCodePage(cp)))
		return true;
	if(IsCodePageForAutoDetection(cp))
		return true;
	return m_registeredEncoders.count(cp) != 0;
}

/**
 *	Registers a code page that supports only conversion from the native
 *	encoding to UCS.
 *	@param cp	code page
 *	@return		true if newly registered, false if @a cp was already registered
 */
inline bool CEncoderFactory::RegisterCodePageForReadOnly(CodePage cp) {
	const std::pair<std::set<CodePage>::iterator, bool>	outcome = m_codePagesForReadOnly.insert(cp);
	return outcome.second;
}

/**
 *	Registers a code page detector.
 *	@param cp				code page
 *	@param factoryMethod	function that performs the detection; must not be null
 *	@return					true if newly registered, false if a detector for
 *							@a cp already exists (the existing one is kept)
 */
inline bool CEncoderFactory::RegisterDetector(CodePage cp, CodePageDetector factoryMethod) {
	assert(factoryMethod != 0);
	const DetectorMap::value_type	entry(cp, factoryMethod);
	return m_registeredDetectors.insert(entry).second;
}

/**
 *	Registers an encoder.
 *	@param cp				code page
 *	@param factoryMethod	function that creates the encoder; must not be null
 *	@return					true if newly registered, false if an encoder for
 *							@a cp already exists (the existing one is kept)
 */
inline bool CEncoderFactory::RegisterEncoder(CodePage cp, EncoderProducer factoryMethod) {
	assert(factoryMethod != 0);
	const EncoderMap::value_type	entry(cp, factoryMethod);
	return m_registeredEncoders.insert(entry).second;
}

} // namespace Encodings
} // namespace Ascension

#endif /* _ENCODER_H_ */

/* [EOF] */