#ifndef __HADDOCK__
-- $Id: Ucs.hs,v 1.3 2007/01/02 09:48:38 ha-tan Exp $
#endif
module Cinnamon.Ucs (
  ucs4CharToUtf8Chars, ucs4ToUtf8,
  utf8CharsToUcs4Char, utf8ToUcs4
) where

import Data.Bits ((.&.), (.|.), shiftL, shiftR)
import Data.Char (chr, ord)

{-
  参照: <http://www.ietf.org/rfc/rfc2279.txt>

  UCS-4 range (hex.)    UTF-8 octet sequence (binary)
  0000 0000-0000 007F   0xxxxxxx
  0000 0080-0000 07FF   110xxxxx 10xxxxxx
  0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx

  0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  0400 0000-7FFF FFFF   1111110x 10xxxxxx ... 10xxxxxx
-}

-- | UCS4の文字をUTF8の文字に変換します。
-- UTF8の文字は最大Char6文字分で表現されます。
ucs4CharToUtf8Chars :: Char -> [Char]
ucs4CharToUtf8Chars c
  | c' <= 0x0000007f = map chr $ marks 0x00 $ masks 1
  | c' <= 0x000007ff = map chr $ marks 0xc0 $ masks 2
  | c' <= 0x0000ffff = map chr $ marks 0xe0 $ masks 3
  | c' <= 0x1fffffff = map chr $ marks 0xf0 $ masks 4
  | c' <= 0x3fffffff = map chr $ marks 0xf8 $ masks 5
  | c' <= 0x7fffffff = map chr $ marks 0xfc $ masks 6
  | otherwise        = error "ucs4CharToUtf8Char: out of range."
  where
    c' :: Int
    c' = ord c

    mask :: Int -> Int
    mask n = (c' `shiftR` (n * 6)) .&. 0x3f

    masks :: Int -> [Int]
    masks n = map mask $ reverse $ [0 .. n - 1]

    marks :: Int -> [Int] -> [Int]
    marks m0 = zipWith (\ m c -> m .|. c) (m0 : repeat 0x80)

-- | UCS4の文字列をUTF8の文字列に変換します。
ucs4ToUtf8 :: String -> String
ucs4ToUtf8 = concatMap ucs4CharToUtf8Chars

-- | UTF8の文字をUCS4の文字に変換します。
-- UTF8の文字は最大Char6文字分で表現されます。
utf8CharsToUcs4Char :: Int -> [Char] -> Char
utf8CharsToUcs4Char n cs
  | n == 1 = chr $ unmasks 1 $ unmarks 0x7f $ map ord cs
  | n == 2 = chr $ unmasks 2 $ unmarks 0x1f $ map ord cs
  | n == 3 = chr $ unmasks 3 $ unmarks 0x0f $ map ord cs
  | n == 4 = chr $ unmasks 4 $ unmarks 0x07 $ map ord cs
  | n == 5 = chr $ unmasks 5 $ unmarks 0x03 $ map ord cs
  | n == 6 = chr $ unmasks 6 $ unmarks 0x01 $ map ord cs
  | otherwise = error "utf8CharsToUcs4Char: internal error."
  where
    unmask :: (Int, Int) -> Int -> Int
    unmask (n, x) a = (x `shiftL` (n * 6)) .|. a
    
    unmasks :: Int -> [Int] -> Int
    unmasks n xs = foldr unmask 0 $ zip (reverse $ [0 .. n - 1]) xs
    
    unmarks :: Int -> [Int] -> [Int]
    unmarks m0 = zipWith (\ m c -> m .&. c) (m0 : repeat 0x3f)

-- | UTF8の文字列をUCS4の文字列に変換します。
utf8ToUcs4 :: String -> String
utf8ToUcs4 [] = []
utf8ToUcs4 s @ (c : cs)
  | c' <= 0x7f = f 1
  | c' <= 0xdf = f 2
  | c' <= 0xef = f 3
  | c' <= 0xf7 = f 4
  | c' <= 0xfb = f 5
  | c' <= 0xfe = f 6
  | otherwise   = error "utf84ToUcs4: out of range."
  where
    c' :: Int
    c' = ord c

    f :: Int -> String
    f n =
      if (length s1 /= n)
        then error "utf84ToUcs4: utf8 string too short."
        else utf8CharsToUcs4Char n s1 : utf8ToUcs4 s2
      where
        (s1, s2) = splitAt n s
