Haskellで文字列を EUC-JP ⇔Shift_JIS 変換する。 - sirocco の書いてもすぐに忘れるメモ

とほほのWWW入門/ 漢字コードについて(各コード間の変換アルゴリズム)を参考にEUC-JPからShift-JISへ変換します。
ASCII
このアルゴリズムは全角文字のEUC-JP ⇔Shift_JIS 変換です。ASCIIについては変化ありませんのでそのままリストに追加していきます。
半角カナの処理
「ミケネコ研究所 / 文字コードの部屋 / 半角カナ」に半角カナの割り当て状況があります。

Shift_JISにおいて0xA1-0xDFの範囲の1バイト文字が半角カナです。
EUC-JPにおいては 0x8EA1-0x8EDFの範囲に半角カナが割り当てられています。

EUC-JP|  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
------+------------------------------------------------
8E A  |    。「  」 、 ・ ヲ ァ ィ ゥ ェ ォ ャ ュ ョ ッ
8E B  | ー ア イ ウ エ オ カ キ ク ケ コ サ シ ス セ ソ
8E C  | タ チ ツ テ ト ナ ニ ヌ ネ ノ ハ ヒ フ ヘ ホ マ
8E D  | ミ ム メ モ ヤ ユ ヨ ラ リ ル レ ロ ワ ン ゛ ゜

EUC-JPの半角カナは2バイト、Shift-JISは1バイトです。EUC-JPの1バイト目が8Eのときは半角カナですから8Eを読み捨て、次の1バイトを変換せずにそのまま変換スミとしてリストに追加していきます。
その逆にShift-JISから変換するときは1バイト文字から2バイト文字に変わります。半角カナに該当する文字は第1バイトを0x8Eとし、第2バイトにそのまま追加します。

import Data.Char

-- ====================================================================
eucToSjis :: String -> String
eucToSjis (x:xs) | n == 0x8E = case xs of         -- 半角カナ
                                 y:ys -> y:eucToSjis ys
                                 _    -> error "0x8E(conv error)"
                 | n <= 0xA0 =  x:eucToSjis xs    -- ASCII
                 | otherwise     = case xs of
                                     y:ys -> let (c1,c2) = jisToSjis(eucToJis (x,y)) in c1:c2:eucToSjis ys
                                     _    -> error "(conv error)"
    where
      n = ord x
      eucToJis ::(Char,Char) -> (Int,Int)
      eucToJis (c1,c2) = ((ord c1) - 0x80,(ord c2) - 0x80)
      
      jisToSjis ::(Int,Int)-> (Char,Char)
      jisToSjis (c1,c2) = let (o1,o2) = if c1 `mod` 2 == 1 then (((c1 + 1) `div` 2) + 0x70, c2 + 0x1f)
                                                           else ((c1 `div` 2) + 0x70,       c2 + 0x7d)
                          in (chr (if o1 >= 0xa0 then o1 + 0x40 else o1), chr (if o2 >= 0x7f then o2 + 1 else o2))
eucToSjis "" = ""
-- ====================================================================
sjisToEuc :: String -> String
sjisToEuc (x:xs) | n <= 0x7F              = x:sjisToEuc xs             -- ASCII
                 | n >= 0xA0 && n <= 0xDF = (chr 0x8E):x:sjisToEuc xs  -- 半角カナ
                 | otherwise     = case xs of
                                     y:ys -> let (c1,c2) = jisToEuc(sjisToJis (x,y)) in c1:c2:sjisToEuc ys
                                     _    -> error "(conv error)"
    where
      n = ord x
      jisToEuc ::(Int,Int) -> (Char,Char)
      jisToEuc (c1,c2) = (chr (c1 + 0x80), chr (c2 + 0x80))

      sjisToJis ::(Char,Char)-> (Int,Int)
      sjisToJis (c1,c2) = let o1 = if n1 >= 0xe0 then n1 - 0x40 else n1
                              o2 = if n2 >= 0x80 then n2 - 1    else n2
                          in  if o1 >= 0x9e then ((o1 - 0x70) * 2,      o2 - 0x7d)
                                            else (((o1 - 0x70) * 2) - 1,o2 - 0x1f)
          where
            n1 = ord c1
            n2 = ord c2

sjisToEuc "" = ""

EUCコードには第3バイトまで用いられる補助漢字（JIS X 0212-1990）がありますが、Shift-JISでは表現することが出来ませんので省略してあります。

Haskell の文字列に含まれる全角英数字を半角に、半角英数字を全角に変換する。

2013/07/29 Data.Text バージョンを追記。

import Data.Char
import qualified Data.Text   as T

-- ====================================================================
textEucToSjis :: T.Text -> T.Text
textEucToSjis text = 
  if T.null text then "" else eucToSjis (T.head text) (T.tail text)
    where
     eucToSjis :: Char -> T.Text -> T.Text
     eucToSjis x xs | n == 0x8E = if T.null xs                -- 半角カナ
                                      then error "0x8E(conv error)" 
                                      else T.cons (T.head xs) (textEucToSjis (T.tail xs))
                    | n <= 0xA0 = x `T.cons` (textEucToSjis xs) -- ASCII
                    | otherwise = if T.null xs
                                    then error "(conv error)"
                                    else let (c1,c2) = jisToSjis(eucToJis (x,T.head xs))
                                         in  c1 `T.cons` (c2 `T.cons` (textEucToSjis (T.tail xs)))
      where
        n = ord x
        eucToJis ::(Char,Char) -> (Int,Int)
        eucToJis (c1,c2) = ((ord c1) - 0x80,(ord c2) - 0x80)
        
        jisToSjis ::(Int,Int)-> (Char,Char)
        jisToSjis (c1,c2) = let (o1,o2) = if c1 `mod` 2 == 1 then (((c1 + 1) `div` 2) + 0x70, c2 + 0x1f)
                                                             else ((c1 `div` 2) + 0x70,       c2 + 0x7d)
                            in (chr (if o1 >= 0xa0 then o1 + 0x40 else o1), chr (if o2 >= 0x7f then o2 + 1 else o2))

-- ====================================================================
textSjisToEuc :: T.Text -> T.Text
textSjisToEuc text =
  if T.null text then "" else sjisToEuc (T.head text) (T.tail text)
    where
     sjisToEuc :: Char -> T.Text -> T.Text
     sjisToEuc x xs | n <= 0x7F              = x `T.cons` textSjisToEuc xs -- ASCII
                    | n >= 0xA0 && n <= 0xDF = (chr 0x8E) `T.cons` x `T.cons` textSjisToEuc xs  -- 半角カナ
                    | otherwise = if T.null xs
                                    then error "(conv error)"
                                    else let (c1,c2) = jisToEuc(sjisToJis (x,T.head xs)) 
                                         in  c1 `T.cons` c2 `T.cons` (textSjisToEuc (T.tail xs))
        where
          n = ord x
          jisToEuc ::(Int,Int) -> (Char,Char)
          jisToEuc (c1,c2) = (chr (c1 + 0x80), chr (c2 + 0x80))
          sjisToJis ::(Char,Char)-> (Int,Int)
          sjisToJis (c1,c2) = let o1 = if n1 >= 0xe0 then n1 - 0x40 else n1
                                  o2 = if n2 >= 0x80 then n2 - 1    else n2
                              in  if o1 >= 0x9e then ((o1 - 0x70) * 2,      o2 - 0x7d)
                                                else (((o1 - 0x70) * 2) - 1,o2 - 0x1f)
              where
                n1 = ord c1
                n2 = ord c2

2013/11/29 Data.ByteString.Char8 バージョンを追記。

{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}

import Data.Char
import Data.Word
import Foreign.Ptr
import Foreign.C.String
import Foreign.C.Types
import Foreign.Marshal.Alloc
import System.IO.Unsafe      (unsafePerformIO)
import qualified Data.ByteString.Char8 as B

data BSTR

type BSTRobj    = (Ptr BSTR)

#define CP_ACP 0

-- ====================================================================
bstrEucToSjis :: B.ByteString -> B.ByteString
bstrEucToSjis bstr = 
  if B.null bstr then "" else eucToSjis (B.head bstr) (B.tail bstr)
    where
     eucToSjis :: Char -> B.ByteString -> B.ByteString
     eucToSjis x xs | n == 0x8E = if B.null xs                -- 半角カナ
                                      then error "0x8E(conv error)" 
                                      else B.cons (B.head xs) (bstrEucToSjis (B.tail xs))
                    | n <= 0xA0 = x `B.cons` (bstrEucToSjis xs) -- ASCII
                    | otherwise = if B.null xs
                                    then error "(conv error)"
                                    else let (c1,c2) = jisToSjis(eucToJis (x,B.head xs))
                                         in  c1 `B.cons` (c2 `B.cons` (bstrEucToSjis (B.tail xs)))
      where
        n = ord x
        eucToJis ::(Char,Char) -> (Word8,Word8)
        eucToJis (c1,c2) = (fromIntegral ((ord c1) - 0x80),fromIntegral((ord c2) - 0x80))

        jisToSjis ::(Word8,Word8)-> (Char,Char)
        jisToSjis (c1,c2) = (chr (if o1 >= 0xa0 then o1 + 0x40 else o1), 
                             chr (if o2 >= 0x7f then o2 + 1    else o2))
          where
            (o1,o2) = if c1 `mod` 2 == 1 then (fromIntegral(((c1 + 1) `div` 2) + 0x70), fromIntegral(c2 + 0x1f))
                                         else (fromIntegral((c1 `div` 2) + 0x70),fromIntegral(c2 + 0x7d))

-- ====================================================================
bstrSjisToEuc :: B.ByteString -> B.ByteString
bstrSjisToEuc bstr =
  if B.null bstr then "" else sjisToEuc (B.head bstr) (B.tail bstr)
    where
     sjisToEuc :: Char -> B.ByteString -> B.ByteString
     sjisToEuc x xs | n <= 0x7F              = x `B.cons` bstrSjisToEuc xs -- ASCII
                    | n >= 0xA0 && n <= 0xDF = (chr 0x8E) `B.cons` x `B.cons` bstrSjisToEuc xs  -- 半角カナ
                    | otherwise = if B.null xs
                                    then error "(conv error)"
                                    else let (c1,c2) = jisToEuc(sjisToJis (x,B.head xs)) 
                                         in  c1 `B.cons` c2 `B.cons` (bstrSjisToEuc (B.tail xs))
        where
          n = ord x
          jisToEuc ::(Int,Int) -> (Char,Char)
          jisToEuc (c1,c2) = (chr (c1 + 0x80), chr (c2 + 0x80))
          sjisToJis ::(Char,Char)-> (Int,Int)
          sjisToJis (c1,c2) = let o1 = if n1 >= 0xe0 then n1 - 0x40 else n1
                                  o2 = if n2 >= 0x80 then n2 - 1    else n2
                              in  if o1 >= 0x9e then ((o1 - 0x70) * 2,      o2 - 0x7d)
                                                else (((o1 - 0x70) * 2) - 1,o2 - 0x1f)
              where
                n1 = ord c1
                n2 = ord c2


sjisToUtf16 :: B.ByteString -> String
sjisToUtf16 = unsafePerformIO.shiftJisToUtf16'

foreign import stdcall "windows.h SysAllocStringLen" cSysAllocStringLen  :: BSTRobj -> CInt -> IO (Ptr CWchar)
-- http://msdn.microsoft.com/ja-jp/library/cc448053.aspx
foreign import stdcall "windows.h MultiByteToWideChar" cMultiByteToWideChar
        :: CUInt      ->  --    UINT CodePage,         // コードページ
           CULong     ->  --    DWORD dwFlags,         // 文字の種類を指定するフラグ
           CString    ->  --    LPCSTR lpMultiByteStr, // マップ元文字列のアドレス
           CInt       ->  --    int cchMultiByte,      // マップ元文字列のバイト数
           Ptr CWchar ->  --    LPWSTR lpWideCharStr,  // マップ先ワイド文字列を入れるバッファのアドレス
           CInt       ->  --    int cchWideChar        // バッファのサイズ
           IO CInt

foreign import stdcall "oleauto.h SysFreeString" cSysFreeString   :: CWString -> IO ()
foreign import ccall "string.h strlen"  cstrlen   :: CString  -> IO CInt

-- Shift-JISからUtf16への変換
shiftJisToUtf16' :: B.ByteString -> IO String
shiftJisToUtf16' byteString = do
    cwstring <- B.useAsCString byteString cstringToBSTR
    str      <- peekCWString cwstring
    cSysFreeString cwstring
    return str
  where
    cstringToBSTR :: CString -> IO CWString
    cstringToBSTR cstring = do
        cstringlen <- cstrlen cstring
        out_size   <- cMultiByteToWideChar CP_ACP 0 cstring cstringlen nullPtr 0
        wstr       <- cSysAllocStringLen nullPtr out_size
        _          <- cMultiByteToWideChar CP_ACP 0 cstring cstringlen wstr out_size
        return wstr


utf16ToSjis :: String -> B.ByteString
utf16ToSjis = unsafePerformIO.utf16ToShiftJis'


utf16ToShiftJis' :: String -> IO B.ByteString
utf16ToShiftJis' string = do
    cstring <- withCWString string bstrToCString
    byteS   <- B.packCString cstring
    free cstring
    return byteS
  where
    bstrToCString :: Ptr CWchar -> IO CString
    bstrToCString bstr = do
        out_size <- cWideCharToMultiByte CP_ACP 0 bstr (-1) nullPtr 0 nullPtr nullPtr
        cstring  <- mallocBytes ((fromIntegral out_size) + 1):: IO (Ptr CChar)
        _        <- cWideCharToMultiByte CP_ACP 0 bstr (-1) cstring out_size nullPtr nullPtr
        return cstring
-- http://msdn.microsoft.com/ja-jp/library/cc448089.aspx
foreign import stdcall "windows.h WideCharToMultiByte" cWideCharToMultiByte
        :: CUInt      ->  --  UINT CodePage,           // コードページ
           CULong     ->  --  DWORD dwFlags,           // 処理速度とマッピング方法を決定するフラグ
           Ptr CWchar ->  --  LPCWSTR lpWideCharStr,   // ワイド文字列のアドレス
           CInt       ->  --  int cchWideChar,         // ワイド文字列の文字数
           CString    ->  --  LPSTR lpMultiByteStr,    // 新しい文字列を受け取るバッファのアドレス
           CInt       ->  --  int cchMultiByte,        // 新しい文字列を受け取るバッファのサイズ
           CString    ->  --  LPCSTR lpDefaultChar,    // マップできない文字の既定値のアドレス
           Ptr Int    ->  --  LPBOOL lpUsedDefaultChar //既定の文字を使ったときにセットするフラグのアドレス
           IO CInt

main :: IO()
main = do
  let str   = "こんにちは、世界。"
      bsjis = utf16ToSjis str
  B.putStrLn bsjis                                -- => こんにちは、世界。
  putStrLn   (B.unpack bsjis)                     -- => こんにちは、世界。
  B.putStrLn (utf16ToSjis (sjisToUtf16 bsjis))    -- => こんにちは、世界。
  B.putStrLn (bstrEucToSjis(bstrSjisToEuc bsjis)) -- => こんにちは、世界。