|
1 | 1 | {-# LANGUAGE NoImplicitPrelude
|
2 | 2 | , BangPatterns
|
3 | 3 | , TypeApplications
|
| 4 | + , MultiWayIf |
4 | 5 | #-}
|
5 | 6 | {-# OPTIONS_GHC -funbox-strict-fields #-}
|
6 | 7 |
|
@@ -114,6 +115,113 @@ ucs2le_encode
|
114 | 115 | in
|
115 | 116 | loop ir0 ow0
|
116 | 117 |
|
| 118 | +-- ----------------------------------------------------------------------------- |
| 119 | +-- UTF-16b |
| 120 | +-- |
| 121 | + |
| 122 | +-- | Mimics the base encoding for filesystem operations. This should be total on all inputs (word16 byte arrays). |
| 123 | +-- |
| 124 | +-- Note that this has a subtle difference to 'encodeWithBaseWindows'/'decodeWithBaseWindows': it doesn't care for |
| 125 | +-- the @0x0000@ end marker and will as such produce different results. Use @takeWhile (/= '\NUL')@ on the input |
| 126 | +-- to recover this behavior. |
| 127 | +utf16le_b :: TextEncoding |
| 128 | +utf16le_b = mkUTF16le_b ErrorOnCodingFailure |
| 129 | + |
| 130 | +mkUTF16le_b :: CodingFailureMode -> TextEncoding |
| 131 | +mkUTF16le_b cfm = TextEncoding { textEncodingName = "UTF-16LE_b", |
| 132 | + mkTextDecoder = utf16le_b_DF cfm, |
| 133 | + mkTextEncoder = utf16le_b_EF cfm } |
| 134 | + |
| 135 | +utf16le_b_DF :: CodingFailureMode -> IO (TextDecoder ()) |
| 136 | +utf16le_b_DF cfm = |
| 137 | + return (BufferCodec { |
| 138 | + encode = utf16le_b_decode, |
| 139 | + recover = recoverDecode cfm, |
| 140 | + close = return (), |
| 141 | + getState = return (), |
| 142 | + setState = const $ return () |
| 143 | + }) |
| 144 | + |
| 145 | +utf16le_b_EF :: CodingFailureMode -> IO (TextEncoder ()) |
| 146 | +utf16le_b_EF cfm = |
| 147 | + return (BufferCodec { |
| 148 | + encode = utf16le_b_encode, |
| 149 | + recover = recoverEncode cfm, |
| 150 | + close = return (), |
| 151 | + getState = return (), |
| 152 | + setState = const $ return () |
| 153 | + }) |
| 154 | + |
| 155 | + |
| 156 | +utf16le_b_decode :: DecodeBuffer |
| 157 | +utf16le_b_decode |
| 158 | + input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } |
| 159 | + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } |
| 160 | + = let |
| 161 | + loop !ir !ow |
| 162 | + | ow >= os = done OutputUnderflow ir ow |
| 163 | + | ir >= iw = done InputUnderflow ir ow |
| 164 | + | ir + 1 == iw = done InputUnderflow ir ow |
| 165 | + | otherwise = do |
| 166 | + c0 <- readWord8Buf iraw ir |
| 167 | + c1 <- readWord8Buf iraw (ir+1) |
| 168 | + let x1 = fromIntegral c1 `shiftL` 8 + fromIntegral c0 |
| 169 | + if | iw - ir >= 4 -> do |
| 170 | + c2 <- readWord8Buf iraw (ir+2) |
| 171 | + c3 <- readWord8Buf iraw (ir+3) |
| 172 | + let x2 = fromIntegral c3 `shiftL` 8 + fromIntegral c2 |
| 173 | + if | 0xd800 <= x1 && x1 <= 0xdbff |
| 174 | + , 0xdc00 <= x2 && x2 <= 0xdfff -> do |
| 175 | + ow' <- writeCharBuf oraw ow (unsafeChr ((x1 - 0xd800)*0x400 + (x2 - 0xdc00) + 0x10000)) |
| 176 | + loop (ir+4) ow' |
| 177 | + | otherwise -> do |
| 178 | + ow' <- writeCharBuf oraw ow (unsafeChr x1) |
| 179 | + loop (ir+2) ow' |
| 180 | + | iw - ir >= 2 -> do |
| 181 | + ow' <- writeCharBuf oraw ow (unsafeChr x1) |
| 182 | + loop (ir+2) ow' |
| 183 | + | otherwise -> done InputUnderflow ir ow |
| 184 | + |
| 185 | + -- lambda-lifted, to avoid thunks being built in the inner-loop: |
| 186 | + done why !ir !ow = return (why, |
| 187 | + if ir == iw then input{ bufL=0, bufR=0 } |
| 188 | + else input{ bufL=ir }, |
| 189 | + output{ bufR=ow }) |
| 190 | + in |
| 191 | + loop ir0 ow0 |
| 192 | + |
| 193 | + |
| 194 | +utf16le_b_encode :: EncodeBuffer |
| 195 | +utf16le_b_encode |
| 196 | + input@Buffer{ bufRaw=iraw, bufL=ir0, bufR=iw, bufSize=_ } |
| 197 | + output@Buffer{ bufRaw=oraw, bufL=_, bufR=ow0, bufSize=os } |
| 198 | + = let |
| 199 | + done why !ir !ow = return (why, |
| 200 | + if ir == iw then input{ bufL=0, bufR=0 } |
| 201 | + else input{ bufL=ir }, |
| 202 | + output{ bufR=ow }) |
| 203 | + loop !ir !ow |
| 204 | + | ir >= iw = done InputUnderflow ir ow |
| 205 | + | os - ow < 2 = done OutputUnderflow ir ow |
| 206 | + | otherwise = do |
| 207 | + (c,ir') <- readCharBuf iraw ir |
| 208 | + case ord c of |
| 209 | + x | x < 0x10000 -> do |
| 210 | + writeWord8Buf oraw ow (fromIntegral x) |
| 211 | + writeWord8Buf oraw (ow+1) (fromIntegral (x `shiftR` 8)) |
| 212 | + loop ir' (ow+2) |
| 213 | + | otherwise -> |
| 214 | + if os - ow < 4 then done OutputUnderflow ir ow else do |
| 215 | + let x' = x - 0x10000 |
| 216 | + w1 = x' `div` 0x400 + 0xd800 |
| 217 | + w2 = x' `mod` 0x400 + 0xdc00 |
| 218 | + writeWord8Buf oraw ow (fromIntegral w1) |
| 219 | + writeWord8Buf oraw (ow+1) (fromIntegral (w1 `shiftR` 8)) |
| 220 | + writeWord8Buf oraw (ow+2) (fromIntegral w2) |
| 221 | + writeWord8Buf oraw (ow+3) (fromIntegral (w2 `shiftR` 8)) |
| 222 | + loop ir' (ow+4) |
| 223 | + in |
| 224 | + loop ir0 ow0 |
117 | 225 |
|
118 | 226 | -- -----------------------------------------------------------------------------
|
119 | 227 | -- Windows encoding (ripped off from base)
|
|
0 commit comments