|
| 1 | +-- | |
| 2 | +-- Module : Unicode.Char |
| 3 | +-- Copyright : (c) 2024 Composewell Technologies and Contributors |
| 4 | +-- License : Apache-2.0 |
| 5 | + |
| 6 | +-- Stability : experimental |
| 7 | + |
| 8 | +module Unicode.Internal.Char.Label |
| 9 | + ( label |
| 10 | + , addHexCodePoint |
| 11 | + , intToDigiT |
| 12 | + ) where |
| 13 | + |
| 14 | +import Data.Char (ord) |
| 15 | +import Data.Functor (($>)) |
| 16 | +import Foreign.C.String (CString, CStringLen) |
| 17 | +import Foreign.C.Types (CChar (..)) |
| 18 | +import Foreign.Marshal (allocaArray, copyArray) |
| 19 | +import Foreign.Storable (Storable (..)) |
| 20 | +import GHC.Exts (Int (..), Int#, Ptr (..), isTrue#, quotRemInt#, (+#), (-#), (<=#)) |
| 21 | +import Unicode.Char.General (CodePointType (..), codePointType) |
| 22 | + |
| 23 | +-- | Returns the label of a code point if it has no character name, otherwise |
| 24 | +-- returns @\"UNDEFINED\"@. |
| 25 | +-- |
| 26 | +-- See subsection |
| 27 | +-- [“Code Point Labels”](https://www.unicode.org/versions/Unicode15.0.0/ch04.pdf#G135248) |
| 28 | +-- in section 4.8 “Name” of the Unicode Standard. |
| 29 | +-- |
| 30 | +-- @since 0.4.0 |
| 31 | +label :: Char -> IO CStringLen |
| 32 | +label c = case codePointType c of |
| 33 | + ControlType -> mkLabel 8# "control-"# |
| 34 | + PrivateUseType -> mkLabel 12# "private-use-"# |
| 35 | + SurrogateType -> mkLabel 10# "surrogate-"# |
| 36 | + NoncharacterType -> mkLabel 13# "noncharacter-"# |
| 37 | + ReservedType -> mkLabel 9# "reserved-"# |
| 38 | + _ -> pure (Ptr "UNDEFINED"#, 9) |
| 39 | + |
| 40 | + where |
| 41 | + |
| 42 | + mkLabel len s0 = allocaArray (I# len + 6) $ \s -> do |
| 43 | + copyArray s (Ptr s0) (I# len) |
| 44 | + len' <- addHexCodePoint s len len c |
| 45 | + pure (s, len') |
| 46 | + |
| 47 | +-- | Appned the code point of a character using the Unicode Standard convention: |
| 48 | +-- hexadecimal codepoint padded with zeros if inferior to 4 characters. |
| 49 | +-- |
| 50 | +-- It is the responsability of the caller to provide a 'CString' that can hold |
| 51 | +-- up to 6 characters from the provided index. |
| 52 | +addHexCodePoint |
| 53 | + :: CString -- ^ Destination ASCII string |
| 54 | + -> Int# -- ^ String length |
| 55 | + -> Int# -- ^ Index |
| 56 | + -> Char -- ^ Character which code point will be added to the string |
| 57 | + -> IO Int -- ^ New size of the string |
| 58 | +addHexCodePoint s len i0 c |
| 59 | + | isTrue# (cp# <=# 0x0000f#) = prependAt 3# <* pad0 0# <* pad0 1# <* pad0 2# |
| 60 | + | isTrue# (cp# <=# 0x000ff#) = prependAt 3# <* pad0 0# <* pad0 1# |
| 61 | + | isTrue# (cp# <=# 0x00fff#) = prependAt 3# <* pad0 0# |
| 62 | + | isTrue# (cp# <=# 0x0ffff#) = prependAt 3# |
| 63 | + | isTrue# (cp# <=# 0xfffff#) = prependAt 4# |
| 64 | + | otherwise = prependAt 5# |
| 65 | + where |
| 66 | + !(I# cp#) = ord c |
| 67 | + pad0 i = pokeElemOff s (I# (i0 +# i)) (CChar 0x30) |
| 68 | + prependAt i = go (i0 +# i) (quotRemInt# cp# 16#) $> I# (len +# i +# 1#) |
| 69 | + go i (# n#, d #) = do |
| 70 | + pokeElemOff s (I# i) (intToDigiT d) |
| 71 | + case n# of |
| 72 | + 0# -> pure () |
| 73 | + _ -> go (i -# 1#) (quotRemInt# n# 16#) |
| 74 | + |
| 75 | +-- | Convert an 'Int#' in the range 0..15 to the corresponding single digit |
| 76 | +-- 'CChar' in upper case. |
| 77 | +-- |
| 78 | +-- Undefined for numbers outside the 0..15 range. |
| 79 | +intToDigiT :: Int# -> CChar |
| 80 | +intToDigiT i = if isTrue# (i <=# 9#) |
| 81 | + then fromIntegral (I# (0x30# +# i)) |
| 82 | + else fromIntegral (I# (0x37# +# i)) |
0 commit comments