Skip to content

Commit 6da2784

Browse files
committed
scripts: Encode extensions using 16 bits
This will enable to process Unicode 16.0.
1 parent c282124 commit 6da2784

File tree

1 file changed

+15
-16
lines changed

1 file changed

+15
-16
lines changed

ucd2haskell/exe/UCD2Haskell/Modules/ScriptsExtensions.hs

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -10,15 +10,14 @@ module UCD2Haskell.Modules.ScriptsExtensions
1010
) where
1111

1212
import Control.Arrow (Arrow (..))
13-
import Control.Exception (assert)
1413
import qualified Data.ByteString as B
1514
import qualified Data.ByteString.Builder as BB
1615
import qualified Data.ByteString.Short as BS
1716
import qualified Data.List.NonEmpty as NE
1817
import qualified Data.Map.Strict as Map
1918
import Data.Semigroup (Arg (..))
2019
import qualified Data.Set as Set
21-
import Data.Word (Word8)
20+
import Data.Word (Word16)
2221
import qualified Unicode.CharacterDatabase.Parser.Common as U
2322
import qualified Unicode.CharacterDatabase.Parser.Properties.Defaults as Defaults
2423
import qualified Unicode.CharacterDatabase.Parser.Properties.Single as Prop
@@ -127,15 +126,15 @@ genScriptExtensionsModule moduleName aliases extensions = Fold step initial done
127126
. Set.toAscList
128127
. Set.map (\s -> Arg (mkHaskellConstructor' s) s)
129128
$ usedScripts
130-
toWord8 :: Word8 -> Word8
131-
toWord8 =
132-
assert (fromEnum (Map.size encodedExtensions) < 0xff)
133-
(fromIntegral . fromEnum)
129+
toWord16 :: Word16 -> Word16
130+
toWord16 = if fromEnum (Map.size encodedExtensions) < 0xffff
131+
then fromIntegral . fromEnum
132+
else error "Too many script extensions"
134133

135134
mkHaskellConstructor' = B.toStrict . BB.toLazyByteString . mkHaskellConstructor
136-
encodedAbbr :: Map.Map BS.ShortByteString Word8
135+
encodedAbbr :: Map.Map BS.ShortByteString Word16
137136
encodedAbbr = Map.fromList (first getScriptAbbr <$> zip scripts [0..])
138-
encodeAbbr :: BS.ShortByteString -> Word8
137+
encodeAbbr :: BS.ShortByteString -> Word16
139138
encodeAbbr = (encodedAbbr Map.!)
140139

141140
singleScriptExtensions = pure . getScriptAbbr <$> scripts
@@ -147,10 +146,10 @@ genScriptExtensionsModule moduleName aliases extensions = Fold step initial done
147146
extensionsList = singleScriptExtensions
148147
<> Set.toList multiScriptExtensions
149148

150-
encodedExtensions :: Map.Map (NE.NonEmpty BS.ShortByteString) Word8
151-
encodedExtensions = let len = length extensionsList in if len > 0xff
152-
then error ("Too many script extensions: " <> show len)
153-
else Map.fromList (zip extensionsList [0..])
149+
encodedExtensions :: Map.Map (NE.NonEmpty BS.ShortByteString) Word16
150+
encodedExtensions = let len = length extensionsList in if len < 0xffff
151+
then Map.fromList (zip extensionsList [0..])
152+
else error ("Too many script extensions: " <> show len)
154153

155154
encodeExtensions = (encodedExtensions Map.!)
156155

@@ -166,7 +165,7 @@ genScriptExtensionsModule moduleName aliases extensions = Fold step initial done
166165
True
167166
(NE.singleton 3)
168167
[5]
169-
toWord8
168+
toWord16
170169
(def, BB.intDec (fromEnum def))
171170
(def, BB.intDec (fromEnum def))
172171
planes0To3
@@ -176,8 +175,8 @@ genScriptExtensionsModule moduleName aliases extensions = Fold step initial done
176175
(Set.fromList ["Addr#", "Int(..)", "nullAddr#", "negateInt#"])
177176

178177
mkDecodeScriptExtensions
179-
:: (NE.NonEmpty BS.ShortByteString -> Word8)
180-
-> (BS.ShortByteString -> Word8)
178+
:: (NE.NonEmpty BS.ShortByteString -> Word16)
179+
-> (BS.ShortByteString -> Word16)
181180
-> Set.Set (NE.NonEmpty BS.ShortByteString)
182181
-> BB.Builder
183182
mkDecodeScriptExtensions encodeExtensions encodeAbbr
@@ -187,7 +186,7 @@ genScriptExtensionsModule moduleName aliases extensions = Fold step initial done
187186

188187
mkDecodeScriptExtensions' = foldMap $ \(Arg v exts) -> mconcat
189188
[ "\n "
190-
, BB.word8Dec v
189+
, BB.word16Dec v
191190
, "# -> (# "
192191
, BB.intDec (length exts)
193192
, "#, \""

0 commit comments

Comments
 (0)