Skip to content

Commit 093f2d6

Browse files
committed
core: Improve Blocks
1 parent e324a09 commit 093f2d6

File tree

6 files changed

+424
-378
lines changed

6 files changed

+424
-378
lines changed

ucd2haskell/exe/UCD2Haskell/Modules/Blocks.hs

Lines changed: 28 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ recipe = ModuleRecipe
2323
genBlocksModule
2424

2525
data Acc = Acc
26-
{ blocks :: ![BB.Builder]
26+
{ count :: !Word
27+
, blocks :: ![BB.Builder]
2728
, defs :: ![BB.Builder]
2829
, ranges :: ![(Int, Int)] }
2930

@@ -33,10 +34,11 @@ genBlocksModule moduleName = Fold step initial done
3334

3435
done Acc{..} = let ranges' = reverse ranges in unlinesBB
3536
[ apacheLicense 2022 moduleName
37+
, "{-# LANGUAGE LambdaCase #-}"
3638
, "{-# OPTIONS_HADDOCK hide #-}"
3739
, ""
3840
, "module " <> moduleName
39-
, "(Block(..), BlockDefinition(..), block, blockDefinition)"
41+
, "(Block(..), block, blockDefinition)"
4042
, "where"
4143
, ""
4244
, "import Data.Ix (Ix)"
@@ -53,25 +55,25 @@ genBlocksModule moduleName = Fold step initial done
5355
, " = " <> mconcat (L.intersperse "\n | " (reverse blocks))
5456
, " deriving (Enum, Bounded, Eq, Ord, Ix, Show)"
5557
, ""
56-
, "-- | Block definition: range and name."
57-
, "--"
58-
, "-- @since 0.3.1"
59-
, "data BlockDefinition = BlockDefinition"
60-
, " { blockRange :: !(Int, Int) -- ^ Range"
61-
, " , blockName :: !String -- ^ Name"
62-
, " } deriving (Eq, Ord, Show)"
63-
, ""
6458
, "-- | Block definition"
6559
, "--"
60+
, "-- Undefined for values greater than " <> BB.wordDec (pred count) <> "."
61+
, "--"
62+
, "-- Returned value:"
63+
, "--"
64+
, "-- * Lower bound"
65+
, "-- * Upper bound"
66+
, "-- * Name (null terminated ASCII string)"
67+
, "--"
6668
, "-- @since 0.3.1"
67-
, "blockDefinition :: Block -> BlockDefinition"
68-
, "blockDefinition b = case b of"
69+
, "blockDefinition :: Int# -> (# Int#, Int#, Addr# #)"
70+
, "blockDefinition = \\case"
6971
, mconcat (reverse defs)
70-
, "-- | Character block, if defined."
72+
, "-- | Character block, if defined, else -1."
7173
, "--"
7274
, "-- @since 0.3.1"
73-
, "block :: Char -> Maybe Int"
74-
, "block (C# c#) = getBlock 0# " <> BB.intDec (length ranges - 1) <> BB.char7 '#'
75+
, "block :: Char# -> Int#"
76+
, "block c# = getBlock 0# " <> BB.intDec (length ranges - 1) <> BB.char7 '#'
7577
, " where"
7678
, " -- [NOTE] Encoding"
7779
, " -- A range is encoded as two LE Word32:"
@@ -83,7 +85,7 @@ genBlocksModule moduleName = Fold step initial done
8385
, ""
8486
, " -- Binary search"
8587
, " getBlock l# u# = if isTrue# (l# ># u#)"
86-
, " then Nothing"
88+
, " then -1#"
8789
, " else"
8890
, " let k# = l# +# uncheckedIShiftRL# (u# -# l#) 1#"
8991
, " j# = k# `uncheckedIShiftL#` 1#"
@@ -99,7 +101,7 @@ genBlocksModule moduleName = Fold step initial done
99101
, " then getBlock l# (k# -# 1#)"
100102
, " -- cp in block: get block index"
101103
, " else let block# = cpL0# `uncheckedShiftRL#` 21#"
102-
, " in Just (I# (word2Int# block#))"
104+
, " in word2Int# block#"
103105
, ""
104106
, " getRawCodePoint# = lookupWord32# ranges#"
105107
, ""
@@ -111,7 +113,7 @@ genBlocksModule moduleName = Fold step initial done
111113
, " \"" <> enumMapToAddrLiteral 4 0xff (mkRanges ranges') "\"#"
112114
]
113115

114-
initial = Acc mempty mempty mempty
116+
initial = Acc 0 mempty mempty mempty
115117

116118
step Acc{..} (Prop.Entry range blockName) = case range of
117119
U.SingleChar c -> error ("genBlocksModule: expected range, got: " <> show c)
@@ -120,8 +122,9 @@ genBlocksModule moduleName = Fold step initial done
120122
blockRange = (ord start, ord end)
121123
blockName' = BB.shortByteString blockName
122124
in Acc
123-
{ blocks = mkBlockConstructor blockID blockName' blockRange : blocks
124-
, defs = mkBlockDef blockID blockName' blockRange : defs
125+
{ count = succ count
126+
, blocks = mkBlockConstructor blockID blockName' blockRange : blocks
127+
, defs = mkBlockDef count blockName' blockRange : defs
125128
, ranges = blockRange : ranges }
126129

127130
mkBlockConstructor blockID blockName (l, u) = mconcat
@@ -135,16 +138,12 @@ genBlocksModule moduleName = Fold step initial done
135138
, "."
136139
]
137140

138-
mkBlockDef blockID blockName (l, u) = mconcat
141+
mkBlockDef blockIndex blockName (l, u) = mconcat
139142
[ " "
140-
, blockID
141-
, " -> BlockDefinition (0x"
142-
, showPaddedHexB l
143-
, ", 0x"
144-
, showPaddedHexB u
145-
, ") \""
146-
, blockName
147-
, "\"\n"
143+
, if u == ord maxBound then "_ " else BB.wordDec blockIndex <> "#"
144+
, " -> (# 0x", showPaddedHexB l, "#, 0x", showPaddedHexB u, "#, \""
145+
, blockName -- NOTE: name is ASCII
146+
, "\\0\"# #)\n"
148147
]
149148

150149
-- [NOTE] Encoding: a range is encoded as two LE Word32:

unicode-data/lib/Unicode/Char/General/Blocks.hs

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
{-# LANGUAGE PatternSynonyms #-}
2+
{-# LANGUAGE ViewPatterns #-}
3+
{-# LANGUAGE DeriveGeneric #-}
4+
15
-- |
26
-- Module : Unicode.Char.General.Blocks
37
-- Copyright : (c) 2020 Composewell Technologies and Contributors
@@ -10,19 +14,45 @@
1014
-- @since 0.3.1
1115

1216
module Unicode.Char.General.Blocks
13-
( B.Block(..)
14-
, B.BlockDefinition(..)
17+
( -- * Blocks
18+
B.Block(..)
1519
, block
16-
, B.blockDefinition
20+
-- * Blocks definitions
21+
, BlockDefinition(..)
22+
, blockDefinition
1723
)
1824

1925
where
2026

27+
import GHC.Exts (Char (..), Int (..), dataToTag#, tagToEnum#)
28+
29+
import Unicode.Internal.Bits (unpackCString#)
2130
import qualified Unicode.Internal.Char.Blocks as B
2231

2332
-- | Character [block](https://www.unicode.org/glossary/#block), if defined.
2433
--
2534
-- @since 0.3.1
2635
{-# INLINE block #-}
2736
block :: Char -> Maybe B.Block
28-
block = fmap toEnum . B.block
37+
block (C# c#) = case B.block c# of
38+
-1# -> Nothing
39+
b# -> Just (tagToEnum# b# :: B.Block)
40+
41+
-- | Block definition: range and name.
42+
--
43+
-- @since 0.3.1
44+
data BlockDefinition = BlockDefinition
45+
{ blockRange :: !(Int, Int) -- ^ Range
46+
, blockName :: !String -- ^ Name
47+
} deriving (Eq, Ord, Show)
48+
49+
-- | Block definition
50+
--
51+
-- @since 0.3.1
52+
blockDefinition :: B.Block -> BlockDefinition
53+
blockDefinition b = case B.blockDefinition (dataToTag# b) of
54+
(# lower#, upper#, name# #) -> BlockDefinition range name
55+
where
56+
!range = (I# lower#, I# upper#)
57+
-- Note: names are ASCII. See Unicode Standard 15.0.0, section 3.4.
58+
!name = unpackCString# name#

unicode-data/lib/Unicode/Internal/Bits.hs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,13 @@
1212
-- Fast, static bitmap lookup utilities
1313

1414
module Unicode.Internal.Bits
15-
( lookupBit64,
15+
( -- * Bitmap lookup
16+
lookupBit64,
1617
lookupWord8AsInt,
1718
lookupWord16AsInt,
18-
lookupWord32#
19+
lookupWord32#,
20+
-- * CString
21+
unpackCString#
1922
) where
2023

2124
#include "MachDeps.h"
@@ -36,6 +39,12 @@ import GHC.Exts
3639
byteSwap16#, byteSwap32#)
3740
#endif
3841

42+
#if MIN_VERSION_base(4,15,0)
43+
import GHC.Exts (unpackCString#)
44+
#else
45+
import GHC.CString (unpackCString#)
46+
#endif
47+
3948
-- | @lookup64 addr index@ looks up the bit stored at bit index @index@ using a
4049
-- bitmap starting at the address @addr@. Looks up the 64-bit word containing
4150
-- the bit and then the bit in that word. The caller must make sure that the

0 commit comments

Comments
 (0)