Skip to content

Commit 281e9db

Browse files
committed
ucd2haskell: Use Shamochu on predicates bitmaps
- Compress predicates bitmaps with the Samochu algorithm. - Rename `lookupBit64` to `lookupBit` and lookup bytes, not words. - Improve properties files.
1 parent 5b870dd commit 281e9db

File tree

15 files changed

+1410
-1268
lines changed

15 files changed

+1410
-1268
lines changed

ucd2haskell/exe/UCD2Haskell/Generator.hs

Lines changed: 179 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,15 @@ module UCD2Haskell.Generator
1212
, moduleToFileName
1313
, dirFromFileName
1414
-- * Bitmap
15+
, BitmapType(..)
1516
, genBitmap
1617
, genEnumBitmap
1718
, bitMapToAddrLiteral
1819
, enumMapToAddrLiteral
1920
, chunkAddrLiteral
2021
, word32ToWord8s
2122
, splitPlanes
23+
, genBitmapShamochu
2224
, genEnumBitmapShamochu
2325
, generateShamochuBitmaps
2426
, toLookupBitMapName
@@ -29,6 +31,8 @@ module UCD2Haskell.Generator
2931
, apacheLicense
3032
) where
3133

34+
import Control.Exception (assert)
35+
import Data.Bifunctor (Bifunctor(..))
3236
import Data.Bits (Bits (..))
3337
import qualified Data.ByteString as B
3438
import qualified Data.ByteString.Builder as BB
@@ -185,7 +189,7 @@ genBitmap funcName ordList = mconcat
185189
, showPaddedHeXB (minimum ordList)
186190
, " && cp <= 0x"
187191
, showPaddedHeXB (maximum ordList)
188-
, " && lookupBit64 bitmap# cp\n"
192+
, " && lookupBit bitmap# cp\n"
189193
, " where\n" ]
190194
, rawBitmap )
191195
-- Planes 0-3 and 14
@@ -203,17 +207,21 @@ genBitmap funcName ordList = mconcat
203207
, " = False\n" ]
204208
else ""
205209
, " | cp < 0x", showPaddedHeXB bound1
206-
, " = lookupBit64 bitmap# cp\n"
210+
, " = lookupBit bitmap# cp\n"
207211
, " | cp < 0xE0000 = False\n"
208212
, " | cp < 0x", showPaddedHeXB bound2
209-
, " = lookupBit64 bitmap# (cp - 0x"
213+
, " = lookupBit bitmap# (cp - 0x"
210214
, showPaddedHeXB (0xE0000 - bound1)
211215
, ")\n"
212216
, " | otherwise = False\n"
213217
, " where\n"
214218
, " cp = ord c\n" ]
215219
, planes0To3 <> plane14 )
216220

221+
{-|
222+
>>> positionsToBitMap [0, 3, 8]
223+
[True,False,False,True,False,False,False,False,True]
224+
-}
217225
positionsToBitMap :: [Int] -> [Bool]
218226
positionsToBitMap = go 0
219227

@@ -450,6 +458,83 @@ word32ToWord8s n = (\k -> fromIntegral ((n `shiftR` k) .&. 0xff)) <$> [0,8..24]
450458
-- Bitmaps: Shamochu algorithm
451459
--------------------------------------------------------------------------------
452460

461+
genBitmapShamochu ::
462+
HasCallStack
463+
=> String
464+
-> NE.NonEmpty Word
465+
-- ^ Chunk size stage 1
466+
-> [Word]
467+
-- ^ Chunk size stage 2
468+
-> [Int]
469+
-> BB.Builder
470+
genBitmapShamochu funcNameStr stage1 stage2 ordList = mconcat
471+
[ "{-# INLINE " <> funcName <> " #-}\n"
472+
, funcName, " :: Char -> Bool\n"
473+
, funcName, func
474+
, "\n"
475+
, generateShamochuBitmaps funcNameStr False BitMap stage1 stage2 id (packBits bitmap)
476+
]
477+
where
478+
funcName = BB.string7 funcNameStr
479+
rawBitmap = positionsToBitMap ordList
480+
lookupFunc = toLookupBitMapName funcNameStr
481+
(func, bitmap) = if length rawBitmap <= 0x40000
482+
-- Only planes 0-3
483+
then
484+
( mconcat
485+
[ " c = c >= '\\x"
486+
, showPaddedHeXB (minimum ordList)
487+
, "' && c <= '\\x"
488+
, showPaddedHeXB (maximum ordList)
489+
, "' && ", lookupFunc, " (ord c)\n" ]
490+
, rawBitmap )
491+
-- Planes 0-3 and 14
492+
else
493+
let (planes0To3, plane14) = splitPlanes "genBitmap: cannot build" not rawBitmap
494+
bound0 = pred (minimum ordList)
495+
bound1 = length planes0To3
496+
bound2 = 0xE0000 + length plane14
497+
in ( mconcat
498+
[ " c\n"
499+
, if bound0 > 0
500+
then mconcat
501+
[ " | c < '\\x"
502+
, showPaddedHeXB bound0
503+
, "' = False\n" ]
504+
else ""
505+
, " | c < '\\x", showPaddedHeXB bound1
506+
, "' = ", lookupFunc, " (ord c)\n"
507+
, " | c < '\\xE0000' = False\n"
508+
, " | c < '\\x", showPaddedHeXB bound2
509+
, "' = ", lookupFunc, " (ord c - 0x"
510+
, showPaddedHeXB (0xE0000 - bound1)
511+
, ")\n"
512+
, " | otherwise = False\n"
513+
]
514+
, planes0To3 <> plane14 )
515+
516+
{-|
517+
>>> packBits [True, False, False, False, False, False, False, False, False, True]
518+
[1,2]
519+
-}
520+
packBits :: [Bool] -> [Word8]
521+
packBits = L.unfoldr go
522+
where
523+
go :: [Bool] -> Maybe (Word8, [Bool])
524+
go [] = Nothing
525+
go xs = Just . first pack . splitAt 8 $ xs
526+
527+
pack :: [Bool] -> Word8
528+
pack = toByte . padTo8
529+
530+
padTo8 :: [Bool] -> [Bool]
531+
padTo8 xs
532+
| length xs >= 8 = xs
533+
| otherwise = xs <> replicate (8 - length xs) False
534+
535+
toByte :: [Bool] -> Word8
536+
toByte xs = sum $ map (\i -> if xs !! i then 1 `shiftL` i else 0) [0..7]
537+
453538
genEnumBitmapShamochu
454539
:: forall a. (HasCallStack, Bounded a, Enum a, Eq a, Show a)
455540
=> String
@@ -477,7 +562,7 @@ genEnumBitmapShamochu funcNameStr rawInt stage1 stage2 convert (defPUA, pPUA) (d
477562
, funcName, " :: Char -> Int", rawSuffix, "\n"
478563
, funcName, func
479564
, "\n"
480-
, generateShamochuBitmaps funcNameStr rawInt stage1 stage2 convert bitmap
565+
, generateShamochuBitmaps funcNameStr rawInt ByteMap stage1 stage2 convert bitmap
481566
]
482567
where
483568
rawSuffix = if rawInt then "#" else ""
@@ -532,24 +617,44 @@ genEnumBitmapShamochu funcNameStr rawInt stage1 stage2 convert (defPUA, pPUA) (d
532617
, " ", if rawInt then "!cp@(I# cp#)" else "cp", " = ord c\n" ]
533618
, planes0To3' <> plane14' )
534619

620+
data BitmapType = BitMap | ByteMap
535621
generateShamochuBitmaps ::
536-
String -> Bool -> NE.NonEmpty Word -> [Word] -> (a -> Word8) -> [a] -> BB.Builder
537-
generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
538-
case Shamochu.compress powersStage1 powersStage2 (Exts.fromList (convert <$> xs)) of
622+
-- | Name of the function
623+
String ->
624+
-- | Use raw 'Int#' if true
625+
Bool ->
626+
-- | Type
627+
BitmapType ->
628+
-- | Chunk sizes stage 1
629+
NE.NonEmpty Word ->
630+
-- | Chunk sizes stage 2
631+
[Word] ->
632+
-- | Conversion function
633+
(a -> Word8) ->
634+
-- | Input
635+
[a] ->
636+
BB.Builder
637+
generateShamochuBitmaps name rawInt mapType powersStage1 powersStage2 convert xs =
638+
case Shamochu.compress powersStage1 powersStage2 xs' of
539639
Shamochu.OneStage{..} -> trace' "OneStage" stats $ mconcat
540640
[ "{-# INLINE ", toLookupBitMapName name, " #-}\n"
541-
, toLookupBitMapName name, " :: Int", rawSuffix, " -> Int", rawSuffix, "\n"
641+
, toLookupBitMapName name, " :: Int", rawSuffix, " -> ", outputType, "\n"
542642
, toLookupBitMapName name, " n =\n"
543643
-- Lookup:
544644
-- mask = (1 << chunk_size_log2) - 1;
545645
-- original[i] = data[offsets[i >> chunk_size_log2] + (i & mask)];
546-
, mkLookup (Shamochu.dataIntSize stats) "data" 1 . mconcat $
547-
[ mkLookup (Shamochu.offsets1IntSize stats) "offsets" 2 $
548-
mkIndent 3 <> mkShiftR "n" (Shamochu.dataChunkSizeLog2 stats)
549-
, mkAnd "n" "mask" ]
646+
, case mapType of
647+
BitMap -> mkBitLookup "data" 1 . mconcat $
648+
[ mkWordLookup (Shamochu.offsets1IntSize stats) "offsets" 2 $
649+
mkIndent 3 <> mkShiftR "n" (3 + Shamochu.dataChunkSizeLog2 stats)
650+
, mkAnd (mkShiftR' "n" 3) "mask" ]
651+
ByteMap -> mkWordLookup (Shamochu.dataIntSize stats) "data" 1 . mconcat $
652+
[ mkWordLookup (Shamochu.offsets1IntSize stats) "offsets" 2 $
653+
mkIndent 3 <> mkShiftR "n" (Shamochu.dataChunkSizeLog2 stats)
654+
, mkAnd "n" "mask" ]
550655
, "\n"
551656
, " where\n"
552-
, " ", mkMask "mask" (Shamochu.dataChunkSizeLog2 stats)
657+
, " ", mkMaskDef "mask" (Shamochu.dataChunkSizeLog2 stats)
553658
, " !(Ptr data#) = ", dataBitMap, "\n"
554659
, " !(Ptr offsets#) = ", offsetsBitMap, "\n"
555660
, "\n"
@@ -559,7 +664,7 @@ generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
559664
4
560665
50
561666
(Shamochu.dataIntSize stats `shiftR` 3)
562-
(Exts.toList array)
667+
(pad (Exts.toList array))
563668
"\"#\n"
564669
, "\n"
565670
, offsetsBitMap, " :: Ptr ", offsetType, "\n"
@@ -579,30 +684,39 @@ generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
579684
offsetType = "Word" <> BB.wordDec (Shamochu.offsets1IntSize stats)
580685
Shamochu.TwoStages{..} -> trace' "TwoStages" stats $ mconcat
581686
[ "{-# INLINE ", toLookupBitMapName name, " #-}\n"
582-
, toLookupBitMapName name, " :: Int", rawSuffix, " -> Int", rawSuffix, "\n"
687+
, toLookupBitMapName name, " :: Int", rawSuffix, " -> ", outputType, "\n"
583688
, toLookupBitMapName name, " n =\n"
584689
-- Lookup:
585690
-- mask_data = (1 << data_chunk_size_log2) - 1
586691
-- mask_offsets = (1 << offsets_chunk_size_log2) - 1
587692
-- data[
588693
-- offsets1[
589-
-- offsets2[ks >> (data_chunk_size_log2 + offsets_chunk_size_log2)] +
590-
-- ((ks >> data_chunk_size_log2) & mask_offsets)
694+
-- offsets2[i >> (data_chunk_size_log2 + offsets_chunk_size_log2)] +
695+
-- ((i >> data_chunk_size_log2) & mask_offsets)
591696
-- ] +
592-
-- (ks & mask_data)
697+
-- (i & mask_data)
593698
-- ];
594-
, mkLookup (Shamochu.dataIntSize stats) "data" 1 . mconcat $
595-
[ mkLookup (Shamochu.offsets1IntSize stats) "offsets1" 2 . mconcat $
596-
[ mkLookup (Shamochu.offsets2IntSize stats) "offsets2" 3 $
597-
mkIndent 4 <>
598-
mkShiftR "n" (Shamochu.dataChunkSizeLog2 stats + Shamochu.offsets1ChunkSizeLog2 stats)
599-
, mkAnd ("(" <> mkShiftR "n" (Shamochu.dataChunkSizeLog2 stats) <> ")") "maskOffsets"
600-
]
601-
, mkAnd "n" "maskData" ]
699+
, case mapType of
700+
BitMap -> mkBitLookup "data" 1 . mconcat $
701+
[ mkWordLookup (Shamochu.offsets1IntSize stats) "offsets1" 2 . mconcat $
702+
[ mkWordLookup (Shamochu.offsets2IntSize stats) "offsets2" 3 $
703+
mkIndent 4 <>
704+
mkShiftR "n" (3 + Shamochu.dataChunkSizeLog2 stats + Shamochu.offsets1ChunkSizeLog2 stats)
705+
, mkAnd (mkShiftR' "n" (3 + Shamochu.dataChunkSizeLog2 stats)) "maskOffsets"
706+
]
707+
, mkAnd (mkShiftR' "n" 3) "maskData" ]
708+
ByteMap -> mkWordLookup (Shamochu.dataIntSize stats) "data" 1 . mconcat $
709+
[ mkWordLookup (Shamochu.offsets1IntSize stats) "offsets1" 2 . mconcat $
710+
[ mkWordLookup (Shamochu.offsets2IntSize stats) "offsets2" 3 $
711+
mkIndent 4 <>
712+
mkShiftR "n" (Shamochu.dataChunkSizeLog2 stats + Shamochu.offsets1ChunkSizeLog2 stats)
713+
, mkAnd (mkShiftR' "n" (Shamochu.dataChunkSizeLog2 stats)) "maskOffsets"
714+
]
715+
, mkAnd "n" "maskData" ]
602716
, "\n"
603717
, " where\n"
604-
, " ", mkMask "maskData" (Shamochu.dataChunkSizeLog2 stats)
605-
, " ", mkMask "maskOffsets" (Shamochu.offsets1ChunkSizeLog2 stats)
718+
, " ", mkMaskDef "maskData" (Shamochu.dataChunkSizeLog2 stats)
719+
, " ", mkMaskDef "maskOffsets" (Shamochu.offsets1ChunkSizeLog2 stats)
606720
, " !(Ptr data#) = ", dataBitMap, "\n"
607721
, " !(Ptr offsets1#) = ", offsets1BitMap, "\n"
608722
, " !(Ptr offsets2#) = ", offsets2BitMap, "\n"
@@ -613,7 +727,7 @@ generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
613727
4
614728
50
615729
(Shamochu.dataIntSize stats `shiftR` 3)
616-
(Exts.toList dataArray)
730+
(pad (Exts.toList dataArray))
617731
"\"#\n"
618732
, "\n"
619733
, offsets1BitMap, " :: Ptr ", offset1Type, "\n"
@@ -644,6 +758,20 @@ generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
644758
offset1Type = "Word" <> BB.wordDec (Shamochu.offsets1IntSize stats)
645759
offset2Type = "Word" <> BB.wordDec (Shamochu.offsets2IntSize stats)
646760
where
761+
xs' = Exts.fromList (convert <$> xs)
762+
maxWordBitSizeLog2 = 3
763+
maxWordByteSize = assert
764+
(all (>= maxWordBitSizeLog2) powersStage1) -- Chunks should not cut words
765+
(2^maxWordBitSizeLog2 `div` 8)
766+
outputType = case mapType of
767+
BitMap -> "Bool"
768+
ByteMap -> if rawInt then "Int#" else "Int"
769+
pad ys = case mapType of
770+
-- Ensure lookupBit read full words at the edge
771+
BitMap -> case rem (length ys) maxWordByteSize of
772+
0 -> ys
773+
k -> ys <> replicate (maxWordByteSize - k) 0
774+
ByteMap -> ys
647775
rawSuffix = if rawInt then "#" else ""
648776
trace' stages stats = trace $ mconcat
649777
[ "* ", name, ": Shamochu: ", stages, "; savings: "
@@ -653,25 +781,31 @@ generateShamochuBitmaps name rawInt powersStage1 powersStage2 convert xs =
653781
nameBB = BB.string7 name
654782
mkIndent :: Word -> BB.Builder
655783
mkIndent count = foldMap (const " ") [1..count]
656-
mkLookup dataSize addrName indent index = mconcat
784+
mkBitLookup addrName indent index = mconcat
657785
[ mkIndent indent
658-
, "lookupWord", BB.wordDec dataSize, "AsInt", rawSuffix, " ", addrName, "# (\n"
786+
, "lookupBit", rawSuffix, " ", addrName, "# (\n"
787+
, index, "\n"
788+
, mkIndent indent
789+
, ") (n "
790+
-- x % 2^n = x & (2^n - 1)
791+
, if rawInt then "`andI#` 7#)" else ".&. 7)" ]
792+
mkWordLookup dataSize addrName indent index = mconcat
793+
[ mkIndent indent
794+
, "lookupWord", BB.wordDec dataSize, "AsInt", rawSuffix
795+
, " ", addrName, "# (\n"
659796
, index, "\n"
660797
, mkIndent indent, ")" ]
661-
mkMask mask count = if rawInt
798+
mkMaskDef mask count = if rawInt
662799
then mconcat [mask, " = (1# `iShiftL#` ", BB.wordDec count, "#) -# 1#\n"]
663800
else mconcat [mask, " = (1 `shiftL` ", BB.wordDec count, ") - 1\n"]
801+
mkMask n mask = if rawInt
802+
then mconcat ["(", n, " `andI#` ", mask, ")"]
803+
else mconcat ["(", n, " .&. ", mask, ")"]
804+
mkAnd n mask = (if rawInt then " +# " else " + ") <> mkMask n mask
664805
mkShiftR n count = if rawInt
665806
then mconcat [n, " `iShiftRL#` ", BB.wordDec count, "#"]
666807
else mconcat [n, " `shiftR` ", BB.wordDec count]
667-
mkAnd n mask = if rawInt
668-
then mconcat [" +# (", n, " `andI#` ", mask, ")"]
669-
else mconcat [" + (", n, " .&. ", mask, ")"]
670-
671-
toTitle :: String -> String
672-
toTitle = \case
673-
c:cs -> toUpper c : cs
674-
cs -> cs
808+
mkShiftR' n count = "(" <> mkShiftR n count <> ")"
675809

676810
toLookupBitMapName :: String -> BB.Builder
677811
toLookupBitMapName name = "lookup" <> BB.string7 (toTitle name) <> "BitMap"
@@ -680,6 +814,11 @@ toLookupBitMapName name = "lookup" <> BB.string7 (toTitle name) <> "BitMap"
680814
-- Helpers
681815
--------------------------------------------------------------------------------
682816

817+
toTitle :: String -> String
818+
toTitle = \case
819+
c:cs -> toUpper c : cs
820+
cs -> cs
821+
683822
unlinesBB :: [BB.Builder] -> BB.Builder
684823
unlinesBB = (<> "\n") . mconcat . L.intersperse "\n"
685824

0 commit comments

Comments
 (0)