Skip to content

Commit 8dd6e66

Browse files
committed
Add Unicode.Char.Case.Compat with toLower, toUpper and toTitle functions.
Fix UnicodeData.txt parsing.
1 parent fc5a08b commit 8dd6e66

File tree

9 files changed

+4535
-10
lines changed

9 files changed

+4535
-10
lines changed

Changelog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
## 0.3.0 (December 2021)
44

55
- Support for big-endian architectures.
6+
- Added the module `Unicode.Char.Case.Compat`.
67
- Added `GeneralCategory` data type and corresponding `generalCategoryAbbr`,
78
`generalCategory` functions.
89
- Added the following functions to `Unicode.Char.General`:

exe/Parser/Text.hs

Lines changed: 86 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import Control.Monad.IO.Class (MonadIO(liftIO))
2424
import Data.Bits (Bits(..))
2525
import Data.Word (Word8)
2626
import Data.Char (chr, ord, isSpace)
27+
import Data.Functor ((<&>))
2728
import Data.Function ((&))
2829
import Data.List (unfoldr, intersperse)
2930
import Data.Maybe (fromMaybe)
@@ -78,6 +79,9 @@ data DetailedChar =
7879
, _combiningClass :: Int
7980
, _decompositionType :: Maybe DecompType
8081
, _decomposition :: Decomp
82+
, _simpleUppercaseMapping :: Maybe Char
83+
, _simpleLowercaseMapping :: Maybe Char
84+
, _simpleTitlecaseMapping :: Maybe Char
8185
}
8286
deriving (Show)
8387

@@ -99,6 +103,10 @@ apacheLicense modName =
99103
readCodePoint :: String -> Char
100104
readCodePoint = chr . read . ("0x"++)
101105

106+
readCodePointM :: String -> Maybe Char
107+
readCodePointM "" = Nothing
108+
readCodePointM u = Just (readCodePoint u)
109+
102110
genSignature :: String -> String
103111
genSignature = (<> " :: Char -> Bool")
104112

@@ -390,8 +398,8 @@ genDecomposeDefModule moduleName before after dtype pred =
390398
step st dc = genDecomposeDef dc : st
391399

392400
done st =
393-
let body = genHeader ++ before ++ genSign ++ reverse st ++ after
394-
in unlines body
401+
let body = mconcat [genHeader, before, genSign, reverse st, after]
402+
in unlines body
395403

396404
genDecomposeDef dc =
397405
"decompose "
@@ -488,6 +496,48 @@ genCompositionsModule moduleName compExclu non0CC =
488496
++ composeStarterPair (reverse sp)
489497
++ isSecondStarter (Set.toList (Set.fromList ss))
490498

499+
genSimpleCaseMappingModule
500+
:: Monad m
501+
=> String
502+
-> String
503+
-> (DetailedChar -> Maybe Char)
504+
-> Fold m DetailedChar String
505+
genSimpleCaseMappingModule moduleName funcName field =
506+
done <$> Fold.foldl' step initial
507+
508+
where
509+
510+
genHeader =
511+
[ apacheLicense moduleName
512+
, "module " <> moduleName
513+
, "(" ++ funcName ++ ")"
514+
, "where"
515+
, ""
516+
]
517+
genSign =
518+
[ "{-# NOINLINE " ++ funcName ++ " #-}"
519+
, funcName ++ " :: Char -> Char"
520+
]
521+
initial = []
522+
523+
step ds dc = case genUpper dc of
524+
Nothing -> ds
525+
Just d -> d : ds
526+
527+
after = [funcName ++ " c = c"]
528+
529+
done st =
530+
let body = mconcat [genHeader, genSign, reverse st, after]
531+
in unlines body
532+
533+
genUpper dc = field dc <&> \c -> mconcat
534+
[ funcName
535+
, " "
536+
, show (_char dc)
537+
, " = "
538+
, show c
539+
]
540+
491541
genCorePropertiesModule ::
492542
Monad m => String -> (String -> Bool) -> Fold m (String, [Int]) String
493543
genCorePropertiesModule moduleName isProp =
@@ -621,7 +671,16 @@ parseUnicodeDataLines
621671
parseDetailedChar :: String -> DetailedChar
622672
parseDetailedChar line =
623673
DetailedChar
624-
(readCodePoint char) name (read gc) (read combining) dctype dcval
674+
{ _char = readCodePoint char
675+
, _name = name
676+
, _generalCategory = read gc
677+
, _combiningClass = read combining
678+
, _decompositionType = dctype
679+
, _decomposition = dcval
680+
, _simpleUppercaseMapping = readCodePointM sUpper
681+
, _simpleLowercaseMapping = readCodePointM sLower
682+
, _simpleTitlecaseMapping = readCodePointM sTitle
683+
}
625684

626685
where
627686

@@ -632,13 +691,15 @@ parseDetailedChar line =
632691
(_bidi, line5) = span (/= ';') (tail line4)
633692
(decomposition, line6) = span (/= ';') (tail line5)
634693
(dctype, dcval) = readDecomp decomposition
635-
(_numeric, line7) = span (/= ';') (tail line6)
636-
(_bidiM, line8) = span (/= ';') (tail line7)
637-
(_uni1Name, line9) = span (/= ';') (tail line8)
638-
(_iso, line10) = span (/= ';') (tail line9)
639-
(_sUpper, line11) = span (/= ';') (tail line10)
640-
(_sLower, line12) = span (/= ';') (tail line11)
641-
_sTitle = tail line12
694+
(_decimal, line7) = span (/= ';') (tail line6)
695+
(_digit, line8) = span (/= ';') (tail line7)
696+
(_numeric, line9) = span (/= ';') (tail line8)
697+
(_bidiM, line10) = span (/= ';') (tail line9)
698+
(_uni1Name, line11) = span (/= ';') (tail line10)
699+
(_iso, line12) = span (/= ';') (tail line11)
700+
(sUpper, line13) = span (/= ';') (tail line12)
701+
(sLower, line14) = span (/= ';') (tail line13)
702+
sTitle = tail line14
642703

643704
-------------------------------------------------------------------------------
644705
-- Generation
@@ -738,6 +799,9 @@ genModules indir outdir props = do
738799
, decompositionsK2
739800
, decompositionsK
740801
, generalCategory
802+
, simpleUpperCaseMapping
803+
, simpleLowerCaseMapping
804+
, simpleTitleCaseMapping
741805
]
742806

743807
runGenerator
@@ -797,3 +861,15 @@ genModules indir outdir props = do
797861
generalCategory =
798862
( "Unicode.Internal.Char.UnicodeData.GeneralCategory"
799863
, genGeneralCategoryModule)
864+
865+
simpleUpperCaseMapping =
866+
( "Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping"
867+
, \m -> genSimpleCaseMappingModule m "toSimpleUpperCase" _simpleUppercaseMapping)
868+
869+
simpleLowerCaseMapping =
870+
( "Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping"
871+
, \m -> genSimpleCaseMappingModule m "toSimpleLowerCase" _simpleLowercaseMapping)
872+
873+
simpleTitleCaseMapping =
874+
( "Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping"
875+
, \m -> genSimpleCaseMappingModule m "toSimpleTitleCase" _simpleTitlecaseMapping)

lib/Unicode/Char.hs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ module Unicode.Char
2929
( module Unicode.Char.General
3030
, module Unicode.Char.General.Compat
3131
, module Unicode.Char.Case
32+
, module Unicode.Char.Case.Compat
3233
, module Unicode.Char.Numeric
3334
, module Unicode.Char.Normalization
3435
, module Unicode.Char.Identifiers
@@ -41,6 +42,7 @@ where
4142

4243
import Data.Char (chr, ord)
4344
import Unicode.Char.Case
45+
import Unicode.Char.Case.Compat
4446
import Unicode.Char.General
4547
import Unicode.Char.General.Compat
4648
import Unicode.Char.Identifiers

lib/Unicode/Char/Case/Compat.hs

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
-- |
2+
-- Module : Unicode.Char.Case.Compat
3+
-- Copyright : (c) 2020 Composewell Technologies and Contributors
4+
-- License : Apache-2.0
5+
-- Maintainer : [email protected]
6+
-- Stability : experimental
7+
--
8+
-- Compatibility module for case and case mapping related functions..
9+
--
10+
-- The functions of this module are drop-in replacement for those in "Data.Char".
11+
-- They are similar but not identical to some functions in "Unicode.Char.Case",
12+
-- therefore they are placed in a separate module in order to avoid ambiguity.
13+
--
14+
module Unicode.Char.Case.Compat
15+
( toUpper
16+
, toLower
17+
, toTitle
18+
) where
19+
20+
import qualified Unicode.Internal.Char.UnicodeData.SimpleLowerCaseMapping as C
21+
import qualified Unicode.Internal.Char.UnicodeData.SimpleTitleCaseMapping as C
22+
import qualified Unicode.Internal.Char.UnicodeData.SimpleUpperCaseMapping as C
23+
24+
-- | Convert a letter to the corresponding upper-case letter, if any.
25+
-- Any other character is returned unchanged.
26+
--
27+
-- prop> toUpper c == Data.Char.toUpper c
28+
--
29+
-- @since 0.3.0
30+
{-# INLINE toUpper #-}
31+
toUpper :: Char -> Char
32+
toUpper = C.toSimpleUpperCase
33+
34+
-- | Convert a letter to the corresponding lower-case letter, if any.
35+
-- Any other character is returned unchanged.
36+
--
37+
-- prop> toLower c == Data.Char.toLower c
38+
--
39+
-- @since 0.3.0
40+
{-# INLINE toLower #-}
41+
toLower :: Char -> Char
42+
toLower = C.toSimpleLowerCase
43+
44+
-- | Convert a letter to the corresponding title-case or upper-case letter,
45+
-- if any. (Title case differs from upper case only for a small number of
46+
-- ligature letters.) Any other character is returned unchanged.
47+
--
48+
-- prop> toTitle c == Data.Char.toTitle c
49+
--
50+
-- @since 0.3.0
51+
{-# INLINE toTitle #-}
52+
toTitle :: Char -> Char
53+
toTitle = C.toSimpleTitleCase

0 commit comments

Comments
 (0)