composewell
diff --git a/‎.editorconfig‎
Lines changed: 12 additions & 0 deletions b/‎.editorconfig‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎.hlint.ignore‎
Lines changed: 7 additions & 0 deletions b/‎.hlint.ignore‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎.packcheck.ignore‎
Lines changed: 1 addition & 0 deletions b/‎.packcheck.ignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎Changelog.md‎
Lines changed: 17 additions & 0 deletions b/‎Changelog.md‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 86 additions & 0 deletions b/‎README.md‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎appveyor.yml‎
Lines changed: 1 addition & 1 deletion b/‎appveyor.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎bench/Main.hs‎
Lines changed: 180 additions & 0 deletions b/‎bench/Main.hs‎
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,12 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+[*]
+indent_style = space
+indent_size = 4
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
@@ -1,3 +1,10 @@
 lib/Unicode/Internal/Division.hs
 lib/Unicode/Internal/Char/PropList.hs
 lib/Unicode/Internal/Char/DerivedCoreProperties.hs
+lib/Unicode/Internal/Char/UnicodeData/CombiningClass.hs
+lib/Unicode/Internal/Char/UnicodeData/Compositions.hs
+lib/Unicode/Internal/Char/UnicodeData/Decomposable.hs
+lib/Unicode/Internal/Char/UnicodeData/DecomposableK.hs
+lib/Unicode/Internal/Char/UnicodeData/Decompositions.hs
+lib/Unicode/Internal/Char/UnicodeData/DecompositionsK2.hs
+lib/Unicode/Internal/Char/UnicodeData/GeneralCategory.hs
@@ -1,4 +1,5 @@
 .packcheck.ignore
+.editorconfig
 .github/workflows/haskell.yml
 appveyor.yml
 stack.yaml
 
@@ -1,5 +1,22 @@
 # Changelog
 
+## 0.3.0 (December 2021)
+
+- Support for big-endian architectures.
+- Added `GeneralCategory` data type and corresponding `generalCategoryAbbr`,
+  `generalCategory` functions.
+- Added the following functions to `Unicode.Char.General`:
+  `isAlphabetic`, `isAlphaNum`,
+  `isControl`, `isMark`, `isPrint`, `isPunctuation`, `isSeparator`,
+  `isSymbol` and `isWhiteSpace`.
+- Added the module `Unicode.Char.Numeric`.
+- **Breaking change:** Changed the behavior of `isLetter` and `isSpace` to match
+  `base`’s `Data.Char` behavior. Moved these functions to the compatibility module
+  `Unicode.Char.General.Compat`. The previous behavior can be obtained using
+  `isAlphabetic` and `isWhiteSpace` respectively.
+- Re-exported some functions from `Data.Char` in order to make `Unicode.Char`
+  a drop-in replacement.
+
 ## 0.2.0 (November 2021)
 
 * Update to [Unicode 14.0.0](https://www.unicode.org/versions/Unicode14.0.0/).
 
@@ -15,6 +15,92 @@ any other packages or use cases.
 
 Please see the haddock documentation for reference documentation.
 
+## Performance
+
+`unicode-data` is up to _5 times faster_ than `base`.
+
+The following benchmark compares the time taken in milliseconds to process all
+the Unicode code points for `base-4.16` and this package (v0.3).
+Machine: 8 × AMD Ryzen 5 2500U on Linux.
+
+```
+All
+  Unicode.Char.Case
+    isLower
+      base:           OK (6.59s)
+         26 ms ± 238 μs
+      unicode-data:   OK (1.16s)
+        4.5 ms ±  83 μs, 0.17x
+    isUpper
+      base:           OK (1.69s)
+         27 ms ± 459 μs
+      unicode-data:   OK (1.21s)
+        4.8 ms ±  77 μs, 0.18x
+  Unicode.Char.General
+    generalCategory
+      base:           OK (0.92s)
+        131 ms ± 1.5 ms
+      unicode-data:   OK (1.62s)
+        108 ms ± 1.2 ms, 0.82x
+    isAlphaNum
+      base:           OK (3.28s)
+         26 ms ± 300 μs
+      unicode-data:   OK (20.60s)
+        5.0 ms ±  59 μs, 0.19x
+    isControl
+      base:           OK (1.61s)
+         26 ms ± 463 μs
+      unicode-data:   OK (1.22s)
+        4.8 ms ±  53 μs, 0.19x
+    isMark
+      base:           OK (0.80s)
+         26 ms ± 339 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  77 μs, 0.20x
+    isPrint
+      base:           OK (3.32s)
+         26 ms ± 498 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  55 μs, 0.20x
+    isPunctuation
+      base:           OK (3.41s)
+         27 ms ± 497 μs
+      unicode-data:   OK (2.67s)
+        5.3 ms ±  28 μs, 0.20x
+    isSeparator
+      base:           OK (0.84s)
+         27 ms ± 422 μs
+      unicode-data:   OK (1.41s)
+        5.5 ms ±  52 μs, 0.21x
+    isSymbol
+      base:           OK (1.72s)
+         27 ms ± 443 μs
+      unicode-data:   OK (1.45s)
+        5.7 ms ± 112 μs, 0.21x
+  Unicode.Char.General.Compat
+    isAlpha
+      base:           OK (3.26s)
+         26 ms ± 254 μs
+      unicode-data:   OK (2.66s)
+        5.2 ms ±  48 μs, 0.20x
+    isLetter
+      base:           OK (1.70s)
+         27 ms ± 453 μs
+      unicode-data:   OK (1.33s)
+        5.2 ms ±  69 μs, 0.19x
+    isSpace
+      base:           OK (0.85s)
+         13 ms ± 237 μs
+      unicode-data:   OK (1.69s)
+        6.7 ms ±  61 μs, 0.49x
+  Unicode.Char.Numeric
+    isNumber
+      base:           OK (1.67s)
+         26 ms ± 316 μs
+      unicode-data:   OK (1.32s)
+        5.2 ms ±  91 μs, 0.20x
+```
+
 ## Unicode database version update
 
 To update the Unicode version please update the version number in
 
@@ -33,7 +33,7 @@ environment:
     # version.
     #STACKVER: "1.6.5"
     STACK_UPGRADE: "y"
-    RESOLVER: "lts-18.17"
+    RESOLVER: "lts-18.18"
     STACK_ROOT: "c:\\sr"
 
     # ------------------------------------------------------------------------
 
@@ -0,0 +1,180 @@
+import Control.DeepSeq (NFData, deepseq)
+import Data.Ix (Ix(..))
+import Test.Tasty.Bench (Benchmark, bgroup, bench, bcompare, nf, defaultMain)
+
+import qualified Data.Char as B
+import qualified Unicode.Char.Case as C
+import qualified Unicode.Char.General as G
+import qualified Unicode.Char.General.Compat as GC
+import qualified Unicode.Char.Identifiers as I
+import qualified Unicode.Char.Normalization as N
+import qualified Unicode.Char.Numeric as Num
+
+-- | A single labelled benchmark case: a function over 'Char' and the name it is reported under
+data Bench a = Bench
+  { _title :: !String  -- ^ Label of the case; @"base"@ marks the baseline the other cases of a group are compared against
+  , _func :: Char -> a -- ^ Function to benchmark; it is applied to every 'Char'
+  }
+
+-- | Benchmark entry point.
+--
+-- Every case applies its function to all characters from 'minBound' to
+-- 'maxBound' and forces each result to normal form.
+--
+-- Groups built with @bgroup'@ contain a @"base"@ case and report every other
+-- case relative to it. Groups built with plain 'bgroup' have no @base@
+-- counterpart and only report absolute timings.
+main :: IO ()
+main = defaultMain
+  [ bgroup "Unicode.Char.Case"
+    [ bgroup' "isLower"
+      [ Bench "base"         B.isLower
+      , Bench "unicode-data" C.isLower
+      ]
+    , bgroup' "isUpper"
+      [ Bench "base"         B.isUpper
+      , Bench "unicode-data" C.isUpper
+      ]
+    ]
+  , bgroup "Unicode.Char.General"
+    -- Character classification
+    [ bgroup' "generalCategory"
+      [ Bench "base"          (show . B.generalCategory)
+      , Bench "unicode-data"  (show . G.generalCategory)
+      ]
+    , bgroup "isAlphabetic"
+      [ benchNF "unicode-data"  G.isAlphabetic
+      ]
+    , bgroup' "isAlphaNum"
+      [ Bench "base"          B.isAlphaNum
+      , Bench "unicode-data"  G.isAlphaNum
+      ]
+    , bgroup' "isControl"
+      [ Bench "base"          B.isControl
+      , Bench "unicode-data"  G.isControl
+      ]
+    , bgroup' "isMark"
+      [ Bench "base"          B.isMark
+      , Bench "unicode-data"  G.isMark
+      ]
+    , bgroup' "isPrint"
+      [ Bench "base"          B.isPrint
+      , Bench "unicode-data"  G.isPrint
+      ]
+    , bgroup' "isPunctuation"
+      [ Bench "base"          B.isPunctuation
+      , Bench "unicode-data"  G.isPunctuation
+      ]
+    , bgroup' "isSeparator"
+      [ Bench "base"          B.isSeparator
+      , Bench "unicode-data"  G.isSeparator
+      ]
+    , bgroup' "isSymbol"
+      [ Bench "base"          B.isSymbol
+      , Bench "unicode-data"  G.isSymbol
+      ]
+    , bgroup "isWhiteSpace"
+      [ benchNF "unicode-data"  G.isWhiteSpace
+      ]
+    -- Korean Hangul Characters
+    , bgroup "isHangul"
+      [ benchNF "unicode-data"  G.isHangul
+      ]
+    , bgroup "isHangulLV"
+      [ benchNF "unicode-data"  G.isHangulLV
+      ]
+    , bgroup "isJamo"
+      [ benchNF "unicode-data"  G.isJamo
+      ]
+    , bgroup "jamoLIndex"
+      [ benchNF "unicode-data"  G.jamoLIndex
+      ]
+    , bgroup "jamoVIndex"
+      [ benchNF "unicode-data"  G.jamoVIndex
+      ]
+    , bgroup "jamoTIndex"
+      [ benchNF "unicode-data"  G.jamoTIndex
+      ]
+    ]
+  , bgroup "Unicode.Char.General.Compat"
+    [ bgroup' "isAlpha"
+      [ Bench "base"          B.isAlpha
+      , Bench "unicode-data"  GC.isAlpha
+      ]
+    , bgroup' "isLetter"
+      [ Bench "base"          B.isLetter
+      , Bench "unicode-data"  GC.isLetter
+      ]
+    , bgroup' "isSpace"
+      [ Bench "base"          B.isSpace
+      , Bench "unicode-data"  GC.isSpace
+      ]
+    ]
+  , bgroup "Unicode.Char.Identifiers"
+    [ bgroup "isIDContinue"
+      [ benchNF "unicode-data"  I.isIDContinue
+      ]
+    , bgroup "isIDStart"
+      [ benchNF "unicode-data"  I.isIDStart
+      ]
+    , bgroup "isXIDContinue"
+      [ benchNF "unicode-data"  I.isXIDContinue
+      ]
+    , bgroup "isXIDStart"
+      [ benchNF "unicode-data"  I.isXIDStart
+      ]
+    , bgroup "isPatternSyntax"
+      [ benchNF "unicode-data"  I.isPatternSyntax
+      ]
+    , bgroup "isPatternWhitespace"
+      [ benchNF "unicode-data"  I.isPatternWhitespace
+      ]
+    ]
+  , bgroup "Unicode.Char.Normalization"
+    [ bgroup "isCombining"
+      [ benchNF "unicode-data"  N.isCombining
+      ]
+    , bgroup "combiningClass"
+      [ benchNF "unicode-data"  N.combiningClass
+      ]
+    , bgroup "isCombiningStarter"
+      [ benchNF "unicode-data"  N.isCombiningStarter
+      ]
+    -- [TODO] compose, composeStarters
+    , bgroup "isDecomposable"
+      [ bgroup "Canonical"
+        [ benchNF "unicode-data" (N.isDecomposable N.Canonical)
+        ]
+      , bgroup "Kompat"
+        [ benchNF "unicode-data" (N.isDecomposable N.Kompat)
+        ]
+      ]
+    -- [FIXME] Fail due to non-exhaustive pattern matching
+    -- , bgroup "decompose"
+    --   [ bgroup "Canonical"
+    --     [ benchNF "unicode-data" (N.decompose N.Canonical)
+    --     ]
+    --   , bgroup "Kompat"
+    --     [ benchNF "unicode-data" (N.decompose N.Kompat)
+    --     ]
+    --   ]
+    , bgroup "decomposeHangul"
+      [ benchNF "unicode-data" N.decomposeHangul
+      ]
+    ]
+  , bgroup "Unicode.Char.Numeric"
+    [ bgroup' "isNumber"
+      [ Bench "base"          B.isNumber
+      , Bench "unicode-data"  Num.isNumber
+      ]
+    ]
+  ]
+  where
+    -- Build a comparison group: one benchmark per 'Bench' record, each
+    -- created through benchNF' so that non-"base" cases are compared
+    -- against the "base" case of the same group.
+    bgroup' groupTitle bs = bgroup groupTitle
+      [ benchNF' groupTitle title f
+      | Bench title f <- bs
+      ]
+
+    -- The "base" case is a plain benchmark and serves as the baseline.
+    -- Any other case is wrapped in 'bcompare' with a pattern selecting the
+    -- benchmark named "base" whose parent group is groupTitle.
+    -- [NOTE] Works if groupTitle uniquely identifies the benchmark group.
+    benchNF' groupTitle title = case title of
+      "base" -> benchNF title
+      _      -> bcompare ("$NF == \"base\" && $(NF-1) == \"" ++ groupTitle ++ "\"")
+              . benchNF title
+
+    -- Benchmark a function over the whole 'Char' range, named t.
+    benchNF :: forall a. (NFData a) => String -> (Char -> a) -> Benchmark
+    benchNF t f = bench t $ nf (fold_ f) (minBound, maxBound)
+
+    -- Apply f to every character of the given inclusive range, forcing each
+    -- result to normal form and discarding it.
+    fold_ :: forall a. (NFData a) => (Char -> a) -> (Char, Char) -> ()
+    fold_ f = foldr (deepseq . f) () . range
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`.packcheck.ignore`
	`2`	`+.editorconfig`
`2`	`3`	`.github/workflows/haskell.yml`
`3`	`4`	`appveyor.yml`
`4`	`5`	`stack.yaml`