Skip to content

Commit 421659d

Browse files
committed
Add support for ICU 78.0
Also handle any Unicode version mismatch more gracefully.
1 parent 20d1031 commit 421659d

File tree

6 files changed

+85
-18
lines changed

6 files changed

+85
-18
lines changed

experimental/icu/cbits/icu.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ int32_t __hs_uscript_getScriptExtensions
5252
return uscript_getScriptExtensions(codepoint, scripts, capacity, &err);
5353
}
5454

55+
int __hs_getMaxScript(void) {
56+
return u_getIntPropertyMaxValue(UCHAR_SCRIPT);
57+
}
58+
5559
const char * __hs_uscript_getShortName(UScriptCode scriptCode) {
5660
return uscript_getShortName(scriptCode);
5761
}

experimental/icu/cbits/icu.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ int32_t __hs_uscript_getScriptExtensions
4141
, UScriptCode * scripts
4242
, int32_t capacity );
4343

44+
int __hs_getMaxScript(void);
45+
4446
const char * __hs_uscript_getShortName(UScriptCode scriptCode);
4547

4648
#endif

experimental/icu/lib/ICU/Scripts.hsc

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
module ICU.Scripts
44
( Script(..)
5+
, maxSupportedScript
56
, script
67
, codepointScript
78
, scriptShortName
@@ -35,9 +36,15 @@ foreign import ccall safe "icu.h __hs_uscript_getScript" uscript_getScript
3536
foreign import ccall unsafe "icu.h __hs_uscript_getScriptExtensions" uscript_getScriptExtensions
3637
:: UChar32 -> Ptr UScriptCode -> Int32 -> IO Int32
3738

39+
foreign import ccall unsafe "icu.h __hs_getMaxScript" getMaxScript
40+
:: IO CInt
41+
3842
foreign import ccall unsafe "icu.h __hs_uscript_getShortName" uscript_getShortName
3943
:: UScriptCode -> IO CString
4044

45+
maxSupportedScript :: Script
46+
maxSupportedScript = toEnum (fromIntegral (unsafePerformIO getMaxScript))
47+
4148
{-# INLINE codepointScript #-}
4249
codepointScript :: Word32 -> Script
4350
-- codepointScript = toEnum . unsafePerformIO . with 0 . uscript_getScript
@@ -65,13 +72,13 @@ scriptExtensionsRaw
6572
capacity = 30
6673

6774
scriptShortName :: Script -> String
68-
scriptShortName
69-
= unsafePerformIO
70-
. (uscript_getShortName . fromIntegral . fromEnum >=> peekCString)
75+
scriptShortName s = if s <= maxSupportedScript
76+
then unsafePerformIO ((uscript_getShortName . fromIntegral . fromEnum >=> peekCString) s)
77+
else ""
7178

7279

7380
-- See: https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/uscript_8h_source.html
74-
-- Last sync: 2023-03-09
81+
-- Last sync: 2025-09-13
7582

7683
data Script
7784
= Common -- ^ USCRIPT_COMMON = 0
@@ -282,4 +289,9 @@ data Script
282289
| Sunu -- ^ USCRIPT_SUNUWAR = 205
283290
| Todr -- ^ USCRIPT_TODHRI = 206
284291
| Tutg -- ^ USCRIPT_TULU_TIGALARI = 207
285-
deriving (Bounded, Enum, Eq, Show)
292+
| Berf -- ^ USCRIPT_BERIA_ERFE = 208
293+
| Sidt -- ^ USCRIPT_SIDETIC = 209
294+
| Tayo -- ^ USCRIPT_TAI_YO = 210
295+
| Tols -- ^ USCRIPT_TOLONG_SIKI = 211
296+
| Hntl -- ^ USCRIPT_TRADITIONAL_HAN_WITH_LATIN = 212
297+
deriving (Bounded, Enum, Eq, Ord, Show)

unicode-data-names/test/ICU/NamesSpec.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ spec = do
5757
#endif
5858
where
5959
ourUnicodeVersion = versionBranch U.unicodeVersion
60+
theirUnicodeVersion = take 3 (versionBranch ICU.unicodeVersion)
61+
versionMismatch = ourUnicodeVersion /= theirUnicodeVersion
6062
showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c)
6163

6264
-- There is no feature to display warnings other than `trace`, so
@@ -85,6 +87,8 @@ spec = do
8587
-- Unicode version mismatch: char is not mapped in one of the libs:
8688
-- add warning.
8789
| ageMismatch c = acc{warnings=c : warnings acc}
90+
-- Unicode version mismatch
91+
| versionMismatch = acc{warnings=c : warnings acc}
8892
-- Error
8993
| otherwise =
9094
let !msg = mconcat

unicode-data-scripts/test/ICU/ScriptsSpec.hs

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@ import Data.Char (toUpper, ord)
88
import Data.Foldable (traverse_)
99
import qualified Data.List as L
1010
import qualified Data.List.NonEmpty as NE
11-
import Data.Maybe (isJust)
1211
import Data.Version (versionBranch, showVersion)
13-
import Debug.Trace (traceM)
12+
import Debug.Trace (trace, traceM)
1413
import Numeric (showHex)
1514
import Test.Hspec ( Spec, it, expectationFailure, shouldSatisfy )
1615

@@ -20,9 +19,19 @@ import qualified Unicode.Char.General.Scripts as S
2019

2120
spec :: Spec
2221
spec = do
23-
let icuScripts = (\s -> (ICU.scriptShortName s, s)) <$> [minBound..maxBound]
2422
it "scriptShortName"
25-
let check = isJust . (`lookup` icuScripts) . S.scriptShortName
23+
let check s = case toIcuScript s of
24+
Just _ -> True
25+
Nothing
26+
| versionMismatch -> trace (mconcat
27+
[ "[WARNING] Cannot test scriptShortName for "
28+
, show s
29+
, ": incompatible ICU version ("
30+
, showVersion ICU.unicodeVersion
31+
, " /= "
32+
, showVersion S.unicodeVersion
33+
, ")." ]) True
34+
| otherwise -> False
2635
in traverse_ (`shouldSatisfy` check) [minBound..maxBound]
2736
it "script"
2837
let check c
@@ -48,10 +57,23 @@ spec = do
4857
let {
4958
check s =
5059
case lookup (S.scriptShortName s) icuScripts of
51-
Nothing -> error ("Cannot convert script: " ++ show s)
60+
Nothing
61+
| ourUnicodeVersion > theirUnicodeVersion
62+
-> traceM . mconcat $
63+
[ "[WARNING] Cannot convert script "
64+
, show s
65+
, ": incompatible ICU version ("
66+
, showVersion ICU.unicodeVersion
67+
, " /= "
68+
, showVersion S.unicodeVersion
69+
, "). "
70+
, "Max supported ICU script:"
71+
, show ICU.maxSupportedScript ]
72+
| otherwise -> error ("Cannot convert script: " ++ show s)
5273
Just s'
5374
| def == defRef -> pure ()
54-
| ourUnicodeVersion /= theirUnicodeVersion -> traceM . mconcat $
75+
| ourUnicodeVersion /= theirUnicodeVersion
76+
-> traceM . mconcat $
5577
[ "[WARNING] Cannot test "
5678
, show s
5779
, ": incompatible ICU version ("
@@ -106,3 +128,5 @@ spec = do
106128
theirUnicodeVersion = take 3 (versionBranch ICU.unicodeVersion)
107129
showCodePoint c = ("U+" ++) . fmap toUpper $ showHex (ord c) ""
108130
versionMismatch = ourUnicodeVersion /= theirUnicodeVersion
131+
icuScripts = (\s -> (ICU.scriptShortName s, s)) <$> [minBound..maxBound]
132+
toIcuScript = (`lookup` icuScripts) . S.scriptShortName

unicode-data/test/ICU/CharSpec.hs

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ module ICU.CharSpec
55
) where
66

77
import Control.Applicative (Alternative(..))
8+
import Data.Bits (Bits(..))
9+
import qualified Data.Char as Char
810
import Data.Foldable (traverse_)
911
import Data.Version (showVersion, versionBranch)
1012
import Numeric (showHex)
@@ -35,6 +37,12 @@ spec = do
3537
ourUnicodeVersion = versionBranch U.unicodeVersion
3638
theirUnicodeVersion = versionBranch ICU.unicodeVersion
3739
showCodePoint c = ("U+" ++) . fmap U.toUpper . showHex (U.ord c)
40+
-- Check whether exactly one of the two libraries (ours or ICU) reports the character as unassigned.
41+
isUnassigned c = (U.generalCategory c == U.NotAssigned)
42+
`xor` (ICU.toGeneralCategory (ICU.charType c) == Char.NotAssigned)
43+
-- Check whether the two libraries (ours and ICU) disagree on the character's general category
44+
hasDifferentCategory c = fromEnum (U.generalCategory c)
45+
/= fromEnum (ICU.toGeneralCategory (ICU.charType c))
3846

3947
-- There is no feature to display warnings other than `trace`, so
4048
-- hack our own:
@@ -61,8 +69,11 @@ spec = do
6169
| n == nRef = acc
6270
-- Unicode version mismatch: char is not mapped in one of the libs:
6371
-- add warning.
64-
| age' > ourUnicodeVersion || age' > theirUnicodeVersion
65-
= acc{warnings=c : warnings acc}
72+
| age' > ourUnicodeVersion || age' > theirUnicodeVersion ||
73+
isUnassigned c
74+
= acc{warnings=(c, Unassigned) : warnings acc}
75+
| hasDifferentCategory c
76+
= acc{warnings=(c, CategoryChange) : warnings acc}
6677
-- Error
6778
| otherwise =
6879
let !msg = mconcat
@@ -75,14 +86,18 @@ spec = do
7586
!nRef = fRef c
7687
age = ICU.charAge c
7788
age' = take 3 (versionBranch age)
78-
mkWarning c = it (showCodePoint c "") . pendingWith $ mconcat
89+
mkWarning (c, reason) = it (showCodePoint c "") . pendingWith $ mconcat
7990
[ "Incompatible ICU Unicode version: expected "
8091
, showVersion U.unicodeVersion
8192
, ", got: "
8293
, showVersion ICU.unicodeVersion
83-
, " (ICU character age is: "
84-
, showVersion (ICU.charAge c)
85-
, ")" ]
94+
, case reason of
95+
Unassigned -> mconcat
96+
[ " (ICU character age is: "
97+
, showVersion (ICU.charAge c)
98+
, ")" ]
99+
CategoryChange -> " (different general category)"
100+
]
86101

87102
-- | Helper to compare our GeneralCategory to 'Data.Char.GeneralCategory'.
88103
data GeneralCategory = forall c. (Show c, Enum c) => GeneralCategory c
@@ -93,6 +108,12 @@ instance Show GeneralCategory where
93108
instance Eq GeneralCategory where
94109
GeneralCategory a == GeneralCategory b = fromEnum a == fromEnum b
95110

111+
data MismatchReason
112+
= Unassigned
113+
| CategoryChange
114+
96115
-- | Warning accumulator
97-
data Acc = Acc { warnings :: ![Char], firstError :: !(Maybe String) }
116+
data Acc = Acc
117+
{ warnings :: ![(Char, MismatchReason)]
118+
, firstError :: !(Maybe String) }
98119

0 commit comments

Comments
 (0)