Skip to content

Commit a247a95

Browse files
committed
core: Add isNoncharacter
1 parent aa3a933 commit a247a95

File tree

6 files changed

+55
-1
lines changed

6 files changed

+55
-1
lines changed

experimental/icu/cbits/icu.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,14 @@ void __hs_u_getUnicodeVersion(UVersionInfo versionArray) {
77
u_getUnicodeVersion(versionArray);
88
}
99

10+
/*******************************************************************************
11+
* Properties
12+
******************************************************************************/
13+
14+
/* FFI shim: forwards to ICU's u_hasBinaryProperty and hands the answer back
 * as a C `bool`.
 *
 *   c     - code point to query
 *   which - ICU binary property selector
 */
bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which) {
    const UBool has_property = u_hasBinaryProperty(c, which);
    return has_property;
}
17+
1018
/*******************************************************************************
1119
* Names
1220
******************************************************************************/

experimental/icu/cbits/icu.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77

88
void __hs_u_getUnicodeVersion(UVersionInfo versionArray);
99

10+
/*******************************************************************************
11+
* Properties
12+
******************************************************************************/
13+
14+
/* Wrapper around ICU's u_hasBinaryProperty, exported for the Haskell FFI binding. */
bool __hs_u_hasBinaryProperty(UChar32 c, UProperty which);
15+
1016
/*******************************************************************************
1117
* Names
1218
******************************************************************************/

experimental/icu/icu.cabal

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,4 +73,4 @@ library
7373
pkgconfig-depends:
7474
icu-uc >= 72.1
7575
build-tool-depends:
76-
c2hs:c2hs
76+
c2hs:c2hs >= 0.28.8

experimental/icu/lib/ICU/Char.chs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ module ICU.Char
1515
, UGeneralCategory(..)
1616
, toGeneralCategory
1717
, charType
18+
, isNoncharacter
1819
) where
1920

2021
#include <unicode/uchar.h>
@@ -134,3 +135,21 @@ toGeneralCategory = \case
134135
OtherSymbol -> Char.OtherSymbol
135136
InitialPunctuation -> Char.InitialQuote
136137
FinalPunctuation -> Char.FinalQuote
138+
139+
-- Haskell enumeration for the ICU @UProperty@ selectors used in this module.
-- Only @UCHAR_NONCHARACTER_CODE_POINT@ is mapped, as 'NoncharacterCodePoint'.
{#enum define UProperty
    { UCHAR_NONCHARACTER_CODE_POINT as NoncharacterCodePoint }
    deriving (Bounded, Eq, Ord, Show) #}
143+
144+
-- Raw binding to the C shim @__hs_u_hasBinaryProperty@ declared in @icu.h@.
--
-- The first argument is the code point; the second is the numeric value of
-- the property selector.
-- NOTE(review): the C prototype takes a @UProperty@ enum whereas this import
-- passes an 'Int'; confirm the argument widths agree on supported platforms.
foreign import ccall safe "icu.h __hs_u_hasBinaryProperty"
    u_hasBinaryProperty :: UChar32 -> Int -> Bool
146+
147+
-- hasBinaryProperty :: UChar32 -> Int -> Bool
148+
-- hasBinaryProperty = {#call pure u_hasBinaryProperty as __hs_u_hasBinaryProperty#}
149+
-- {#fun pure u_hasBinaryProperty as hasBinaryProperty
150+
-- {`UChar32', `Int'} -> `Bool' #}
151+
152+
-- | Returns 'True' when ICU reports the @UCHAR_NONCHARACTER_CODE_POINT@
-- binary property for the given character.
isNoncharacter :: Char -> Bool
isNoncharacter c = u_hasBinaryProperty codePoint property
  where
    -- Code point of the character, converted to the FFI argument type.
    codePoint = fromIntegral (ord c)
    -- Numeric selector for the noncharacter property.
    property = fromEnum NoncharacterCodePoint

unicode-data/lib/Unicode/Char/General.hs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ module Unicode.Char.General
2424
, isSeparator
2525
, isSymbol
2626
, isWhiteSpace
27+
, isNoncharacter
2728
, isLetter
2829
, isSpace
2930
-- ** Re-export
@@ -414,6 +415,22 @@ isSymbol :: Char -> Bool
414415
isSymbol c = UC.MathSymbol <= gc && gc <= UC.OtherSymbol
415416
where gc = UC.generalCategory c
416417

418+
-- | Tests whether a code point is a /noncharacter/.
--
-- Noncharacters are code points permanently reserved for internal use; see
-- definition D14 in section
-- [3.4 “Characters and Encoding”](https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G2212)
-- of the Unicode Standard.
--
-- They are exactly:
--
-- * the contiguous range @U+FDD0..U+FDEF@, and
-- * the last two code points of every plane, i.e. @U+nFFFE@ and @U+nFFFF@
--   for @n@ from 0 to 10₁₆.
--
-- @since 0.6.0
isNoncharacter :: Char -> Bool
isNoncharacter c
    | c >= '\xFDD0' && c <= '\xFDEF' = True
    | otherwise = planeOffset == 0xFFFE || planeOffset == 0xFFFF
    where
    -- Position of the code point within its plane (low 16 bits).
    planeOffset = ord c .&. 0xFFFF
433+
417434
-- | Returns 'True' for alphabetic Unicode characters (lower-case, upper-case
418435
-- and title-case letters, plus letters of caseless scripts and modifiers
419436
-- letters).

unicode-data/test/ICU/CharSpec.hs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ spec = do
2626
"charType"
2727
(GeneralCategory . U.generalCategory)
2828
(GeneralCategory . ICU.toGeneralCategory . ICU.charType)
29+
checkAndGatherErrors
30+
"isNoncharacter"
31+
(GeneralCategory . U.isNoncharacter)
32+
(GeneralCategory . ICU.isNoncharacter)
2933
-- TODO: other functions
3034
where
3135
ourUnicodeVersion = versionBranch U.unicodeVersion

0 commit comments

Comments
 (0)