Skip to content

Commit b983449

Browse files
committed
ICU-21821 un-hardcode scx set in icuexportdata
1 parent d02d43b commit b983449

File tree

5 files changed

+109
-85
lines changed

5 files changed

+109
-85
lines changed

icu4c/source/common/uchar.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,33 @@ uscript_getScriptExtensions(UChar32 c,
616616
return length;
617617
}
618618

619+
namespace {
620+
621+
UBool U_CALLCONV
622+
_scxRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
623+
// From u_getUnicodeProperties(start, 0).
624+
uint32_t vecWord = propsVectors[value]; // vecIndex=value, column 0
625+
uint32_t scriptX = vecWord & UPROPS_SCRIPT_X_MASK;
626+
if (scriptX >= UPROPS_SCRIPT_X_WITH_COMMON) {
627+
// Code points start..end have Script_Extensions.
628+
const USetAdder* sa = static_cast<const USetAdder*>(context);
629+
sa->addRange(sa->set, start, end);
630+
}
631+
(void) value;
632+
return true;
633+
}
634+
635+
}
636+
637+
// for icuexportdata
638+
U_CAPI void U_EXPORT2
639+
uprv_addScriptExtensionsCodePoints(const USetAdder *sa, UErrorCode *pErrorCode) {
640+
if(U_FAILURE(*pErrorCode)) {
641+
return;
642+
}
643+
utrie2_enum(&propsVectorsTrie, nullptr, _scxRange, sa);
644+
}
645+
619646
U_CAPI UBlockCode U_EXPORT2
620647
ublock_getCode(UChar32 c) {
621648
// We store Block values indexed by the code point shifted right 4 bits

icu4c/source/common/uprops.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -500,6 +500,10 @@ ublock_addPropertyStarts(const USetAdder *sa, UErrorCode &errorCode);
500500
uprv_getInclusions(const USetAdder *sa, UErrorCode *pErrorCode);
501501
*/
502502

503+
/**
 * Passes to sa->addRange() every code point range whose properties-vector
 * script field marks it as having Script_Extensions data.
 * Returns without doing anything if *pErrorCode is already a failure.
 *
 * @param sa         set adder whose addRange callback and set are used
 * @param pErrorCode in/out ICU error code; only read by this function
 * @internal for icuexportdata
 */
504+
U_CAPI void U_EXPORT2
505+
uprv_addScriptExtensionsCodePoints(const USetAdder *sa, UErrorCode *pErrorCode);
506+
503507
// TODO: Move this into a different header file (udataswp.h? new unames.h?) so that uprops.h
504508
// need not be C-compatible any more.
505509
/**

icu4c/source/test/intltest/ucdtest.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include "testutil.h"
2222
#include "uparse.h"
2323
#include "ucdtest.h"
24+
#include "uprops.h"
2425
#include "usettest.h"
2526

2627
#include <iostream>
@@ -88,6 +89,7 @@ void UnicodeTest::runIndexedTest( int32_t index, UBool exec, const char* &name,
8889
TESTCASE_AUTO(TestPropertiesUsingPpucd);
8990
TESTCASE_AUTO(TestIDStatus);
9091
TESTCASE_AUTO(TestIDType);
92+
TESTCASE_AUTO(TestScriptExtensionsCodePoints);
9193
TESTCASE_AUTO_END;
9294
}
9395

@@ -1317,3 +1319,47 @@ void UnicodeTest::TestIDType() {
13171319
notNFKC.size() + di.size() + dep.size() + notChar.size(),
13181320
allExclusive.size());
13191321
}
1322+
1323+
namespace {
1324+
1325+
void U_CALLCONV
1326+
set_add(USet *set, UChar32 c) {
1327+
UnicodeSet::fromUSet(set)->add(c);
1328+
}
1329+
1330+
void U_CALLCONV
1331+
set_addRange(USet *set, UChar32 start, UChar32 end) {
1332+
UnicodeSet::fromUSet(set)->add(start, end);
1333+
}
1334+
1335+
}
1336+
1337+
void UnicodeTest::TestScriptExtensionsCodePoints() {
1338+
IcuTestErrorCode errorCode(*this, "TestScriptExtensionsCodePoints()");
1339+
UnicodeSet scxCPs;
1340+
USetAdder sa = {
1341+
scxCPs.toUSet(),
1342+
set_add,
1343+
set_addRange,
1344+
nullptr, // don't need addString,
1345+
nullptr, // don't need remove()
1346+
nullptr // don't need removeRange()
1347+
};
1348+
uprv_addScriptExtensionsCodePoints(&sa, errorCode);
1349+
assertSuccess("uprv_addScriptExtensionsCodePoints", errorCode);
1350+
1351+
UnicodeSet mostAssigned(u"[[^[:C:][:Unified_Ideograph:]][:Cf:]]", errorCode);
1352+
assertSuccess("mostAssigned", errorCode);
1353+
1354+
UnicodeSet expected;
1355+
UScriptCode scripts[100];
1356+
for (auto c : mostAssigned.codePoints()) {
1357+
int32_t length = uscript_getScriptExtensions(c, scripts, UPRV_LENGTHOF(scripts), errorCode);
1358+
if (length != 1 || uscript_getScript(c, errorCode) != scripts[0]) {
1359+
expected.add(c);
1360+
}
1361+
}
1362+
assertSuccess("collect expected", errorCode);
1363+
1364+
assertTrue("scxCPs == expected", scxCPs == expected);
1365+
}

icu4c/source/test/intltest/ucdtest.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ class UnicodeTest: public IntlTest {
5555
void TestPropertiesUsingPpucd();
5656
void TestIDStatus();
5757
void TestIDType();
58+
void TestScriptExtensionsCodePoints();
5859

5960
private:
6061

icu4c/source/tools/icuexportdata/icuexportdata.cpp

Lines changed: 31 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "unicode/ucharstriebuilder.h"
2727
#include "ucase.h"
2828
#include "unicode/normalizer2.h"
29+
#include "uprops.h"
2930
#include "normalizer2impl.h"
3031
#include "writesrc.h"
3132

@@ -46,90 +47,6 @@ int16_t DATAEXPORT_SCRIPT_X_WITH_COMMON = 0x0400;
4647
int16_t DATAEXPORT_SCRIPT_X_WITH_INHERITED = 0x0800;
4748
int16_t DATAEXPORT_SCRIPT_X_WITH_OTHER = 0x0c00;
4849

49-
// TODO(ICU-21821): Replace this with a call to a library function
50-
// This is an array of all code points with explicit scx values, and can be generated the quick and dirty
51-
// way with this script:
52-
//
53-
// # <ScriptExtensions.txt python script.py
54-
//
55-
// import sys
56-
// for line in sys.stdin:
57-
// line = line.strip()
58-
// if len(line) == 0 or line.startswith("#"):
59-
// continue
60-
// entry = line.split(" ")[0]
61-
// # Either it is a range
62-
// if ".." in entry:
63-
// split = entry.split("..")
64-
// start = int(split[0], 16)
65-
// end = int(split[1], 16)
66-
// # +
67-
// for ch in range(start, end + 1):
68-
// print("0x%04x, " % ch, end="")
69-
// # or a single code point
70-
// else:
71-
// print("0x%s, " % entry.lower(), end="")
72-
73-
int32_t scxCodePoints[] = {
74-
0x00b7, 0x02bc, 0x02c7, 0x02c9, 0x02ca, 0x02cb, 0x02cd, 0x02d7, 0x02d9, 0x0300, 0x0301, 0x0302,
75-
0x0303, 0x0304, 0x0305, 0x0306, 0x0307, 0x0308, 0x0309, 0x030a, 0x030b, 0x030c, 0x030d, 0x030e,
76-
0x0310, 0x0311, 0x0313, 0x0320, 0x0323, 0x0324, 0x0325, 0x032d, 0x032e, 0x0330, 0x0331, 0x0342,
77-
0x0345, 0x0358, 0x035e, 0x0363, 0x0364, 0x0365, 0x0366, 0x0367, 0x0368, 0x0369, 0x036a, 0x036b,
78-
0x036c, 0x036d, 0x036e, 0x036f, 0x0374, 0x0375, 0x0483, 0x0484, 0x0485, 0x0486, 0x0487, 0x0589,
79-
0x060c, 0x061b, 0x061c, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
80-
0x0652, 0x0653, 0x0654, 0x0655, 0x0660, 0x0661, 0x0662, 0x0663, 0x0664, 0x0665, 0x0666, 0x0667,
81-
0x0668, 0x0669, 0x0670, 0x06d4, 0x0951, 0x0952, 0x0964, 0x0965, 0x0966, 0x0967, 0x0968, 0x0969,
82-
0x096a, 0x096b, 0x096c, 0x096d, 0x096e, 0x096f, 0x09e6, 0x09e7, 0x09e8, 0x09e9, 0x09ea, 0x09eb,
83-
0x09ec, 0x09ed, 0x09ee, 0x09ef, 0x0a66, 0x0a67, 0x0a68, 0x0a69, 0x0a6a, 0x0a6b, 0x0a6c, 0x0a6d,
84-
0x0a6e, 0x0a6f, 0x0ae6, 0x0ae7, 0x0ae8, 0x0ae9, 0x0aea, 0x0aeb, 0x0aec, 0x0aed, 0x0aee, 0x0aef,
85-
0x0be6, 0x0be7, 0x0be8, 0x0be9, 0x0bea, 0x0beb, 0x0bec, 0x0bed, 0x0bee, 0x0bef, 0x0bf0, 0x0bf1,
86-
0x0bf2, 0x0bf3, 0x0ce6, 0x0ce7, 0x0ce8, 0x0ce9, 0x0cea, 0x0ceb, 0x0cec, 0x0ced, 0x0cee, 0x0cef,
87-
0x1040, 0x1041, 0x1042, 0x1043, 0x1044, 0x1045, 0x1046, 0x1047, 0x1048, 0x1049, 0x10fb, 0x16eb,
88-
0x16ec, 0x16ed, 0x1735, 0x1736, 0x1802, 0x1803, 0x1805, 0x1cd0, 0x1cd1, 0x1cd2, 0x1cd3, 0x1cd4,
89-
0x1cd5, 0x1cd6, 0x1cd7, 0x1cd8, 0x1cd9, 0x1cda, 0x1cdb, 0x1cdc, 0x1cdd, 0x1cde, 0x1cdf, 0x1ce0,
90-
0x1ce1, 0x1ce2, 0x1ce3, 0x1ce4, 0x1ce5, 0x1ce6, 0x1ce7, 0x1ce8, 0x1ce9, 0x1cea, 0x1ceb, 0x1cec,
91-
0x1ced, 0x1cee, 0x1cef, 0x1cf0, 0x1cf1, 0x1cf2, 0x1cf3, 0x1cf4, 0x1cf5, 0x1cf6, 0x1cf7, 0x1cf8,
92-
0x1cf9, 0x1cfa, 0x1dc0, 0x1dc1, 0x1df8, 0x1dfa, 0x202f, 0x204f, 0x205a, 0x205d, 0x20f0, 0x2e17,
93-
0x2e30, 0x2e31, 0x2e3c, 0x2e41, 0x2e43, 0x2ff0, 0x2ff1, 0x2ff2, 0x2ff3, 0x2ff4, 0x2ff5, 0x2ff6,
94-
0x2ff7, 0x2ff8, 0x2ff9, 0x2ffa, 0x2ffb, 0x2ffc, 0x2ffd, 0x2ffe, 0x2fff, 0x3001, 0x3002, 0x3003,
95-
0x3006, 0x3008, 0x3009, 0x300a, 0x300b, 0x300c, 0x300d, 0x300e, 0x300f, 0x3010, 0x3011, 0x3013,
96-
0x3014, 0x3015, 0x3016, 0x3017, 0x3018, 0x3019, 0x301a, 0x301b, 0x301c, 0x301d, 0x301e, 0x301f,
97-
0x302a, 0x302b, 0x302c, 0x302d, 0x3030, 0x3031, 0x3032, 0x3033, 0x3034, 0x3035, 0x3037, 0x303c,
98-
0x303d, 0x303e, 0x303f, 0x3099, 0x309a, 0x309b, 0x309c, 0x30a0, 0x30fb, 0x30fc, 0x3190, 0x3191,
99-
0x3192, 0x3193, 0x3194, 0x3195, 0x3196, 0x3197, 0x3198, 0x3199, 0x319a, 0x319b, 0x319c, 0x319d,
100-
0x319e, 0x319f, 0x31c0, 0x31c1, 0x31c2, 0x31c3, 0x31c4, 0x31c5, 0x31c6, 0x31c7, 0x31c8, 0x31c9,
101-
0x31ca, 0x31cb, 0x31cc, 0x31cd, 0x31ce, 0x31cf, 0x31d0, 0x31d1, 0x31d2, 0x31d3, 0x31d4, 0x31d5,
102-
0x31d6, 0x31d7, 0x31d8, 0x31d9, 0x31da, 0x31db, 0x31dc, 0x31dd, 0x31de, 0x31df, 0x31e0, 0x31e1,
103-
0x31e2, 0x31e3, 0x31e4, 0x31e5, 0x31ef, 0x3220, 0x3221, 0x3222, 0x3223, 0x3224, 0x3225, 0x3226,
104-
0x3227, 0x3228, 0x3229, 0x322a, 0x322b, 0x322c, 0x322d, 0x322e, 0x322f, 0x3230, 0x3231, 0x3232,
105-
0x3233, 0x3234, 0x3235, 0x3236, 0x3237, 0x3238, 0x3239, 0x323a, 0x323b, 0x323c, 0x323d, 0x323e,
106-
0x323f, 0x3240, 0x3241, 0x3242, 0x3243, 0x3244, 0x3245, 0x3246, 0x3247, 0x3280, 0x3281, 0x3282,
107-
0x3283, 0x3284, 0x3285, 0x3286, 0x3287, 0x3288, 0x3289, 0x328a, 0x328b, 0x328c, 0x328d, 0x328e,
108-
0x328f, 0x3290, 0x3291, 0x3292, 0x3293, 0x3294, 0x3295, 0x3296, 0x3297, 0x3298, 0x3299, 0x329a,
109-
0x329b, 0x329c, 0x329d, 0x329e, 0x329f, 0x32a0, 0x32a1, 0x32a2, 0x32a3, 0x32a4, 0x32a5, 0x32a6,
110-
0x32a7, 0x32a8, 0x32a9, 0x32aa, 0x32ab, 0x32ac, 0x32ad, 0x32ae, 0x32af, 0x32b0, 0x32c0, 0x32c1,
111-
0x32c2, 0x32c3, 0x32c4, 0x32c5, 0x32c6, 0x32c7, 0x32c8, 0x32c9, 0x32ca, 0x32cb, 0x32ff, 0x3358,
112-
0x3359, 0x335a, 0x335b, 0x335c, 0x335d, 0x335e, 0x335f, 0x3360, 0x3361, 0x3362, 0x3363, 0x3364,
113-
0x3365, 0x3366, 0x3367, 0x3368, 0x3369, 0x336a, 0x336b, 0x336c, 0x336d, 0x336e, 0x336f, 0x3370,
114-
0x337b, 0x337c, 0x337d, 0x337e, 0x337f, 0x33e0, 0x33e1, 0x33e2, 0x33e3, 0x33e4, 0x33e5, 0x33e6,
115-
0x33e7, 0x33e8, 0x33e9, 0x33ea, 0x33eb, 0x33ec, 0x33ed, 0x33ee, 0x33ef, 0x33f0, 0x33f1, 0x33f2,
116-
0x33f3, 0x33f4, 0x33f5, 0x33f6, 0x33f7, 0x33f8, 0x33f9, 0x33fa, 0x33fb, 0x33fc, 0x33fd, 0x33fe,
117-
0xa66f, 0xa700, 0xa701, 0xa702, 0xa703, 0xa704, 0xa705, 0xa706, 0xa707, 0xa830, 0xa831, 0xa832,
118-
0xa833, 0xa834, 0xa835, 0xa836, 0xa837, 0xa838, 0xa839, 0xa8f1, 0xa8f3, 0xa92e, 0xa9cf, 0xfd3e,
119-
0xfd3f, 0xfdf2, 0xfdfd, 0xfe45, 0xfe46, 0xff61, 0xff62, 0xff63, 0xff64, 0xff65, 0xff70, 0xff9e,
120-
0xff9f, 0x10100, 0x10101, 0x10102, 0x10107, 0x10108, 0x10109, 0x1010a, 0x1010b, 0x1010c, 0x1010d,
121-
0x1010e, 0x1010f, 0x10110, 0x10111, 0x10112, 0x10113, 0x10114, 0x10115, 0x10116, 0x10117, 0x10118,
122-
0x10119, 0x1011a, 0x1011b, 0x1011c, 0x1011d, 0x1011e, 0x1011f, 0x10120, 0x10121, 0x10122, 0x10123,
123-
0x10124, 0x10125, 0x10126, 0x10127, 0x10128, 0x10129, 0x1012a, 0x1012b, 0x1012c, 0x1012d, 0x1012e,
124-
0x1012f, 0x10130, 0x10131, 0x10132, 0x10133, 0x10137, 0x10138, 0x10139, 0x1013a, 0x1013b, 0x1013c,
125-
0x1013d, 0x1013e, 0x1013f, 0x102e0, 0x102e1, 0x102e2, 0x102e3, 0x102e4, 0x102e5, 0x102e6, 0x102e7,
126-
0x102e8, 0x102e9, 0x102ea, 0x102eb, 0x102ec, 0x102ed, 0x102ee, 0x102ef, 0x102f0, 0x102f1, 0x102f2,
127-
0x102f3, 0x102f4, 0x102f5, 0x102f6, 0x102f7, 0x102f8, 0x102f9, 0x102fa, 0x102fb, 0x10af2, 0x11301,
128-
0x11303, 0x1133b, 0x1133c, 0x11fd0, 0x11fd1, 0x11fd3, 0x1bca0, 0x1bca1, 0x1bca2, 0x1bca3, 0x1d360,
129-
0x1d361, 0x1d362, 0x1d363, 0x1d364, 0x1d365, 0x1d366, 0x1d367, 0x1d368, 0x1d369, 0x1d36a, 0x1d36b,
130-
0x1d36c, 0x1d36d, 0x1d36e, 0x1d36f, 0x1d370, 0x1d371, 0x1f250, 0x1f251,
131-
};
132-
13350
void handleError(ErrorCode& status, int line, const char* context) {
13451
if (status.isFailure()) {
13552
std::cerr << "Error[" << line << "]: " << context << ": " << status.errorName() << std::endl;
@@ -374,6 +291,34 @@ void dumpGeneralCategoryMask(FILE* f) {
374291
fprintf(f, "]\n");
375292
}
376293

294+
namespace {
295+
296+
void U_CALLCONV
297+
set_add(USet *set, UChar32 c) {
298+
UnicodeSet::fromUSet(set)->add(c);
299+
}
300+
301+
void U_CALLCONV
302+
set_addRange(USet *set, UChar32 start, UChar32 end) {
303+
UnicodeSet::fromUSet(set)->add(start, end);
304+
}
305+
306+
}
307+
308+
/**
 * Builds a UnicodeSet from whatever uprv_addScriptExtensionsCodePoints() adds
 * through a USetAdder that supplies only the add and addRange callbacks.
 *
 * @param errorCode passed through to uprv_addScriptExtensionsCodePoints();
 *                  not inspected here, so the caller must check it
 * @return the collected set (left empty if nothing was added)
 */
UnicodeSet getScriptExtensionsCodePoints(IcuToolErrorCode &errorCode) {
    UnicodeSet collected;
    USetAdder adder = {
        collected.toUSet(),
        set_add,
        set_addRange,
        nullptr,  // addString is not needed
        nullptr,  // remove() is not needed
        nullptr   // removeRange() is not needed
    };
    uprv_addScriptExtensionsCodePoints(&adder, errorCode);
    return collected;
}
321+
377322
void dumpScriptExtensions(FILE* f) {
378323
IcuToolErrorCode status("icuexportdata: dumpScriptExtensions");
379324

@@ -402,7 +347,8 @@ void dumpScriptExtensions(FILE* f) {
402347

403348
// The sc/scx companion array is an array of arrays (of script codes)
404349
fputs("script_code_array = [\n", f);
405-
for(const UChar32 cp : scxCodePoints) {
350+
UnicodeSet scxCodePoints = getScriptExtensionsCodePoints(status);
351+
for(const UChar32 cp : scxCodePoints.codePoints()) {
406352
// Get the Script value
407353
uint32_t scVal = umutablecptrie_get(builder.getAlias(), cp);
408354
// Get the Script_Extensions value (array of Script codes)

0 commit comments

Comments
 (0)