Skip to content

Commit 8f60d05

Browse files
committed
ICU-23222 Enhance unit alias handling
See #3675
1 parent 122e60c commit 8f60d05

File tree

8 files changed

+475
-46
lines changed

8 files changed

+475
-46
lines changed

icu4c/source/i18n/measunit_extra.cpp

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ enum PowerPart {
100100
// "fluid-ounce-imperial".
101101
constexpr int32_t kSimpleUnitOffset = 512;
102102

103+
// Trie value offset for aliases, e.g. "portion" replaced by "part"
104+
constexpr int32_t kAliasOffset = 51200; // This will give a very big space for the units ids.
105+
103106
const struct UnitPrefixStrings {
104107
const char* const string;
105108
UMeasurePrefix value;
@@ -255,6 +258,58 @@ class SimpleUnitIdentifiersSink : public icu::ResourceSink {
255258
int32_t outIndex;
256259
};
257260

261+
class UnitAliasesSink : public icu::ResourceSink {
262+
public:
263+
/**
264+
* Constructor.
265+
* @param unitAliases The output vector of unit alias identifiers (CharString).
266+
* @param unitReplacements The output vector of replacements for the unit aliases (CharString).
267+
*/
268+
explicit UnitAliasesSink(MaybeStackVector<CharString> &unitAliases,
269+
MaybeStackVector<CharString> &unitReplacements)
270+
: unitAliases(unitAliases), unitReplacements(unitReplacements) {}
271+
272+
/**
273+
* Adds the unit alias key and its replacement to the unitAliases and unitReplacements vectors.
274+
* @param key The unit alias identifier (e.g., "meter-and-liter").
275+
* @param value Should be a ResourceTable value containing the replacement,
276+
* when ures_getAllChildrenWithFallback() is called correctly for this sink.
277+
* @param noFallback Ignored.
278+
* @param status The standard ICU error code output parameter.
279+
*/
280+
void put(const char *key, ResourceValue &value, UBool /*noFallback*/,
281+
UErrorCode &status) override {
282+
if (U_FAILURE(status)) return;
283+
284+
// Add the unit alias key to the unitAliases vector
285+
int32_t keyLen = static_cast<int32_t>(uprv_strlen(key));
286+
unitAliases.emplaceBackAndCheckErrorCode(status)->append(key, keyLen, status);
287+
if (U_FAILURE(status)) {
288+
return;
289+
}
290+
291+
// Find the replacement for this unit alias from the alias table resource.
292+
ResourceTable aliasTable = value.getTable(status);
293+
if (U_FAILURE(status)) {
294+
return;
295+
}
296+
297+
if (!aliasTable.findValue("replacement", value)) {
298+
status = U_MISSING_RESOURCE_ERROR;
299+
return;
300+
}
301+
302+
int32_t len;
303+
const char16_t *uReplacement = value.getString(len, status);
304+
unitReplacements.emplaceBackAndCheckErrorCode(status)->appendInvariantChars(uReplacement,
305+
len, status);
306+
}
307+
308+
private:
309+
MaybeStackVector<CharString> &unitAliases;
310+
MaybeStackVector<CharString> &unitReplacements;
311+
};
312+
258313
/**
259314
* A ResourceSink that collects information from `unitQuantities` in the `units`
260315
* resource to provide key->value lookups from base unit to category, as well as
@@ -321,6 +376,11 @@ class CategoriesSink : public icu::ResourceSink {
321376

322377
icu::UInitOnce gUnitExtrasInitOnce {};
323378

379+
// Array of unit aliases.
380+
MaybeStackVector<icu::CharString> gUnitAliases;
381+
// Array of replacements for the unit aliases.
382+
MaybeStackVector<icu::CharString> gUnitReplacements;
383+
324384
// Array of simple unit IDs.
325385
//
326386
// The array memory itself is owned by this pointer, but the individual char* in
@@ -453,6 +513,24 @@ void U_CALLCONV initUnitExtras(UErrorCode& status) {
453513
simpleUnitsCount, b, kSimpleUnitOffset);
454514
ures_getAllItemsWithFallback(unitsBundle.getAlias(), "convertUnits", identifierSink, status);
455515

516+
// Populate gUnitAliases and gUnitReplacements.
517+
LocalUResourceBundlePointer aliasBundle(ures_open(U_ICUDATA_ALIAS, "metadata", &status));
518+
if (U_FAILURE(status)) {
519+
return;
520+
}
521+
UnitAliasesSink aliasSink(gUnitAliases, gUnitReplacements);
522+
ures_getAllChildrenWithFallback(aliasBundle.getAlias(), "alias/unit", aliasSink, status);
523+
if (U_FAILURE(status)) {
524+
return;
525+
}
526+
527+
for (int32_t i = 0; i < gUnitAliases.length(); i++) {
528+
b.add(gUnitAliases[i]->data(), i + kAliasOffset, status);
529+
if (U_FAILURE(status)) {
530+
return;
531+
}
532+
}
533+
456534
// Build the CharsTrie
457535
// TODO: Use SLOW or FAST here?
458536
StringPiece result = b.buildStringPiece(USTRINGTRIE_BUILD_FAST, status);
@@ -479,8 +557,10 @@ class Token {
479557
this->fType = TYPE_INITIAL_COMPOUND_PART;
480558
} else if (fMatch < kSimpleUnitOffset) {
481559
this->fType = TYPE_POWER_PART;
482-
} else {
560+
} else if (fMatch < kAliasOffset) {
483561
this->fType = TYPE_SIMPLE_UNIT;
562+
} else {
563+
this->fType = TYPE_ALIAS;
484564
}
485565
}
486566

@@ -505,6 +585,7 @@ class Token {
505585
TYPE_POWER_PART,
506586
TYPE_SIMPLE_UNIT,
507587
TYPE_CONSTANT_DENOMINATOR,
588+
TYPE_ALIAS,
508589
};
509590

510591
// Calling getType() is invalid, resulting in an assertion failure, if Token
@@ -551,6 +632,11 @@ class Token {
551632
return fMatch - kSimpleUnitOffset;
552633
}
553634

635+
int32_t getAliasIndex() const {
636+
U_ASSERT(getType() == TYPE_ALIAS);
637+
return static_cast<int32_t>(fMatch - kAliasOffset);
638+
}
639+
554640
// TODO: Consider moving this to a separate utility class.
555641
// Utility function to parse a string into an unsigned long value.
556642
// The value must be a positive integer within the range [1, INT64_MAX].
@@ -673,6 +759,10 @@ class Parser {
673759
}
674760

675761
if (singleUnitOrConstant.isConstantDenominator()) {
762+
if (result.constantDenominator > 0) {
763+
status = kUnitIdentifierSyntaxError;
764+
return result;
765+
}
676766
result.constantDenominator = singleUnitOrConstant.getConstantDenominator();
677767
result.complexity = UMEASURE_UNIT_COMPOUND;
678768
continue;
@@ -728,6 +818,9 @@ class Parser {
728818
StringPiece fSource;
729819
BytesTrie fTrie;
730820

821+
// Storage for modified source string when aliases are expanded
822+
CharString fModifiedSource;
823+
731824
// Set to true when we've seen a "-per-" or a "per-", after which all units
732825
// are in the denominator. Until we find an "-and-", at which point the
733826
// identifier is invalid pending TODO(CLDR-13701).
@@ -830,6 +923,19 @@ class Parser {
830923
return {};
831924
}
832925

926+
// Handles the case where the alias replacement begins with "per-".
927+
// For example:
928+
// if the alias is "permeter" and the replacement is "per-meter".
929+
// NOTE: This case does not currently exist in CLDR, but this code anticipates possible future
930+
// additions.
931+
if (token.getType() == Token::TYPE_ALIAS) {
932+
processAlias(token, status);
933+
token = nextToken(status);
934+
if (U_FAILURE(status)) {
935+
return {};
936+
}
937+
}
938+
833939
fJustSawPer = false;
834940

835941
if (atStart) {
@@ -923,6 +1029,10 @@ class Parser {
9231029
singleUnitResult.index = token.getSimpleUnitIndex();
9241030
break;
9251031

1032+
case Token::TYPE_ALIAS:
1033+
processAlias(token, status);
1034+
break;
1035+
9261036
default:
9271037
status = kUnitIdentifierSyntaxError;
9281038
return {};
@@ -945,6 +1055,48 @@ class Parser {
9451055

9461056
return SingleUnitOrConstant::singleUnitValue(singleUnitResult);
9471057
}
1058+
1059+
private:
1060+
/**
1061+
* Helper function to process alias replacement.
1062+
*
1063+
* @param token The token of TYPE_ALIAS to process
1064+
* @param status ICU error code
1065+
*/
1066+
void processAlias(const Token &token, UErrorCode &status) {
1067+
if (U_FAILURE(status)) {
1068+
return;
1069+
}
1070+
1071+
auto aliasIndex = token.getAliasIndex();
1072+
if (aliasIndex < 0 || aliasIndex >= gUnitAliases.length() ||
1073+
aliasIndex >= gUnitReplacements.length()) {
1074+
status = kUnitIdentifierSyntaxError;
1075+
return;
1076+
}
1077+
1078+
auto replacement = gUnitReplacements[aliasIndex];
1079+
1080+
// Create new source string: replacement + remaining unparsed portion
1081+
fModifiedSource.clear();
1082+
fModifiedSource.append(replacement->data(), replacement->length(), status);
1083+
1084+
// Add the remaining unparsed portion of fSource which starts from fIndex
1085+
if (fIndex < fSource.length()) {
1086+
StringPiece remaining = fSource.substr(fIndex);
1087+
fModifiedSource.append(remaining.data(), remaining.length(), status);
1088+
}
1089+
1090+
if (U_FAILURE(status)) {
1091+
return;
1092+
}
1093+
1094+
// Update parser state with new source and reset index
1095+
fSource = StringPiece(fModifiedSource.data(), fModifiedSource.length());
1096+
fIndex = 0;
1097+
1098+
return;
1099+
}
9481100
};
9491101

9501102
// Sorting function wrapping SingleUnitImpl::compareTo for use with uprv_sortArray.

icu4c/source/i18n/number_longnames.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -438,6 +438,7 @@ void getMeasureData(const Locale &locale,
438438
subKey.append(unit.getType(), status);
439439
subKey.append("/", status);
440440

441+
// TODO(ICU-23226): Refactor LongNameHandler to use gUnitAliases and gUnitReplacements measunit_extra.cpp instead of local reasource bundle.
441442
// Check if unitSubType is an alias or not.
442443
LocalUResourceBundlePointer aliasBundle(ures_open(U_ICUDATA_ALIAS, "metadata", &status));
443444

icu4c/source/test/intltest/numbertest_api.cpp

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6063,27 +6063,85 @@ void NumberFormatterApiTest::formatUnitsAliases() {
60636063
std::unique_ptr<MeasureUnit> measureUnit;
60646064
const char * measureUnitString; // Only used if measureUnit is nullptr
60656065
const UnicodeString expectedFormat;
6066+
6067+
TestCase(std::unique_ptr<MeasureUnit> mu, const UnicodeString &expected)
6068+
: measureUnit(std::move(mu)), measureUnitString(nullptr), expectedFormat(expected) {}
6069+
TestCase(const char *muStr, const UnicodeString &expected)
6070+
: measureUnit(nullptr), measureUnitString(muStr), expectedFormat(expected) {}
60666071
} testCases[]{
6067-
// Aliases
6068-
{std::make_unique<MeasureUnit>(MeasureUnit::getMilligramOfglucosePerDeciliter()), nullptr, u"2 milligrams per deciliter"},
6069-
{std::make_unique<MeasureUnit>(MeasureUnit::getMilligramPerDeciliter()), nullptr, u"2 milligrams per deciliter"},
6070-
{std::make_unique<MeasureUnit>(MeasureUnit::getLiterPer100Kilometers()), nullptr,u"2 liters per 100 kilometers"},
6071-
{std::make_unique<MeasureUnit>(MeasureUnit::getPartPerMillion()), nullptr, u"2 parts per million"},
6072-
{std::make_unique<MeasureUnit>(MeasureUnit::getMillimeterOfMercury()), nullptr, u"2 millimeters of mercury"},
6073-
6074-
// Some replacements
6075-
{nullptr, "millimeter-ofhg", u"2 millimeters of mercury"},
6076-
{nullptr, "liter-per-100-kilometer", u"2 liters per 100 kilometers"},
6077-
{nullptr, "permillion", u"2 parts per million"},
6078-
{nullptr, "part-per-million", u"2 parts per million"},
6079-
{nullptr, "part-per-1e6", u"2 parts per million"},
6072+
// permillion
6073+
TestCase("permillion", u"2 parts per million"),
6074+
TestCase("part-per-million", u"2 parts per million"),
6075+
TestCase("portion-per-million", u"2 parts per million"),
6076+
TestCase("portion-per-1e6", u"2 parts per million"),
6077+
TestCase("part-per-1e6", u"2 parts per million"),
6078+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getPartPer1E6()), u"2 parts per million"),
6079+
6080+
// part-per-billion
6081+
TestCase("portion-per-1e9", u"2 parts per billion"),
6082+
TestCase("part-per-1e9", u"2 parts per billion"),
6083+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getPartPer1E9()), u"2 parts per billion"),
6084+
6085+
// pound-foot
6086+
TestCase("pound-foot", u"2 pound-force-feet"),
6087+
TestCase("pound-force-foot", u"2 pound-force-feet"),
6088+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getPoundFoot()), u"2 pound-force-feet"),
6089+
6090+
// pound-force
6091+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getPoundForce()), u"2 pounds of force"),
6092+
TestCase("pound-force", u"2 pounds of force"),
6093+
6094+
// pound-per-square-inch
6095+
TestCase("pound-per-square-inch", u"2 pounds-force per square inch"),
6096+
TestCase("pound-force-per-square-inch", u"2 pounds-force per square inch"),
6097+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getPoundPerSquareInch()), u"2 pounds-force per square inch"),
6098+
6099+
// millimeter-of-mercury
6100+
TestCase("millimeter-of-mercury", u"2 millimeters of mercury"),
6101+
TestCase("millimeter-ofhg", u"2 millimeters of mercury"),
6102+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getMillimeterOfMercury()), u"2 millimeters of mercury"),
6103+
6104+
// inch-hg
6105+
TestCase("inch-hg", u"2 inches of mercury"),
6106+
TestCase("inch-ofhg", u"2 inches of mercury"),
6107+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getInchHg()), u"2 inches of mercury"),
6108+
6109+
// liter-per-100kilometers
6110+
TestCase("liter-per-100kilometers", u"2 liters per 100 kilometers"),
6111+
TestCase("liter-per-100-kilometer", u"2 liters per 100 kilometers"),
6112+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getLiterPer100Kilometers()), u"2 liters per 100 kilometers"),
6113+
// meter-per-second-squared
6114+
TestCase("meter-per-second-squared", u"2 meters per second squared"),
6115+
TestCase("meter-per-square-second", u"2 meters per second squared"),
6116+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getMeterPerSecondSquared()), u"2 meters per second squared"),
6117+
6118+
// metric-ton
6119+
TestCase("metric-ton", u"2 metric tons"),
6120+
TestCase("tonne", u"2 metric tons"),
6121+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getMetricTon()), u"2 metric tons"),
6122+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getTonne()), u"2 metric tons"),
6123+
6124+
// milligram-per-deciliter
6125+
TestCase("milligram-per-deciliter", u"2 milligrams per deciliter"),
6126+
TestCase("milligram-ofglucose-per-deciliter", u"2 milligrams per deciliter"),
6127+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getMilligramPerDeciliter()), u"2 milligrams per deciliter"),
6128+
TestCase(std::make_unique<MeasureUnit>(MeasureUnit::getMilligramOfglucosePerDeciliter()), u"2 milligrams per deciliter"),
6129+
6130+
// Arbitrary
6131+
TestCase("meter-permillion", u"2 meter-parts per 1000000"),
6132+
TestCase("permillion-meter", u"2 parts per 1000000-meter"),
6133+
TestCase("tonne-per-second", u"2 metric tons per second"),
6134+
TestCase("part-per-1e6-per-100", u"expect exception"),
6135+
TestCase("permillion-per-100", u"expect exception"),
60806136
};
60816137

60826138
for (const auto &testCase : testCases) {
6083-
if (testCase.measureUnitString != nullptr &&
6084-
(uprv_strcmp("permillion", testCase.measureUnitString) == 0 ||
6085-
uprv_strcmp("part-per-million", testCase.measureUnitString) == 0)) {
6086-
logKnownIssue("ICU-23222", "Ensure unit aliases work correctly to avoid breaking callers");
6139+
if (testCase.expectedFormat == UnicodeString("expect exception")) {
6140+
UErrorCode exStatus = U_ZERO_ERROR;
6141+
MeasureUnit::forIdentifier(testCase.measureUnitString, exStatus);
6142+
if (U_SUCCESS(exStatus)) {
6143+
errln("exception expected");
6144+
}
60876145
continue;
60886146
}
60896147

0 commit comments

Comments
 (0)