Skip to content

Commit 649262a

Browse files
committed
ICU-23264 Add validateAndGet function to MeasureUnit for efficient unit validation
See unicode-org#3780
1 parent 4ebbe0c commit 649262a

File tree

6 files changed

+116
-31
lines changed

6 files changed

+116
-31
lines changed

icu4c/source/i18n/measunit.cpp

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2546,6 +2546,14 @@ MeasureUnit MeasureUnit::getToJp() {
25462546

25472547
// End generated code for measunit.cpp
25482548

2549+
/**
2550+
* Generic binary search function for string arrays.
2551+
* @param array the array of strings to search in
2552+
* @param start the starting index (inclusive)
2553+
* @param end the ending index (exclusive)
2554+
* @param key the string to search for
2555+
* @return the index if found, -1 if not found
2556+
*/
25492557
static int32_t binarySearch(
25502558
const char * const * array, int32_t start, int32_t end, StringPiece key) {
25512559
while (start < end) {
@@ -2563,6 +2571,33 @@ static int32_t binarySearch(
25632571
return -1;
25642572
}
25652573

2574+
/**
2575+
* Helper function to get the subtype range for a given type index.
2576+
* @param typeIdx the type index (0 to UPRV_LENGTHOF(gTypes)-1)
2577+
* @param start will be set to the starting index in gSubTypes
2578+
* @param end will be set to the ending index in gSubTypes (exclusive)
2579+
* @return the length of the range (end - start)
2580+
*/
2581+
static int32_t getSubtypeRange(int32_t typeIdx, int32_t &start, int32_t &end) {
2582+
U_ASSERT(typeIdx >= 0 && typeIdx < UPRV_LENGTHOF(gTypes));
2583+
start = gOffsets[typeIdx];
2584+
end = gOffsets[typeIdx + 1];
2585+
return end - start;
2586+
}
2587+
2588+
/**
2589+
* Binary search function that searches for a subtype in gSubTypes within the range
2590+
* corresponding to the given type.
2591+
* @param typeIdx the type index (0 to UPRV_LENGTHOF(gTypes)-1)
2592+
* @param key the subtype string to search for
2593+
* @return the index in gSubTypes if found, -1 if not found
2594+
*/
2595+
static int32_t binarySearch(int32_t typeIdx, StringPiece key) {
2596+
int32_t start, end;
2597+
(void)getSubtypeRange(typeIdx, start, end);
2598+
return binarySearch(gSubTypes, start, end, key);
2599+
}
2600+
25662601
MeasureUnit::MeasureUnit() : MeasureUnit(kBaseTypeIdx, kBaseSubTypeIdx) {
25672602
}
25682603

@@ -2680,7 +2715,8 @@ int32_t MeasureUnit::getAvailable(
26802715
}
26812716
int32_t idx = 0;
26822717
for (int32_t typeIdx = 0; typeIdx < UPRV_LENGTHOF(gTypes); ++typeIdx) {
2683-
int32_t len = gOffsets[typeIdx + 1] - gOffsets[typeIdx];
2718+
int32_t start, end;
2719+
int32_t len = getSubtypeRange(typeIdx, start, end);
26842720
for (int32_t subTypeIdx = 0; subTypeIdx < len; ++subTypeIdx) {
26852721
dest[idx].setTo(typeIdx, subTypeIdx);
26862722
++idx;
@@ -2702,7 +2738,8 @@ int32_t MeasureUnit::getAvailable(
27022738
if (typeIdx == -1) {
27032739
return 0;
27042740
}
2705-
int32_t len = gOffsets[typeIdx + 1] - gOffsets[typeIdx];
2741+
int32_t start, end;
2742+
int32_t len = getSubtypeRange(typeIdx, start, end);
27062743
if (destCapacity < len) {
27072744
errorCode = U_BUFFER_OVERFLOW_ERROR;
27082745
return len;
@@ -2729,6 +2766,24 @@ StringEnumeration* MeasureUnit::getAvailableTypes(UErrorCode &errorCode) {
27292766
return result;
27302767
}
27312768

2769+
bool MeasureUnit::validateAndGet(StringPiece type, StringPiece subtype, MeasureUnit &result) {
2770+
// Find the type index using binary search
2771+
int32_t typeIdx = binarySearch(gTypes, 0, UPRV_LENGTHOF(gTypes), type);
2772+
if (typeIdx == -1) {
2773+
return false; // Type not found
2774+
}
2775+
2776+
// Find the subtype within the type's range using binary search
2777+
int32_t subtypeIdx = binarySearch(typeIdx, subtype);
2778+
if (subtypeIdx == -1) {
2779+
return false; // Subtype not found
2780+
}
2781+
2782+
// Create the MeasureUnit and return it
2783+
result.setTo(typeIdx, subtypeIdx - gOffsets[typeIdx]);
2784+
return true;
2785+
}
2786+
27322787
bool MeasureUnit::findBySubType(StringPiece subType, MeasureUnit* output) {
27332788
// Sanity checking kCurrencyOffset and final entry in gOffsets
27342789
U_ASSERT(uprv_strcmp(gTypes[kCurrencyOffset], "currency") == 0);
@@ -2739,7 +2794,7 @@ bool MeasureUnit::findBySubType(StringPiece subType, MeasureUnit* output) {
27392794
if (t == kCurrencyOffset) {
27402795
continue;
27412796
}
2742-
int32_t st = binarySearch(gSubTypes, gOffsets[t], gOffsets[t + 1], subType);
2797+
int32_t st = binarySearch(t, subType);
27432798
if (st >= 0) {
27442799
output->setTo(t, st - gOffsets[t]);
27452800
return true;
@@ -2763,7 +2818,7 @@ void MeasureUnit::initTime(const char *timeId) {
27632818
int32_t result = binarySearch(gTypes, 0, UPRV_LENGTHOF(gTypes), "duration");
27642819
U_ASSERT(result != -1);
27652820
fTypeId = result;
2766-
result = binarySearch(gSubTypes, gOffsets[fTypeId], gOffsets[fTypeId + 1], timeId);
2821+
result = binarySearch(fTypeId, timeId);
27672822
U_ASSERT(result != -1);
27682823
fSubTypeId = result - gOffsets[fTypeId];
27692824
}
@@ -2772,8 +2827,7 @@ void MeasureUnit::initCurrency(StringPiece isoCurrency) {
27722827
int32_t result = binarySearch(gTypes, 0, UPRV_LENGTHOF(gTypes), "currency");
27732828
U_ASSERT(result != -1);
27742829
fTypeId = result;
2775-
result = binarySearch(
2776-
gSubTypes, gOffsets[fTypeId], gOffsets[fTypeId + 1], isoCurrency);
2830+
result = binarySearch(fTypeId, isoCurrency);
27772831
if (result == -1) {
27782832
UErrorCode status = U_ZERO_ERROR;
27792833
fImpl = new MeasureUnitImpl(MeasureUnitImpl::forCurrencyCode(isoCurrency, status));

icu4c/source/i18n/number_skeletons.cpp

Lines changed: 5 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1072,25 +1072,12 @@ void blueprint_helpers::parseMeasureUnitOption(const StringSegment& segment, Mac
10721072
CharString subType;
10731073
SKELETON_UCHAR_TO_CHAR(subType, stemString, firstHyphen + 1, stemString.length(), status);
10741074

1075-
// Note: the largest type as of this writing (Aug 2020) is "volume", which has 33 units.
1076-
static constexpr int32_t CAPACITY = 40;
1077-
MeasureUnit units[CAPACITY];
1078-
UErrorCode localStatus = U_ZERO_ERROR;
1079-
int32_t numUnits = MeasureUnit::getAvailable(type.data(), units, CAPACITY, localStatus);
1080-
if (U_FAILURE(localStatus)) {
1081-
// More than 30 units in this type?
1082-
status = U_INTERNAL_PROGRAM_ERROR;
1075+
MeasureUnit unit;
1076+
if (MeasureUnit::validateAndGet(type.toStringPiece(), subType.toStringPiece(), unit)) {
1077+
macros.unit = unit;
10831078
return;
1084-
}
1085-
for (int32_t i = 0; i < numUnits; i++) {
1086-
auto& unit = units[i];
1087-
if (uprv_strcmp(subType.data(), unit.getSubtype()) == 0) {
1088-
macros.unit = unit;
1089-
return;
1090-
}
1091-
}
1092-
1093-
// throw new SkeletonSyntaxException("Unknown measure unit", segment);
1079+
}
1080+
10941081
status = U_NUMBER_SKELETON_SYNTAX_ERROR;
10951082
}
10961083

icu4c/source/i18n/unicode/measunit.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,6 +720,22 @@ class U_I18N_API MeasureUnit: public UObject {
720720
*/
721721
static StringEnumeration* getAvailableTypes(UErrorCode &errorCode);
722722

723+
#ifndef U_HIDE_INTERNAL_API
724+
/**
725+
* Validates that a specific type and subtype combination exists and retrieve the unit.
726+
*
727+
* <p> Note: This is more efficient than calling getAvailable() when you only need
728+
* to validate and retrieve a single unit.
729+
*
730+
* @param type the unit type (e.g., "length", "mass", "volume")
731+
* @param subtype the unit subtype (e.g., "meter", "kilogram", "liter")
732+
* @param result if the unit is valid, this will be set to the MeasureUnit
733+
* @return true if the type/subtype combination is valid, false otherwise
734+
* @internal
735+
*/
736+
static bool validateAndGet(StringPiece type, StringPiece subtype, MeasureUnit &result);
737+
#endif /* U_HIDE_INTERNAL_API */
738+
723739
/**
724740
* Return the class ID for this class. This is useful only for comparing to
725741
* a return value from getDynamicClassID(). For example:

icu4c/source/test/intltest/numbertest_api.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -888,6 +888,17 @@ void NumberFormatterApiTest::unitMeasure() {
888888
Locale("en"),
889889
100,
890890
u"100");
891+
892+
// This test checks the behavior of using a fixed-size length for the units array with a fixed number of units.
893+
// The array size is fixed, and we are now using a binary search approach.
894+
assertFormatSingle(
895+
u"One of the latest unit, volume-teaspoon",
896+
u"measure-unit/volume-teaspoon",
897+
u"unit/teaspoon",
898+
NumberFormatter::with().unit(MeasureUnit::forIdentifier("teaspoon", status)),
899+
Locale("en-US"),
900+
100,
901+
u"100 tsp");
891902

892903
// TODO: desired behaviour for this "pathological" case?
893904
// Since this is pointless, we don't test that its behaviour doesn't change.

icu4j/main/core/src/main/java/com/ibm/icu/number/NumberSkeletonImpl.java

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import com.ibm.icu.util.StringTrieBuilder;
2727
import java.math.BigDecimal;
2828
import java.math.RoundingMode;
29-
import java.util.Set;
3029

3130
/**
3231
* @author sffc
@@ -1095,12 +1094,10 @@ private static void parseMeasureUnitOption(StringSegment segment, MacroProps mac
10951094
}
10961095
String type = segment.subSequence(0, firstHyphen).toString();
10971096
String subType = segment.subSequence(firstHyphen + 1, segment.length()).toString();
1098-
Set<MeasureUnit> units = MeasureUnit.getAvailable(type);
1099-
for (MeasureUnit unit : units) {
1100-
if (subType.equals(unit.getSubtype())) {
1101-
macros.unit = unit;
1102-
return;
1103-
}
1097+
MeasureUnit unit = MeasureUnit.getUnit(type, subType);
1098+
if (unit != null) {
1099+
macros.unit = unit;
1100+
return;
11041101
}
11051102
throw new SkeletonSyntaxException("Unknown measure unit", segment);
11061103
}

icu4j/main/core/src/main/java/com/ibm/icu/util/MeasureUnit.java

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -870,6 +870,26 @@ public static MeasureUnit findBySubType(String subType) {
870870
return null;
871871
}
872872

873+
/**
874+
* Returns the MeasureUnit instance if the given type and subtype combination is valid, or null
875+
* otherwise.
876+
*
877+
* <p>Example: "length", "meter" -> METER "length", "kilometer" -> KILOMETER "length",
878+
* "kilometer-per-hour" -> null --> not valid
879+
*
880+
* @param type the unit type (e.g., "length", "mass", "volume")
881+
* @param subtype the unit subtype (e.g., "meter", "kilogram", "liter")
882+
* @return the MeasureUnit if valid, otherwise null
883+
* @internal
884+
* @deprecated This API is ICU internal only.
885+
*/
886+
@Deprecated
887+
public static MeasureUnit getUnit(String type, String subtype) {
888+
populateCache();
889+
Map<String, MeasureUnit> units = cache.get(type);
890+
return units != null ? units.get(subtype) : null;
891+
}
892+
873893
static final UnicodeSet ASCII = new UnicodeSet('a', 'z').freeze();
874894
static final UnicodeSet ASCII_HYPHEN_DIGITS =
875895
new UnicodeSet('-', '-', '0', '9', 'a', 'z').freeze();

0 commit comments

Comments
 (0)