Skip to content

Commit 23670fd

Browse files
committed
8363972: Lenient parsing of minus sign pattern in DecimalFormat/CompactNumberFormat
Reviewed-by: jlu, rriggs
1 parent b426151 commit 23670fd

File tree

8 files changed

+406
-38
lines changed

8 files changed

+406
-38
lines changed

make/jdk/src/classes/build/tools/cldrconverter/Bundle.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ static enum Type {
7979
"NumberElements/nan",
8080
"NumberElements/currencyDecimal",
8181
"NumberElements/currencyGroup",
82+
"NumberElements/lenientMinusSigns",
8283
};
8384

8485
private static final String[] TIME_PATTERN_KEYS = {

make/jdk/src/classes/build/tools/cldrconverter/LDMLParseHandler.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,26 @@ public void startElement(String uri, String localName, String qName, Attributes
844844
});
845845
break;
846846

847+
// Lenient parsing
848+
case "parseLenients":
849+
if ("lenient".equals(attributes.getValue("level"))) {
850+
pushKeyContainer(qName, attributes, attributes.getValue("scope"));
851+
} else {
852+
pushIgnoredContainer(qName);
853+
}
854+
break;
855+
856+
case "parseLenient":
857+
// Use only the lenient minus sign for now
858+
if (currentContainer instanceof KeyContainer kc
859+
&& kc.getKey().equals("number")
860+
&& attributes.getValue("sample").equals("-")) {
861+
pushStringEntry(qName, attributes, currentNumberingSystem + "NumberElements/lenientMinusSigns");
862+
} else {
863+
pushIgnoredContainer(qName);
864+
}
865+
break;
866+
847867
default:
848868
// treat anything else as a container
849869
pushContainer(qName, attributes);
@@ -1150,6 +1170,14 @@ public void endElement(String uri, String localName, String qName) throws SAXExc
11501170
currentStyle = "";
11511171
putIfEntry();
11521172
break;
1173+
case "parseLenient":
1174+
if (currentContainer instanceof StringEntry se) {
1175+
// Convert to a simple concatenation of lenient minuses
1176+
// e.g. "[\--﹣ ‐‑ ‒ – −⁻₋ ➖]" -> "--﹣‐‑‒–−⁻₋➖" for the root locale
1177+
put(se.getKey(), se.getValue().replaceAll("[\\[\\]\\\\ ]", ""));
1178+
}
1179+
break;
1180+
11531181
default:
11541182
putIfEntry();
11551183
}

src/java.base/share/classes/java/text/CompactNumberFormat.java

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@
147147
* a compact pattern. This special pattern can appear explicitly for any specific
148148
* range, or considered as a default pattern for an empty string.
149149
*
150-
* <h3>Negative Subpatterns</h3>
150+
* <h3><a id="negative_subpatterns">Negative Subpatterns</a></h3>
151151
* A compact pattern contains a positive and negative subpattern
152152
* separated by a subpattern boundary character {@code ';'},
153153
* for example, {@code "0K;-0K"}. Each subpattern has a prefix,
@@ -159,7 +159,10 @@
159159
* the negative prefix and suffix. The number of minimum integer digits,
160160
* and other characteristics are all the same as the positive pattern.
161161
* That means that {@code "0K;-00K"} produces precisely the same behavior
162-
* as {@code "0K;-0K"}.
162+
* as {@code "0K;-0K"}. In {@link NumberFormat##leniency lenient parsing}
163+
* mode, loose matching of the minus sign pattern is enabled, following the
164+
* LDML’s <a href="https://unicode.org/reports/tr35/#Loose_Matching">
165+
* loose matching</a> specification.
163166
*
164167
* <h4>Escaping Special Characters</h4>
165168
* Many characters in a compact pattern are taken literally, they are matched
@@ -1585,6 +1588,9 @@ private void expandAffixPatterns() {
15851588
* and are not digits that occur within the numerical portion
15861589
* </ul>
15871590
* <p>
1591+
* When lenient, the minus sign in the {@link ##negative_subpatterns
1592+
* negative subpatterns} is loosely matched against lenient minus sign characters.
1593+
* <p>
15881594
* The subclass returned depends on the value of
15891595
* {@link #isParseBigDecimal}.
15901596
* <ul>
@@ -1693,14 +1699,12 @@ public Number parse(String text, ParsePosition pos) {
16931699
// Given text does not match the non empty valid compact prefixes
16941700
// check with the default prefixes
16951701
if (!gotPositive && !gotNegative) {
1696-
if (text.regionMatches(pos.index, defaultPosPrefix, 0,
1697-
defaultPosPrefix.length())) {
1702+
if (decimalFormat.matchAffix(text, position, defaultPosPrefix)) {
16981703
// Matches the default positive prefix
16991704
matchedPosPrefix = defaultPosPrefix;
17001705
gotPositive = true;
17011706
}
1702-
if (text.regionMatches(pos.index, defaultNegPrefix, 0,
1703-
defaultNegPrefix.length())) {
1707+
if (decimalFormat.matchAffix(text, position, defaultNegPrefix)) {
17041708
// Matches the default negative prefix
17051709
matchedNegPrefix = defaultNegPrefix;
17061710
gotNegative = true;
@@ -1924,7 +1928,7 @@ private boolean matchAffix(String text, int position, String affix,
19241928
if (!affix.isEmpty() && !affix.equals(defaultAffix)) {
19251929
// Look ahead only for the longer match than the previous match
19261930
if (matchedAffix.length() < affix.length()) {
1927-
return text.regionMatches(position, affix, 0, affix.length());
1931+
return decimalFormat.matchAffix(text, position, affix);
19281932
}
19291933
}
19301934
return false;
@@ -2026,17 +2030,15 @@ private Number computeParseMultiplier(String text, ParsePosition parsePosition,
20262030
if (!gotPos && !gotNeg) {
20272031
String positiveSuffix = defaultDecimalFormat.getPositiveSuffix();
20282032
String negativeSuffix = defaultDecimalFormat.getNegativeSuffix();
2029-
boolean containsPosSuffix = text.regionMatches(position,
2030-
positiveSuffix, 0, positiveSuffix.length());
2033+
boolean containsPosSuffix = decimalFormat.matchAffix(text, position, positiveSuffix);
20312034
boolean endsWithPosSuffix = containsPosSuffix && text.length() ==
20322035
position + positiveSuffix.length();
20332036
if (parseStrict ? endsWithPosSuffix : containsPosSuffix) {
20342037
// Matches the default positive suffix
20352038
matchedPosSuffix = positiveSuffix;
20362039
gotPos = true;
20372040
}
2038-
boolean containsNegSuffix = text.regionMatches(position,
2039-
negativeSuffix, 0, negativeSuffix.length());
2041+
boolean containsNegSuffix = decimalFormat.matchAffix(text, position, negativeSuffix);
20402042
boolean endsWithNegSuffix = containsNegSuffix && text.length() ==
20412043
position + negativeSuffix.length();
20422044
if (parseStrict ? endsWithNegSuffix : containsNegSuffix) {

src/java.base/share/classes/java/text/DecimalFormat.java

Lines changed: 61 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,7 @@
296296
* #setMaximumIntegerDigits(int)} can be used to manually adjust the maximum
297297
* integer digits.
298298
*
299-
* <h3>Negative Subpatterns</h3>
299+
* <h3><a id="negative_subpatterns">Negative Subpatterns</a></h3>
300300
* A {@code DecimalFormat} pattern contains a positive and negative
301301
* subpattern, for example, {@code "#,##0.00;(#,##0.00)"}. Each
302302
* subpattern has a prefix, numeric part, and suffix. The negative subpattern
@@ -307,7 +307,11 @@
307307
* serves only to specify the negative prefix and suffix; the number of digits,
308308
* minimal digits, and other characteristics are all the same as the positive
309309
* pattern. That means that {@code "#,##0.0#;(#)"} produces precisely
310-
* the same behavior as {@code "#,##0.0#;(#,##0.0#)"}.
310+
* the same behavior as {@code "#,##0.0#;(#,##0.0#)"}. In
311+
* {@link NumberFormat##leniency lenient parsing} mode, loose matching of the
312+
* minus sign pattern is enabled, following the LDML’s
313+
* <a href="https://unicode.org/reports/tr35/#Loose_Matching">
314+
* loose matching</a> specification.
311315
*
312316
* <p>The prefixes, suffixes, and various symbols used for infinity, digits,
313317
* grouping separators, decimal separators, etc. may be set to arbitrary
@@ -2189,6 +2193,9 @@ private void append(StringBuf result, String string,
21892193
* and are not digits that occur within the numerical portion
21902194
* </ul>
21912195
* <p>
2196+
* When lenient, the minus sign in the {@link ##negative_subpatterns
2197+
* negative subpatterns} is loosely matched against lenient minus sign characters.
2198+
* <p>
21922199
* The subclass returned depends on the value of {@link #isParseBigDecimal}
21932200
* as well as on the string being parsed.
21942201
* <ul>
@@ -2385,10 +2392,8 @@ private final boolean subparse(String text, ParsePosition parsePosition,
23852392
boolean gotPositive, gotNegative;
23862393

23872394
// check for positivePrefix; take longest
2388-
gotPositive = text.regionMatches(position, positivePrefix, 0,
2389-
positivePrefix.length());
2390-
gotNegative = text.regionMatches(position, negativePrefix, 0,
2391-
negativePrefix.length());
2395+
gotPositive = matchAffix(text, position, positivePrefix);
2396+
gotNegative = matchAffix(text, position, negativePrefix);
23922397

23932398
if (gotPositive && gotNegative) {
23942399
if (positivePrefix.length() > negativePrefix.length()) {
@@ -2424,15 +2429,13 @@ private final boolean subparse(String text, ParsePosition parsePosition,
24242429
// When lenient, text only needs to contain the suffix.
24252430
if (!isExponent) {
24262431
if (gotPositive) {
2427-
boolean containsPosSuffix =
2428-
text.regionMatches(position, positiveSuffix, 0, positiveSuffix.length());
2432+
boolean containsPosSuffix = matchAffix(text, position, positiveSuffix);
24292433
boolean endsWithPosSuffix =
24302434
containsPosSuffix && text.length() == position + positiveSuffix.length();
24312435
gotPositive = parseStrict ? endsWithPosSuffix : containsPosSuffix;
24322436
}
24332437
if (gotNegative) {
2434-
boolean containsNegSuffix =
2435-
text.regionMatches(position, negativeSuffix, 0, negativeSuffix.length());
2438+
boolean containsNegSuffix = matchAffix(text, position, negativeSuffix);
24362439
boolean endsWithNegSuffix =
24372440
containsNegSuffix && text.length() == position + negativeSuffix.length();
24382441
gotNegative = parseStrict ? endsWithNegSuffix : containsNegSuffix;
@@ -3501,6 +3504,54 @@ private void appendAffix(StringBuilder buffer, String affix, boolean localized)
35013504
if (needQuote) buffer.append('\'');
35023505
}
35033506

3507+
/**
3508+
* {@return true if the text matches the affix}
3509+
* In lenient mode, lenient minus signs also match the hyphen-minus
3510+
* (U+002D). Package-private access, as this is called from
3511+
* CompactNumberFormat.
3512+
*
3513+
* Note: Minus signs in the supplementary character range or normalization
3514+
* equivalents are not matched, as they may alter the affix length.
3515+
*/
3516+
boolean matchAffix(String text, int position, String affix) {
3517+
var alen = affix.length();
3518+
var tlen = text.length();
3519+
3520+
if (alen == 0) {
3521+
// always match with an empty affix, as affix is optional
3522+
return true;
3523+
}
3524+
if (position >= tlen) {
3525+
return false;
3526+
}
3527+
if (parseStrict) {
3528+
return text.regionMatches(position, affix, 0, alen);
3529+
}
3530+
3531+
var lms = symbols.getLenientMinusSigns();
3532+
int i = 0;
3533+
int limit = Math.min(tlen, position + alen);
3534+
for (; position + i < limit; i++) {
3535+
char t = text.charAt(position + i);
3536+
char a = affix.charAt(i);
3537+
int tIndex = lms.indexOf(t);
3538+
int aIndex = lms.indexOf(a);
3539+
// Non LMS. Match direct
3540+
if (tIndex < 0 && aIndex < 0) {
3541+
if (t != a) {
3542+
return false;
3543+
}
3544+
} else {
3545+
// By here, at least one LMS. Ensure both LMS.
3546+
if (tIndex < 0 || aIndex < 0) {
3547+
return false;
3548+
}
3549+
}
3550+
}
3551+
// Return true if entire affix was matched
3552+
return i == alen;
3553+
}
3554+
35043555
/**
35053556
* Implementation of producing a pattern. This method returns a positive and
35063557
* negative (if needed), pattern string in the form of : Prefix (optional)

src/java.base/share/classes/java/text/DecimalFormatSymbols.java

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,17 @@ void setMinusSignText(String minusSignText) {
718718
this.minusSign = findNonFormatChar(minusSignText, '-');
719719
}
720720

721+
/**
722+
* {@return the lenient minus signs} Multiple lenient minus signs
723+
* are concatenated to form the returned string. Each codepoint
724+
* in the string is a valid minus sign pattern. If there are no
725+
* lenient minus signs defined in this locale, {@code minusSignText}
726+
* is returned.
727+
*/
728+
String getLenientMinusSigns() {
729+
return lenientMinusSigns;
730+
}
731+
721732
//------------------------------------------------------------
722733
// END Package Private methods ... to be made public later
723734
//------------------------------------------------------------
@@ -818,18 +829,7 @@ public int hashCode() {
818829
private void initialize(Locale locale) {
819830
this.locale = locale;
820831

821-
// check for region override
822-
Locale override = locale.getUnicodeLocaleType("nu") == null ?
823-
CalendarDataUtility.findRegionOverride(locale) :
824-
locale;
825-
826-
// get resource bundle data
827-
LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(DecimalFormatSymbolsProvider.class, override);
828-
// Avoid potential recursions
829-
if (!(adapter instanceof ResourceBundleBasedAdapter)) {
830-
adapter = LocaleProviderAdapter.getResourceBundleBased();
831-
}
832-
Object[] data = adapter.getLocaleResources(override).getDecimalFormatSymbolsData();
832+
Object[] data = loadNumberData(locale);
833833
String[] numberElements = (String[]) data[0];
834834

835835
decimalSeparator = numberElements[0].charAt(0);
@@ -854,11 +854,30 @@ private void initialize(Locale locale) {
854854
monetaryGroupingSeparator = numberElements.length < 13 || numberElements[12].isEmpty() ?
855855
groupingSeparator : numberElements[12].charAt(0);
856856

857+
// Lenient minus signs
858+
lenientMinusSigns = numberElements.length < 14 ? minusSignText : numberElements[13];
859+
857860
// maybe filled with previously cached values, or null.
858861
intlCurrencySymbol = (String) data[1];
859862
currencySymbol = (String) data[2];
860863
}
861864

865+
private Object[] loadNumberData(Locale locale) {
866+
// check for region override
867+
Locale override = locale.getUnicodeLocaleType("nu") == null ?
868+
CalendarDataUtility.findRegionOverride(locale) :
869+
locale;
870+
871+
// get resource bundle data
872+
LocaleProviderAdapter adapter = LocaleProviderAdapter.getAdapter(DecimalFormatSymbolsProvider.class, override);
873+
// Avoid potential recursions
874+
if (!(adapter instanceof ResourceBundleBasedAdapter)) {
875+
adapter = LocaleProviderAdapter.getResourceBundleBased();
876+
}
877+
878+
return adapter.getLocaleResources(override).getDecimalFormatSymbolsData();
879+
}
880+
862881
/**
863882
* Obtains non-format single character from String
864883
*/
@@ -995,6 +1014,14 @@ private void readObject(ObjectInputStream stream)
9951014
}
9961015
currencyInitialized = true;
9971016
}
1017+
1018+
if (loadNumberData(locale) instanceof Object[] d &&
1019+
d[0] instanceof String[] numberElements &&
1020+
numberElements.length >= 14) {
1021+
lenientMinusSigns = numberElements[13];
1022+
} else {
1023+
lenientMinusSigns = minusSignText;
1024+
}
9981025
}
9991026

10001027
/**
@@ -1174,6 +1201,9 @@ private void readObject(ObjectInputStream stream)
11741201
private transient Currency currency;
11751202
private transient volatile boolean currencyInitialized;
11761203

1204+
// Lenient minus. No need to be set by applications
1205+
private transient String lenientMinusSigns;
1206+
11771207
/**
11781208
* Cached hash code.
11791209
*/

src/java.base/share/classes/java/text/NumberFormat.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,11 @@
195195
* Lenient parsing should be used when attempting to parse a number
196196
* out of a String that contains non-numerical or non-format related values.
197197
* For example, using a {@link Locale#US} currency format to parse the number
198-
* {@code 1000} out of the String "$1,000.00 was paid".
198+
* {@code 1000} out of the String "$1,000.00 was paid". Lenient parsing also
199+
* allows loose matching of characters in the source text. For example, an
200+
* implementation of the {@code NumberFormat} class may allow matching "−"
201+
* (U+2212 MINUS SIGN) to the "-" (U+002D HYPHEN-MINUS) pattern character
202+
* when used as a negative prefix.
199203
* <p>
200204
* Strict parsing should be used when attempting to ensure a String adheres exactly
201205
* to a locale's conventions, and can thus serve to validate input. For example, successfully

test/jdk/java/text/Format/CompactNumberFormat/TestCompactNumber.java

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2018, 2024, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2018, 2025, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -22,7 +22,7 @@
2222
*/
2323
/*
2424
* @test
25-
* @bug 8177552 8217721 8222756 8295372 8306116 8319990 8338690
25+
* @bug 8177552 8217721 8222756 8295372 8306116 8319990 8338690 8363972
2626
* @summary Checks the functioning of compact number format
2727
* @modules jdk.localedata
2828
* @run testng/othervm TestCompactNumber
@@ -462,6 +462,8 @@ Object[][] compactParseData() {
462462
{FORMAT_SE_SHORT, "12345679,89\u00a0bn", 1.2345679890000001E19, Double.class},
463463
{FORMAT_SE_SHORT, "\u2212999", -999L, Long.class},
464464
{FORMAT_SE_SHORT, "\u22128\u00a0mn", -8000000L, Long.class},
465+
// lenient parsing. Hyphen-minus should match the localized minus sign
466+
{FORMAT_SE_SHORT, "-8\u00a0mn", -8000000L, Long.class},
465467
{FORMAT_SE_SHORT, "\u22128\u00a0dt", -8000L, Long.class},
466468
{FORMAT_SE_SHORT, "\u221212345679\u00a0bn", -1.2345679E19, Double.class},
467469
{FORMAT_SE_SHORT, "\u221212345679,89\u00a0bn", -1.2345679890000001E19, Double.class},
@@ -503,8 +505,7 @@ Object[][] exceptionParseData() {
503505
{FORMAT_EN_US_SHORT, "K12,347", null},
504506
// Invalid prefix for ja_JP
505507
{FORMAT_JA_JP_SHORT, "\u4E071", null},
506-
// Localized minus sign should be used
507-
{FORMAT_SE_SHORT, "-8\u00a0mn", null},};
508+
};
508509
}
509510

510511
@DataProvider(name = "invalidParse")

0 commit comments

Comments
 (0)