@@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
55import io.github.oshai.kotlinlogging.KotlinLogging
66import org.jetbrains.kotlinx.dataframe.DataFrame
77import org.jetbrains.kotlinx.dataframe.api.ParserOptions
8- import org.jetbrains.kotlinx.dataframe.api.parser
98import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
109import java.nio.charset.Charset
1110import java.text.DecimalFormatSymbols
@@ -15,19 +14,24 @@ import java.util.Locale
1514
1615private val logger = KotlinLogging .logger {}
1716
18- // (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
19- private val INFINITIES = arrayOf(" ∞" , " inf" , " infinity" , " infty" )
20- private val PLUS_INFINITIES = INFINITIES .map { " +$it " }
21- private val MINUS_INFINITIES = INFINITIES .map { " -$it " }
22- private val NANS = arrayOf(" nan" , " na" , " n/a" )
23-
2417/* *
2518 * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
2619 *
2720 * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
2821 * fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
2922 * If not, or if it fails, it will use [NumberFormat] to parse the input.
3023 *
24+ * The [locale][locale] used by the double parser is defined like:
25+ *
26+ * [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault]
27+ *
28+ * [FastDoubleParser] has a fallback mechanism. In practice, this means it can recognize symbols and notations
29+ * of any locale recognized by Java as long as that symbol does not conflict with the given locale.
30+ *
31+ * For example, if your locale uses ',' as decimal separator, it will NOT recognize ',' as thousands separator,
32+ * but it will recognize ' ', '٬', '_', ' ', etc. as such.
33+ * The same holds for characters like "e", "inf", "×10^", "NaN", etc.
34+ *
3135 * Public, so it can be used in other modules.
3236 *
3337 * @param parserOptions can be supplied to configure the parser.
@@ -41,106 +45,103 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
4145
4246 private val useFastDoubleParser = parserOptions?.useFastDoubleParser ? : Parsers .useFastDoubleParser
4347 private val locale = parserOptions?.locale ? : Parsers .locale
44- private val fallbackLocale = Locale .ROOT
45-
46- private val localDecimalFormatSymbols = DecimalFormatSymbols .getInstance(locale)
47- private val fallbackDecimalFormatSymbols = DecimalFormatSymbols .getInstance(fallbackLocale)
4848
4949 private val parser = ConfigurableDoubleParser (/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true )
5050
5151 /* *
5252 * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on
53- * [localDecimalFormatSymbols ] with fallbacks from [fallbackDecimalFormatSymbols] .
53+ * the [locale ] with fallbacks from all other locales .
5454 *
5555 * Fallback characters/strings are only added if they're not clashing with local characters/strings.
5656 */
57- private fun setupNumberFormatSymbols (): NumberFormatSymbols {
58- // collect all chars and strings that are locale-specific such that we can check whether
59- // fallback chars and strings are safe to add
60- val localChars = with (localDecimalFormatSymbols) {
61- buildSet {
62- add(decimalSeparator.lowercaseChar())
63- add(groupingSeparator.lowercaseChar())
64- add(minusSign.lowercaseChar())
65- add(' +' )
66- add(zeroDigit.lowercaseChar())
57+ private fun setupNumberFormatSymbols (): NumberFormatSymbols =
58+ numberFormatSymbolsCache.getOrPut(locale) {
59+ val localDecimalFormatSymbols = DecimalFormatSymbols .getInstance(locale)
60+
61+ // collect all chars and strings that are locale-specific such that we can check whether
62+ // fallback chars and strings are safe to add
63+ val localChars = with (localDecimalFormatSymbols) {
64+ buildSet {
65+ add(decimalSeparator.lowercaseChar())
66+ add(groupingSeparator.lowercaseChar())
67+ add(minusSign.lowercaseChar())
68+ add(' +' )
69+ // we don't include zeroDigit here, for notations like ×10^
70+ }
6771 }
68- }
69- val localStrings = with (localDecimalFormatSymbols) {
70- buildSet {
71- add(exponentSeparator .lowercase())
72- add(infinity .lowercase())
73- add(naN.lowercase())
72+ val localStrings = with (localDecimalFormatSymbols) {
73+ buildSet {
74+ add(exponentSeparator.lowercase())
75+ add(infinity .lowercase())
76+ add(naN .lowercase())
77+ }
7478 }
75- }
7679
77- /* *
78- * Builds a set with the specified char from [localDecimalFormatSymbols] and
79- * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
80- * [additionals] will be added to the set too, when they're safe to add.
81- */
82- fun ((DecimalFormatSymbols ) -> Char ).fromLocalWithFallBack(vararg additionals : Char ): Set <Char > =
83- buildSet {
84- val getChar = this @fromLocalWithFallBack
85- val char = getChar(localDecimalFormatSymbols).lowercaseChar()
86- add(char)
87-
88- // add fallback char if it's safe to do so
89- val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
90- if (fallbackChar !in localChars && ! localStrings.any { fallbackChar in it }) {
91- add(fallbackChar)
92- }
80+ /* *
81+ * Builds a set containing the (lowercased) char [this].
82+ * Each of the [fallbackChars] is added to the set too, when it's safe to add.
83+ */
84+ fun Char.withFallback (fallbackChars : CharArray ): Set <Char > =
85+ buildSet {
86+ val char = this @withFallback.lowercaseChar()
87+ add(char)
9388
94- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead .
95- if (char.isWhitespace()) add( ' ' )
89+ // Treat NBSP and other whitespace characters the same .
90+ if (char.isWhitespace()) addAll( WHITE_SPACES .asIterable() )
9691
97- // add additional chars if needed
98- for (additional in additionals) {
99- val lowercase = additional.lowercaseChar()
100- if (lowercase !in localChars && ! localStrings.any { lowercase in it }) {
101- add(lowercase)
92+ // add fallback chars if needed
93+ for (char in fallbackChars) {
94+ val lowercase = char.lowercaseChar()
95+ if (lowercase !in localChars && ! localStrings.any { lowercase in it }) {
96+ add(lowercase)
97+ }
98+
99+ // Treat NBSP and other whitespace characters the same.
100+ if (char.isWhitespace()) addAll(WHITE_SPACES .asIterable())
102101 }
103102 }
104- }
105103
106- /* *
107- * Builds a set with the specified string from [localDecimalFormatSymbols] and
108- * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
109- * [additionals] will be added to the set too, when they're safe to add.
110- */
111- fun ((DecimalFormatSymbols ) -> String ).fromLocalWithFallBack(vararg additionals : String ): Set <String > =
112- buildSet {
113- val getString = this @fromLocalWithFallBack
114- val string = getString(localDecimalFormatSymbols).lowercase()
115- add(string)
116-
117- // add fallback string if it's safe to do so
118- val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
119- if (! fallbackString.any { it in localChars } && fallbackString !in localStrings) {
120- add(fallbackString)
121- }
104+ /* *
105+ * Builds a set containing the (lowercased) string [this].
106+ * Each of the [fallbackStrings] is added to the set too, when it's safe to add.
107+ */
108+ fun String.withFallback (fallbackStrings : Array <String >): Set <String > =
109+ buildSet {
110+ val string = this @withFallback.lowercase()
111+ add(string)
112+
113+ // Treat NBSP and other whitespace characters the same.
114+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
122115
123- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
124- if (string.isBlank()) add(" " )
116+ // add fallback strings if needed
117+ for (string in fallbackStrings) {
118+ val lowercase = string.lowercase()
119+ if (! lowercase.any { it in localChars } && lowercase !in localStrings) {
120+ add(lowercase)
121+ }
125122
126- // add additional strings if needed
127- for (additional in additionals) {
128- val lowercase = additional.lowercase()
129- if (! lowercase.any { it in localChars } && lowercase !in localStrings) {
130- add(lowercase)
123+ // Treat NBSP and other whitespace characters the same.
124+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
131125 }
132126 }
133- }
134127
135- return NumberFormatSymbols .fromDecimalFormatSymbols(localDecimalFormatSymbols)
136- .withPlusSign(setOf (' +' ))
137- .withDecimalSeparator(DecimalFormatSymbols ::getDecimalSeparator.fromLocalWithFallBack())
138- .withGroupingSeparator(DecimalFormatSymbols ::getGroupingSeparator.fromLocalWithFallBack())
139- .withExponentSeparator(DecimalFormatSymbols ::getExponentSeparator.fromLocalWithFallBack())
140- .withMinusSign(DecimalFormatSymbols ::getMinusSign.fromLocalWithFallBack())
141- .withInfinity(DecimalFormatSymbols ::getInfinity.fromLocalWithFallBack(* INFINITIES ))
142- .withNaN(DecimalFormatSymbols ::getNaN.fromLocalWithFallBack(* NANS ))
143- }
128+ NumberFormatSymbols .fromDecimalFormatSymbols(localDecimalFormatSymbols)
129+ .withPlusSign(
130+ setOf (' +' ),
131+ ).withDecimalSeparator(
132+ localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS ),
133+ ).withGroupingSeparator(
134+ localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS ),
135+ ).withExponentSeparator(
136+ localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS ),
137+ ).withMinusSign(
138+ localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS ),
139+ ).withInfinity(
140+ localDecimalFormatSymbols.infinity.withFallback(INFINITIES ),
141+ ).withNaN(
142+ localDecimalFormatSymbols.naN.withFallback(NANS ),
143+ )
144+ }
144145
145146 /* * Fallback method for parsing doubles. */
146147 private fun String.parseToDoubleOrNullFallback (): Double? =
@@ -152,7 +153,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
152153 in NANS -> Double .NaN
153154
154155 else -> {
155- // not thread safe; must be created here
156+ // NumberFormat is not thread safe; must be created in the function body
156157 val numberFormat = NumberFormat .getInstance(locale)
157158 val parsePosition = ParsePosition (0 )
158159 val result = numberFormat.parse(this , parsePosition)?.toDouble()
@@ -235,4 +236,49 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
235236 }
236237 return String (chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
237238 }
239+
/**
 * Stores the decimal format symbols of all locales available on the system,
 * extended with a few extra well-known notations.
 * These are used as fallbacks for the selected [locale].
 * They are only added by [withFallback] if they don't clash with symbols already used by the provided [locale]
 * (so ',' is not added as grouping separator if ',' is already the locale's decimal separator).
 *
 * All symbol collections are computed lazily, on first access.
 */
internal companion object {
    /** The [DecimalFormatSymbols] of every locale returned by [Locale.getAvailableLocales]. */
    private val allDecimalFormatSymbols by lazy {
        Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) }
    }

    /** Distinct minus signs of all available locales. */
    val MINUS_SIGNS by lazy {
        allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray()
    }

    /** Distinct infinity notations of all available locales, plus some common textual spellings. */
    val INFINITIES by lazy {
        allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity }
            .plus(arrayOf("∞", "inf", "infinity", "infty"))
            .toTypedArray()
    }

    /** Every entry of [INFINITIES] prefixed with '+'. */
    val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() }

    /** Every entry of [INFINITIES] prefixed with every entry of [MINUS_SIGNS]. */
    val MINUS_INFINITIES by lazy {
        INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray()
    }

    /** Distinct NaN notations of all available locales, plus some common textual spellings. */
    val NANS by lazy {
        allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN }
            .plus(arrayOf("nan", "na", "n/a"))
            .toTypedArray()
    }

    /** Whitespace characters that are treated interchangeably: space, NBSP, thin space, narrow NBSP, tab. */
    val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t')

    /** Distinct grouping separators of all available locales, plus `'`, `˙`, and [WHITE_SPACES]. */
    val GROUPING_SEPARATORS by lazy {
        allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator }
            .plus(arrayOf('\'', '˙', *WHITE_SPACES.toTypedArray()))
            .toCharArray()
    }

    /** Distinct (monetary) decimal separators of all available locales, plus `·` and `⎖`. */
    val DECIMAL_SEPARATORS by lazy {
        allDecimalFormatSymbols.flatMapTo(mutableSetOf()) {
            listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
        }.plus(arrayOf('·', '⎖'))
            .toCharArray()
    }

    /** Distinct exponent separators of all available locales. */
    val EXPONENTS by lazy {
        allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray()
    }

    /**
     * Cache of the [NumberFormatSymbols] built per [Locale].
     *
     * This map is shared by all parser instances, so it must tolerate concurrent access:
     * a plain `HashMap` can be corrupted by simultaneous writes. Declaring it as a
     * [java.util.concurrent.ConcurrentMap] also makes `getOrPut` resolve to Kotlin's concurrent-safe
     * overload, which never replaces an entry that is already present
     * (the builder lambda may still run more than once under contention).
     */
    val numberFormatSymbolsCache: java.util.concurrent.ConcurrentMap<Locale, NumberFormatSymbols> =
        java.util.concurrent.ConcurrentHashMap()
}
238284}
0 commit comments