@@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
55import io.github.oshai.kotlinlogging.KotlinLogging
66import org.jetbrains.kotlinx.dataframe.DataFrame
77import org.jetbrains.kotlinx.dataframe.api.ParserOptions
8- import org.jetbrains.kotlinx.dataframe.api.parser
98import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
109import java.nio.charset.Charset
1110import java.text.DecimalFormatSymbols
@@ -15,19 +14,24 @@ import java.util.Locale
1514
1615private val logger = KotlinLogging .logger {}
1716
18- // (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
19- private val INFINITIES = arrayOf(" ∞" , " inf" , " infinity" , " infty" )
20- private val PLUS_INFINITIES = INFINITIES .map { " +$it " }
21- private val MINUS_INFINITIES = INFINITIES .map { " -$it " }
22- private val NANS = arrayOf(" nan" , " na" , " n/a" )
23-
2417/* *
2518 * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
2619 *
2720 * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
2821 * fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
2922 * If not, or if it fails, it will use [NumberFormat] to parse the input.
3023 *
24+ * The [locale][locale] used by the double parser is defined like:
25+ *
26+ * [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault]
27+ *
28+ * [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations
29+ * of any locale recognized by Java as long as that symbol does not conflict with the given locale.
30+ *
31+ * For example, if your locale uses ',' as decimal symbol, it will NOT recognize ',' as thousands separator, but it will
32+ * recognize ' ', '٬', '_', ' ', etc. as thousands separator.
33+ * The same holds for characters like "e", "inf", "x10", "NaN", etc.
34+ *
3135 * Public, so it can be used in other modules.
3236 *
3337 * @param parserOptions can be supplied to configure the parser.
@@ -41,10 +45,8 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
4145
4246 private val useFastDoubleParser = parserOptions?.useFastDoubleParser ? : Parsers .useFastDoubleParser
4347 private val locale = parserOptions?.locale ? : Parsers .locale
44- private val fallbackLocale = Locale .ROOT
4548
4649 private val localDecimalFormatSymbols = DecimalFormatSymbols .getInstance(locale)
47- private val fallbackDecimalFormatSymbols = DecimalFormatSymbols .getInstance(fallbackLocale)
4850
4951 private val parser = ConfigurableDoubleParser (/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true )
5052
@@ -75,71 +77,69 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
7577 }
7678
7779 /* *
78- * Builds a set with the specified char from [localDecimalFormatSymbols] and
79- * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
80- * [additionals] will be added to the set too, when they're safe to add.
80+ * Builds a set containing [this] char (lowercased), plus each of the
81+ * [fallbackChars] that is safe to add, i.e. that does not clash with a symbol of the locale.
8182 */
82- fun (( DecimalFormatSymbols ) -> Char ).fromLocalWithFallBack( vararg additionals : Char ): Set <Char > =
83+ fun Char. withFallback ( fallbackChars : CharArray ): Set <Char > =
8384 buildSet {
84- val getChar = this @fromLocalWithFallBack
85- val char = getChar(localDecimalFormatSymbols).lowercaseChar()
85+ val char = this @withFallback.lowercaseChar()
8686 add(char)
8787
88- // add fallback char if it's safe to do so
89- val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
90- if (fallbackChar !in localChars && ! localStrings.any { fallbackChar in it }) {
91- add(fallbackChar)
92- }
93-
94- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
95- if (char.isWhitespace()) add(' ' )
88+ // Treat NBSP and other whitespace characters the same.
89+ if (char.isWhitespace()) addAll(WHITE_SPACES .asIterable())
9690
97- // add additional chars if needed
98- for (additional in additionals ) {
99- val lowercase = additional .lowercaseChar()
91+ // add fallback chars if needed
92+ for (char in fallbackChars ) {
93+ val lowercase = char .lowercaseChar()
10094 if (lowercase !in localChars && ! localStrings.any { lowercase in it }) {
10195 add(lowercase)
10296 }
97+
98+ // Treat NBSP and other whitespace characters the same.
99+ if (char.isWhitespace()) addAll(WHITE_SPACES .asIterable())
103100 }
104101 }
105102
106103 /* *
107- * Builds a set with the specified string from [localDecimalFormatSymbols] and
108- * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
109- * [additionals] will be added to the set too, when they're safe to add.
104+ * Builds a set containing [this] string (lowercased), plus each of the
105+ * [fallbackStrings] that is safe to add, i.e. that does not clash with a symbol of the locale.
110106 */
111- fun (( DecimalFormatSymbols ) -> String ).fromLocalWithFallBack( vararg additionals : String ): Set <String > =
107+ fun String. withFallback ( fallbackStrings : Array < String > ): Set <String > =
112108 buildSet {
113- val getString = this @fromLocalWithFallBack
114- val string = getString(localDecimalFormatSymbols).lowercase()
109+ val string = this @withFallback.lowercase()
115110 add(string)
116111
117- // add fallback string if it's safe to do so
118- val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
119- if (! fallbackString.any { it in localChars } && fallbackString !in localStrings) {
120- add(fallbackString)
121- }
122-
123- // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
124- if (string.isBlank()) add(" " )
112+ // Treat NBSP and other whitespace characters the same.
113+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
125114
126- // add additional strings if needed
127- for (additional in additionals ) {
128- val lowercase = additional .lowercase()
115+ // add fallback strings if needed
116+ for (string in fallbackStrings ) {
117+ val lowercase = string .lowercase()
129118 if (! lowercase.any { it in localChars } && lowercase !in localStrings) {
130119 add(lowercase)
131120 }
121+
122+ // Treat NBSP and other whitespace characters the same.
123+ if (string.isBlank()) addAll(WHITE_SPACES .map { it.toString() })
132124 }
133125 }
134126
135127 return NumberFormatSymbols .fromDecimalFormatSymbols(localDecimalFormatSymbols)
136- .withPlusSign(setOf (' +' ))
137- .withDecimalSeparator(DecimalFormatSymbols ::getDecimalSeparator.fromLocalWithFallBack())
138- .withGroupingSeparator(DecimalFormatSymbols ::getGroupingSeparator.fromLocalWithFallBack())
139- .withExponentSeparator(DecimalFormatSymbols ::getExponentSeparator.fromLocalWithFallBack())
140- .withMinusSign(DecimalFormatSymbols ::getMinusSign.fromLocalWithFallBack())
141- .withInfinity(DecimalFormatSymbols ::getInfinity.fromLocalWithFallBack(* INFINITIES ))
142- .withNaN(DecimalFormatSymbols ::getNaN.fromLocalWithFallBack(* NANS ))
128+ .withPlusSign(
129+ setOf (' +' ),
130+ ).withDecimalSeparator(
131+ localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS ),
132+ ).withGroupingSeparator(
133+ localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS ),
134+ ).withExponentSeparator(
135+ localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS ),
136+ ).withMinusSign(
137+ localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS ),
138+ ).withInfinity(
139+ localDecimalFormatSymbols.infinity.withFallback(INFINITIES ),
140+ ).withNaN(
141+ localDecimalFormatSymbols.naN.withFallback(NANS ),
142+ )
143143 }
144144
145145 /* * Fallback method for parsing doubles. */
@@ -152,7 +152,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
152152 in NANS -> Double .NaN
153153
154154 else -> {
155- // not thread safe; must be created here
155+ // NumberFormat is not thread safe; must be created in the function body
156156 val numberFormat = NumberFormat .getInstance(locale)
157157 val parsePosition = ParsePosition (0 )
158158 val result = numberFormat.parse(this , parsePosition)?.toDouble()
@@ -235,4 +235,47 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
235235 }
236236 return String (chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
237237 }
238+
239+ /* *
240+ * Here we store all possible decimal format symbols of all locales on the system.
241+ * These will be used as fallbacks for the selected locale.
242+ * They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale]
243+ * (so ',' is not added as grouping separator if '.' is already the locale's decimal separator).
244+ */
245+ internal companion object {
246+ private val allDecimalFormatSymbols by lazy {
247+ Locale .getAvailableLocales().map { DecimalFormatSymbols .getInstance(it) }
248+ }
249+ val MINUS_SIGNS by lazy {
250+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.minusSign }.toCharArray()
251+ }
252+
253+ // strings that are recognized to represent infinity and NaN in doubles in all locales (NOTE: locale-provided symbols are not lowercased here)
254+ val INFINITIES by lazy {
255+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.infinity }
256+ .plus(arrayOf(" ∞" , " inf" , " infinity" , " infty" ))
257+ .toTypedArray()
258+ }
259+ val PLUS_INFINITIES by lazy { INFINITIES .map { " +$it " }.toTypedArray() }
260+ val MINUS_INFINITIES by lazy {
261+ INFINITIES .flatMap { inf -> MINUS_SIGNS .map { min -> min + inf } }.toTypedArray()
262+ }
263+ val NANS by lazy {
264+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.naN }
265+ .plus(arrayOf(" nan" , " na" , " n/a" ))
266+ .toTypedArray()
267+ }
268+ val GROUPING_SEPARATORS by lazy {
269+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.groupingSeparator }.toCharArray()
270+ }
271+ val DECIMAL_SEPARATORS by lazy {
272+ allDecimalFormatSymbols.flatMapTo(mutableSetOf ()) {
273+ listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
274+ }.toCharArray()
275+ }
276+ val EXPONENTS by lazy {
277+ allDecimalFormatSymbols.mapNotNullTo(mutableSetOf ()) { it.exponentSeparator }.toTypedArray()
278+ }
279+ val WHITE_SPACES = charArrayOf(' ' , ' \u00A0 ' , ' \u2009 ' , ' \u202F ' , ' \t ' )
280+ }
238281}
0 commit comments