Skip to content

Commit 89e0d41

Browse files
committed
Improved fallback mechanism of FastDoubleParser to take into account all other locales, not just ROOT. Finished parse tests
1 parent 6594189 commit 89e0d41

File tree

4 files changed

+255
-99
lines changed

4 files changed

+255
-99
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDateTime
3636
import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime
3737
import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl
3838
import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
39-
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
4039
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
40+
import org.jetbrains.kotlinx.dataframe.io.toDataFrame
4141
import java.math.BigDecimal
4242
import java.math.BigInteger
4343
import java.net.URL

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion
3131
import org.jetbrains.kotlinx.dataframe.columns.size
3232
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
3333
import org.jetbrains.kotlinx.dataframe.hasNulls
34-
import org.jetbrains.kotlinx.dataframe.impl.asNullable
3534
import org.jetbrains.kotlinx.dataframe.impl.canParse
3635
import org.jetbrains.kotlinx.dataframe.impl.catchSilent
3736
import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt

Lines changed: 133 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols
55
import io.github.oshai.kotlinlogging.KotlinLogging
66
import org.jetbrains.kotlinx.dataframe.DataFrame
77
import org.jetbrains.kotlinx.dataframe.api.ParserOptions
8-
import org.jetbrains.kotlinx.dataframe.api.parser
98
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
109
import java.nio.charset.Charset
1110
import java.text.DecimalFormatSymbols
@@ -15,19 +14,24 @@ import java.util.Locale
1514

1615
private val logger = KotlinLogging.logger {}
1716

18-
// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales
19-
private val INFINITIES = arrayOf("", "inf", "infinity", "infty")
20-
private val PLUS_INFINITIES = INFINITIES.map { "+$it" }
21-
private val MINUS_INFINITIES = INFINITIES.map { "-$it" }
22-
private val NANS = arrayOf("nan", "na", "n/a")
23-
2417
/**
2518
* Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double].
2619
*
2720
* If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the
2821
* fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser).
2922
* If not, or if it fails, it will use [NumberFormat] to parse the input.
3023
*
24+
* The [locale][locale] used by the double parser is defined like:
25+
*
26+
* [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault]
27+
*
28+
* [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations
29+
* of any locale recognized by Java as long as that symbol does not conflict with the given locale.
30+
*
31+
* For example, if your locale uses ',' as decimal separator, it will NOT recognize ',' as thousands separator,
32+
* but it will recognize ' ', '٬', '_', ' ', etc. as such.
33+
* The same holds for characters like "e", "inf", "×10^", "NaN", etc.
34+
*
3135
* Public, so it can be used in other modules.
3236
*
3337
* @param parserOptions can be supplied to configure the parser.
@@ -41,106 +45,103 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
4145

4246
private val useFastDoubleParser = parserOptions?.useFastDoubleParser ?: Parsers.useFastDoubleParser
4347
private val locale = parserOptions?.locale ?: Parsers.locale
44-
private val fallbackLocale = Locale.ROOT
45-
46-
private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale)
47-
private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale)
4848

4949
private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true)
5050

5151
/**
5252
* Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on
53-
* [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols].
53+
* the [locale] with fallbacks from all other locales.
5454
*
5555
* Fallback characters/strings are only added if they're not clashing with local characters/strings.
5656
*/
57-
private fun setupNumberFormatSymbols(): NumberFormatSymbols {
58-
// collect all chars and strings that are locale-specific such that we can check whether
59-
// fallback chars and strings are safe to add
60-
val localChars = with(localDecimalFormatSymbols) {
61-
buildSet {
62-
add(decimalSeparator.lowercaseChar())
63-
add(groupingSeparator.lowercaseChar())
64-
add(minusSign.lowercaseChar())
65-
add('+')
66-
add(zeroDigit.lowercaseChar())
57+
private fun setupNumberFormatSymbols(): NumberFormatSymbols =
58+
numberFormatSymbolsCache.getOrPut(locale) {
59+
val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale)
60+
61+
// collect all chars and strings that are locale-specific such that we can check whether
62+
// fallback chars and strings are safe to add
63+
val localChars = with(localDecimalFormatSymbols) {
64+
buildSet {
65+
add(decimalSeparator.lowercaseChar())
66+
add(groupingSeparator.lowercaseChar())
67+
add(minusSign.lowercaseChar())
68+
add('+')
69+
// we don't include zeroDigit here, for notations like ×10^
70+
}
6771
}
68-
}
69-
val localStrings = with(localDecimalFormatSymbols) {
70-
buildSet {
71-
add(exponentSeparator.lowercase())
72-
add(infinity.lowercase())
73-
add(naN.lowercase())
72+
val localStrings = with(localDecimalFormatSymbols) {
73+
buildSet {
74+
add(exponentSeparator.lowercase())
75+
add(infinity.lowercase())
76+
add(naN.lowercase())
77+
}
7478
}
75-
}
7679

77-
/**
78-
* Builds a set with the specified char from [localDecimalFormatSymbols] and
79-
* its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so.
80-
* [additionals] will be added to the set too, when they're safe to add.
81-
*/
82-
fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set<Char> =
83-
buildSet {
84-
val getChar = this@fromLocalWithFallBack
85-
val char = getChar(localDecimalFormatSymbols).lowercaseChar()
86-
add(char)
87-
88-
// add fallback char if it's safe to do so
89-
val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar()
90-
if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) {
91-
add(fallbackChar)
92-
}
80+
/**
81+
* Builds a set containing the lowercased receiver char; the chars in
82+
* [fallbackChars] are added to the set too, when they're safe to add.
83+
*/
84+
fun Char.withFallback(fallbackChars: CharArray): Set<Char> =
85+
buildSet {
86+
val char = this@withFallback.lowercaseChar()
87+
add(char)
9388

94-
// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
95-
if (char.isWhitespace()) add(' ')
89+
// Treat NBSP and other whitespace characters the same.
90+
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())
9691

97-
// add additional chars if needed
98-
for (additional in additionals) {
99-
val lowercase = additional.lowercaseChar()
100-
if (lowercase !in localChars && !localStrings.any { lowercase in it }) {
101-
add(lowercase)
92+
// add fallback chars if needed
93+
for (char in fallbackChars) {
94+
val lowercase = char.lowercaseChar()
95+
if (lowercase !in localChars && !localStrings.any { lowercase in it }) {
96+
add(lowercase)
97+
}
98+
99+
// Treat NBSP and other whitespace characters the same.
100+
if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable())
102101
}
103102
}
104-
}
105103

106-
/**
107-
* Builds a set with the specified string from [localDecimalFormatSymbols] and
108-
* its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so.
109-
* [additionals] will be added to the set too, when they're safe to add.
110-
*/
111-
fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set<String> =
112-
buildSet {
113-
val getString = this@fromLocalWithFallBack
114-
val string = getString(localDecimalFormatSymbols).lowercase()
115-
add(string)
116-
117-
// add fallback string if it's safe to do so
118-
val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase()
119-
if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) {
120-
add(fallbackString)
121-
}
104+
/**
105+
* Builds a set containing the lowercased receiver string; the strings in
106+
* [fallbackStrings] are added to the set too, when they're safe to add.
107+
*/
108+
fun String.withFallback(fallbackStrings: Array<String>): Set<String> =
109+
buildSet {
110+
val string = this@withFallback.lowercase()
111+
add(string)
112+
113+
// Treat NBSP and other whitespace characters the same.
114+
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })
122115

123-
// Fixes NBSP and other whitespace characters not being recognized if the user writes space instead.
124-
if (string.isBlank()) add(" ")
116+
// add fallback strings if needed
117+
for (string in fallbackStrings) {
118+
val lowercase = string.lowercase()
119+
if (!lowercase.any { it in localChars } && lowercase !in localStrings) {
120+
add(lowercase)
121+
}
125122

126-
// add additional strings if needed
127-
for (additional in additionals) {
128-
val lowercase = additional.lowercase()
129-
if (!lowercase.any { it in localChars } && lowercase !in localStrings) {
130-
add(lowercase)
123+
// Treat NBSP and other whitespace characters the same.
124+
if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() })
131125
}
132126
}
133-
}
134127

135-
return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols)
136-
.withPlusSign(setOf('+'))
137-
.withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack())
138-
.withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack())
139-
.withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack())
140-
.withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack())
141-
.withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES))
142-
.withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS))
143-
}
128+
NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols)
129+
.withPlusSign(
130+
setOf('+'),
131+
).withDecimalSeparator(
132+
localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS),
133+
).withGroupingSeparator(
134+
localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS),
135+
).withExponentSeparator(
136+
localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS),
137+
).withMinusSign(
138+
localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS),
139+
).withInfinity(
140+
localDecimalFormatSymbols.infinity.withFallback(INFINITIES),
141+
).withNaN(
142+
localDecimalFormatSymbols.naN.withFallback(NANS),
143+
)
144+
}
144145

145146
/** Fallback method for parsing doubles. */
146147
private fun String.parseToDoubleOrNullFallback(): Double? =
@@ -152,7 +153,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
152153
in NANS -> Double.NaN
153154

154155
else -> {
155-
// not thread safe; must be created here
156+
// NumberFormat is not thread safe; must be created in the function body
156157
val numberFormat = NumberFormat.getInstance(locale)
157158
val parsePosition = ParsePosition(0)
158159
val result = numberFormat.parse(this, parsePosition)?.toDouble()
@@ -235,4 +236,49 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null)
235236
}
236237
return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback()
237238
}
239+
240+
/**
241+
* Here we store all possible decimal format symbols of all locales on the system.
242+
* These will be used as fallbacks for the selected locale.
243+
* They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale]
244+
* (so ',' is not added as grouping separator if ',' is already the locale's decimal separator).
245+
*/
246+
internal companion object {
247+
private val allDecimalFormatSymbols by lazy {
248+
Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) }
249+
}
250+
val MINUS_SIGNS by lazy {
251+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray()
252+
}
253+
val INFINITIES by lazy {
254+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity }
255+
.plus(arrayOf("", "inf", "infinity", "infty"))
256+
.toTypedArray()
257+
}
258+
val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() }
259+
val MINUS_INFINITIES by lazy {
260+
INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray()
261+
}
262+
val NANS by lazy {
263+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN }
264+
.plus(arrayOf("nan", "na", "n/a"))
265+
.toTypedArray()
266+
}
267+
val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t')
268+
val GROUPING_SEPARATORS by lazy {
269+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator }
270+
.plus(arrayOf('\'', '˙', *WHITE_SPACES.toTypedArray()))
271+
.toCharArray()
272+
}
273+
val DECIMAL_SEPARATORS by lazy {
274+
allDecimalFormatSymbols.flatMapTo(mutableSetOf()) {
275+
listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator)
276+
}.plus(arrayOf('·', ''))
277+
.toCharArray()
278+
}
279+
val EXPONENTS by lazy {
280+
allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray()
281+
}
282+
val numberFormatSymbolsCache = mutableMapOf<Locale, NumberFormatSymbols>()
283+
}
238284
}

0 commit comments

Comments
 (0)