From 8655adc6a8936dc299af3a65b1b22368b5a09d92 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 29 Jan 2025 15:12:40 +0100 Subject: [PATCH 1/6] set useFastDoubleParser parser option to true by default and updated KDocs --- .../org/jetbrains/kotlinx/dataframe/api/convert.kt | 9 +++++---- .../org/jetbrains/kotlinx/dataframe/api/parse.kt | 11 +++++++++-- .../jetbrains/kotlinx/dataframe/impl/api/parse.kt | 14 ++++++++++++-- .../kotlinx/dataframe/impl/io/FastDoubleParser.kt | 4 ++-- .../kotlinx/dataframe/documentation/DelimParams.kt | 2 -- .../kotlinx/dataframe/impl/io/readDelim.kt | 6 +----- 6 files changed, 29 insertions(+), 17 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 61d03b3ff7..071fb15fc9 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -37,6 +37,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl import org.jetbrains.kotlinx.dataframe.impl.headPlusArray import org.jetbrains.kotlinx.dataframe.io.toDataFrame +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import java.math.BigDecimal import java.math.BigInteger import java.net.URL @@ -223,8 +224,8 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColum * @include [DataColumnStringConvertToDoubleDoc] * @param nullStrings a set of strings that should be treated as `null` values. * The default in [DataFrame.parser][DataFrame.Companion.parser] is ["null", "NULL", "NA", "N/A"]. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. - * The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now. + * @param useFastDoubleParser whether to use [FastDoubleParser]. 
+ * The default in [DataFrame.parser][DataFrame.Companion.parser] is `true`. */ @JvmName("convertToDoubleFromString") public fun DataColumn.convertToDouble( @@ -243,8 +244,8 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColu * @include [DataColumnStringConvertToDoubleDoc] * @param nullStrings a set of strings that should be treated as `null` values. * The default in [DataFrame.parser][DataFrame.Companion.parser] is ["null", "NULL", "NA", "N/A"]. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser. - * The default in [DataFrame.parser][DataFrame.Companion.parser] is `false` for now. + * @param useFastDoubleParser whether to use [FastDoubleParser]. + * The default in [DataFrame.parser][DataFrame.Companion.parser] is `true`. */ @JvmName("convertToDoubleFromStringNullable") public fun DataColumn.convertToDouble( diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index c208e2a4ac..b68f234a80 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -11,6 +11,7 @@ import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import org.jetbrains.kotlinx.dataframe.impl.api.StringParser import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl +import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.io.readCSV import org.jetbrains.kotlinx.dataframe.typeClass import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS @@ -45,6 +46,12 @@ public fun DataFrame.parse(vararg columns: ColumnReference, options public fun DataFrame.parse(vararg columns: KProperty, options: ParserOptions? = null): DataFrame = parse(options) { columns.toColumnSet() } +/** + * Global counterpart of [ParserOptions]. 
+ * Settings changed here will affect the defaults for all parsing operations. + * + * The default values are set by [Parsers.resetToDefault]. + */ public interface GlobalParserOptions { public fun addDateTimePattern(pattern: String) @@ -54,7 +61,7 @@ public interface GlobalParserOptions { /** This function can be called to skip some types. Parsing will be attempted for all other types. */ public fun addSkipType(type: KType) - /** Whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. */ + /** Whether to use [FastDoubleParser], defaults to `true`. Please report any issues you encounter. */ public var useFastDoubleParser: Boolean public fun resetToDefault() @@ -91,7 +98,7 @@ public interface GlobalParserOptions { * `["null", "NULL", "NA", "N/A"]`. * @param skipTypes a set of types that should be skipped during parsing. Parsing will be attempted for all other types. * By default, it's an empty set. To skip all types except a specified one, use [convertTo] instead. - * @param useFastDoubleParser whether to use the new _experimental_ FastDoubleParser, defaults to `false` for now. + * @param useFastDoubleParser whether to use [FastDoubleParser], defaults to `true`. Please report any issues you encounter. */ public class ParserOptions( public val locale: Locale? 
= null, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 239c22d5c4..50d12b3db1 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -25,11 +25,13 @@ import org.jetbrains.kotlinx.dataframe.api.isColumnGroup import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf import org.jetbrains.kotlinx.dataframe.api.map +import org.jetbrains.kotlinx.dataframe.api.parser import org.jetbrains.kotlinx.dataframe.api.to import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.hasNulls +import org.jetbrains.kotlinx.dataframe.impl.asNullable import org.jetbrains.kotlinx.dataframe.impl.canParse import org.jetbrains.kotlinx.dataframe.impl.catchSilent import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType @@ -47,6 +49,7 @@ import java.time.format.DateTimeFormatterBuilder import java.time.temporal.Temporal import java.time.temporal.TemporalQuery import java.util.Locale +import kotlin.properties.Delegates import kotlin.reflect.KClass import kotlin.reflect.KType import kotlin.reflect.full.withNullability @@ -114,6 +117,13 @@ internal class StringParserWithFormat( } } +/** + * Central implementation for [GlobalParserOptions]. + * + * Can be obtained by a user by calling [DataFrame.parser][DataFrame.Companion.parser]. + * + * Defaults are set by [resetToDefault]. 
+ */ internal object Parsers : GlobalParserOptions { private val formatters: MutableList = mutableListOf() @@ -140,7 +150,7 @@ internal object Parsers : GlobalParserOptions { skipTypesSet.add(type) } - override var useFastDoubleParser: Boolean = false + override var useFastDoubleParser by Delegates.notNull() private var _locale: Locale? = null @@ -165,7 +175,7 @@ internal object Parsers : GlobalParserOptions { .toFormatter() .let { formatters.add(it) } - useFastDoubleParser = false + useFastDoubleParser = true _locale = null nullStrings.addAll(listOf("null", "NULL", "NA", "N/A")) } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt index 47361dd0a5..ad597e284a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -24,8 +24,8 @@ private val NANS = arrayOf("nan", "na", "n/a") /** * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double]. * - * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with an _EXPERIMENTAL_ - * fast double parser, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). + * If [ParserOptions.useFastDoubleParser] is enabled, it will try to parse the input with the + * fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). * If not, or if it fails, it will use [NumberFormat] to parse the input. * * Public, so it can be used in other modules. 
diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt index aa75feb6a2..c0f39c79c3 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/documentation/DelimParams.kt @@ -138,8 +138,6 @@ internal object DelimParams { * ([DataFrame.parser][DataFrame.Companion.parser]) will be queried. * * The only exceptions are: - * - [useFastDoubleParser][ParserOptions.useFastDoubleParser], which will default to `true`, - * regardless of the global setting. * - [nullStrings][ParserOptions.nullStrings], which, if `null`, * will take the global setting + {@include [DefaultNullStringsContentLink]}. * - [skipTypes][ParserOptions.skipTypes], which will always add [typesDeephavenAlreadyParses] to diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index bf37bc86f3..dec844836c 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -112,11 +112,7 @@ internal fun readDelimImpl( ): DataFrame<*> { // set up the csv specs val csvSpecs = with(CsvSpecs.builder()) { - // turn on fast double parser if not explicitly set regardless of the global parser options - @Suppress("NullableBooleanElvis") - val adjustedParserOptions = (parserOptions ?: ParserOptions()) - .copy(useFastDoubleParser = parserOptions?.useFastDoubleParser ?: true) - customDoubleParser(DataFrameCustomDoubleParser(adjustedParserOptions)) + customDoubleParser(DataFrameCustomDoubleParser(parserOptions)) // use the given nullStrings if provided, else take the global ones + some extras val nullStrings = 
parserOptions?.nullStrings ?: (DataFrame.parser.nulls + DEFAULT_DELIM_NULL_STRINGS) From db7c57dd02b40d755d12ffd62f4fa89dd35e3162 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 29 Jan 2025 16:46:53 +0100 Subject: [PATCH 2/6] Fixed test: converting String to Double in different locales --- .../kotlinx/dataframe/io/ParserTests.kt | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 64d2ced7b1..3efc180f08 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -1,6 +1,5 @@ package org.jetbrains.kotlinx.dataframe.io -import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime import kotlinx.datetime.LocalTime @@ -145,9 +144,9 @@ class ParserTests { parsed.toList() shouldBe listOf(1, 2, null, 3, null, null, 4.0, 5.0) } - @Test // This does not yet use fastDoubleParser! + @Test fun `converting String to Double in different locales`() { - val currentLocale = Locale.getDefault() + val systemLocale = Locale.getDefault() try { // Test 45 behaviour combinations: @@ -157,11 +156,12 @@ class ParserTests { val columnMixed = columnOf("12.345", "67,890") // * // (3 locales as converting parameter + original converting + original converting to nullable) - val parsingLocaleNotDefined: Locale? = null + val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") // * // 3 system locales + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) @@ -181,9 +181,13 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("en-US")) @@ -203,33 +207,42 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) + + // -------------------------------------------------------------------------------- Locale.setDefault(Locale.forLanguageTag("ru-RU")) 
columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 67.89) - columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) + // uses fallback to ROOT locale + columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 67.89) - columnMixed.convertTo() shouldBe columnOf(12.345, 67890.0) + // uses fallback to ROOT locale + columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) - columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67890.0) + // uses fallback to ROOT locale + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + // uses fallback to ROOT locale + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) } finally { - Locale.setDefault(currentLocale) + Locale.setDefault(systemLocale) } } From 8044d953ce80341a83464fd153f9ca9c786ab1e6 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 29 Jan 2025 17:32:22 +0100 Subject: [PATCH 3/6] adding comma grouping double parser test --- .../kotlinx/dataframe/io/ParserTests.kt | 96 +++++++++++++++++++ 1 file changed, 96 
insertions(+) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 3efc180f08..6d0f06f72f 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -1,5 +1,6 @@ package org.jetbrains.kotlinx.dataframe.io +import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime import kotlinx.datetime.LocalTime @@ -157,7 +158,9 @@ class ParserTests { // * // (3 locales as converting parameter + original converting + original converting to nullable) val parsingLocaleNotDefined: Locale? = null // takes parserOptions.locale ?: Locale.getDefault() + // uses dot as decimal separator, comma as grouping separator val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + // uses comma as decimal separator, NBSP as grouping separator val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") // * // 3 system locales @@ -246,6 +249,99 @@ class ParserTests { } } + @Test + fun `converting String to Double in different locales with comma grouping`() { + val systemLocale = Locale.getDefault() + try { + // Test 45 behaviour combinations: + + // 3 source columns + val columnDot = columnOf("123,456.789", "0,987,654.321") + val columnComma = columnOf("123.456,789", "0.987.654,321") + val columnMixed = columnOf("123,456.789", "0.987.654,321") + // * + // (3 locales as converting parameter + original converting + original converting to nullable) + val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() + val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("nl-NL") + // * + // 3 system locales + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleNotDefined) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("en-US")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertTo() } + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { 
columnComma.convertToDouble(parsingLocaleNotDefined) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("nl-NL")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertTo() } + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertTo() } + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleNotDefined) } + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnComma.convertToDouble(parsingLocaleUsesDot) } + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesDot) } + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + } finally { + Locale.setDefault(systemLocale) + } + } + /** Checks fix for [Issue #593](https://github.com/Kotlin/dataframe/issues/593) */ @Test fun `Mixing 
null and json`() { From 89e0d416b0127e0b49b09af5531333b774eb9731 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 10 Feb 2025 21:33:07 +0100 Subject: [PATCH 4/6] Improved fallback mechanism of FastDoubleParser to take into account all other locales, not just ROOT. Finished parse tests --- .../kotlinx/dataframe/api/convert.kt | 2 +- .../kotlinx/dataframe/impl/api/parse.kt | 1 - .../dataframe/impl/io/FastDoubleParser.kt | 220 +++++++++++------- .../kotlinx/dataframe/io/ParserTests.kt | 131 ++++++++++- 4 files changed, 255 insertions(+), 99 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 973a2340da..69075f2f6c 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -36,8 +36,8 @@ import org.jetbrains.kotlinx.dataframe.impl.api.toLocalDateTime import org.jetbrains.kotlinx.dataframe.impl.api.toLocalTime import org.jetbrains.kotlinx.dataframe.impl.api.withRowCellImpl import org.jetbrains.kotlinx.dataframe.impl.headPlusArray -import org.jetbrains.kotlinx.dataframe.io.toDataFrame import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser +import org.jetbrains.kotlinx.dataframe.io.toDataFrame import java.math.BigDecimal import java.math.BigInteger import java.net.URL diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 50d12b3db1..d2da7201f7 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -31,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import 
org.jetbrains.kotlinx.dataframe.hasNulls -import org.jetbrains.kotlinx.dataframe.impl.asNullable import org.jetbrains.kotlinx.dataframe.impl.canParse import org.jetbrains.kotlinx.dataframe.impl.catchSilent import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt index ad597e284a..815b404cb8 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/FastDoubleParser.kt @@ -5,7 +5,6 @@ import ch.randelshofer.fastdoubleparser.NumberFormatSymbols import io.github.oshai.kotlinlogging.KotlinLogging import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.ParserOptions -import org.jetbrains.kotlinx.dataframe.api.parser import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import java.nio.charset.Charset import java.text.DecimalFormatSymbols @@ -15,12 +14,6 @@ import java.util.Locale private val logger = KotlinLogging.logger {} -// (lowercase) strings that are recognized to represent infinity and NaN in doubles in all locales -private val INFINITIES = arrayOf("∞", "inf", "infinity", "infty") -private val PLUS_INFINITIES = INFINITIES.map { "+$it" } -private val MINUS_INFINITIES = INFINITIES.map { "-$it" } -private val NANS = arrayOf("nan", "na", "n/a") - /** * Parses a [String]/[CharSequence], [CharArray], or [ByteArray] into a [Double]. * @@ -28,6 +21,17 @@ private val NANS = arrayOf("nan", "na", "n/a") * fast double parser library, [FastDoubleParser](https://github.com/wrandelshofer/FastDoubleParser). * If not, or if it fails, it will use [NumberFormat] to parse the input. 
* + * The [locale][locale] used by the double parser is determined as follows: + * + * [parserOptions][parserOptions]`?.`[locale][ParserOptions.locale]` ?: `[Parsers.locale][Parsers.locale]` ?: `[Locale.getDefault()][Locale.getDefault] + * + * [FastDoubleParser] has a fallback mechanism; in practice, this means it can recognize symbols and notations + * of any locale recognized by Java as long as that symbol does not conflict with the given locale. + * + * For example, if your locale uses ',' as decimal separator, it will NOT recognize ',' as thousands separator, + * but it will recognize ' ', '٬', '_', ' ', etc. as such. + * The same holds for characters like "e", "inf", "×10^", "NaN", etc. + * * Public, so it can be used in other modules. * * @param parserOptions can be supplied to configure the parser. @@ -41,106 +45,103 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null) private val useFastDoubleParser = parserOptions?.useFastDoubleParser ?: Parsers.useFastDoubleParser private val locale = parserOptions?.locale ?: Parsers.locale - private val fallbackLocale = Locale.ROOT - - private val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) - private val fallbackDecimalFormatSymbols = DecimalFormatSymbols.getInstance(fallbackLocale) private val parser = ConfigurableDoubleParser(/* symbols = */ setupNumberFormatSymbols(), /* ignoreCase = */ true) /** * Sets up the [NumberFormatSymbols] for the [ConfigurableDoubleParser] based on - * [localDecimalFormatSymbols] with fallbacks from [fallbackDecimalFormatSymbols]. + * the [locale] with fallbacks from all other locales. * * Fallback characters/strings are only added if they're not clashing with local characters/strings. 
*/ - private fun setupNumberFormatSymbols(): NumberFormatSymbols { - // collect all chars and strings that are locale-specific such that we can check whether - // fallback chars and strings are safe to add - val localChars = with(localDecimalFormatSymbols) { - buildSet { - add(decimalSeparator.lowercaseChar()) - add(groupingSeparator.lowercaseChar()) - add(minusSign.lowercaseChar()) - add('+') - add(zeroDigit.lowercaseChar()) + private fun setupNumberFormatSymbols(): NumberFormatSymbols = + numberFormatSymbolsCache.getOrPut(locale) { + val localDecimalFormatSymbols = DecimalFormatSymbols.getInstance(locale) + + // collect all chars and strings that are locale-specific such that we can check whether + // fallback chars and strings are safe to add + val localChars = with(localDecimalFormatSymbols) { + buildSet { + add(decimalSeparator.lowercaseChar()) + add(groupingSeparator.lowercaseChar()) + add(minusSign.lowercaseChar()) + add('+') + // we don't include zeroDigit here, for notations like ×10^ + } } - } - val localStrings = with(localDecimalFormatSymbols) { - buildSet { - add(exponentSeparator.lowercase()) - add(infinity.lowercase()) - add(naN.lowercase()) + val localStrings = with(localDecimalFormatSymbols) { + buildSet { + add(exponentSeparator.lowercase()) + add(infinity.lowercase()) + add(naN.lowercase()) + } } - } - /** - * Builds a set with the specified char from [localDecimalFormatSymbols] and - * its fallback char from [fallbackDecimalFormatSymbols] if it's safe to do so. - * [additionals] will be added to the set too, when they're safe to add. 
- */ - fun ((DecimalFormatSymbols) -> Char).fromLocalWithFallBack(vararg additionals: Char): Set = - buildSet { - val getChar = this@fromLocalWithFallBack - val char = getChar(localDecimalFormatSymbols).lowercaseChar() - add(char) - - // add fallback char if it's safe to do so - val fallbackChar = getChar(fallbackDecimalFormatSymbols).lowercaseChar() - if (fallbackChar !in localChars && !localStrings.any { fallbackChar in it }) { - add(fallbackChar) - } + /** + * Builds a set with the specified char from [this] and + * [fallbackChars] will be added to the set too, when they're safe to add. + */ + fun Char.withFallback(fallbackChars: CharArray): Set = + buildSet { + val char = this@withFallback.lowercaseChar() + add(char) - // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. - if (char.isWhitespace()) add(' ') + // Treat NBSP and other whitespace characters the same. + if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable()) - // add additional chars if needed - for (additional in additionals) { - val lowercase = additional.lowercaseChar() - if (lowercase !in localChars && !localStrings.any { lowercase in it }) { - add(lowercase) + // add fallback chars if needed + for (char in fallbackChars) { + val lowercase = char.lowercaseChar() + if (lowercase !in localChars && !localStrings.any { lowercase in it }) { + add(lowercase) + } + + // Treat NBSP and other whitespace characters the same. + if (char.isWhitespace()) addAll(WHITE_SPACES.asIterable()) } } - } - /** - * Builds a set with the specified string from [localDecimalFormatSymbols] and - * its fallback string from [fallbackDecimalFormatSymbols] if it's safe to do so. - * [additionals] will be added to the set too, when they're safe to add. 
- */ - fun ((DecimalFormatSymbols) -> String).fromLocalWithFallBack(vararg additionals: String): Set = - buildSet { - val getString = this@fromLocalWithFallBack - val string = getString(localDecimalFormatSymbols).lowercase() - add(string) - - // add fallback string if it's safe to do so - val fallbackString = getString(fallbackDecimalFormatSymbols).lowercase() - if (!fallbackString.any { it in localChars } && fallbackString !in localStrings) { - add(fallbackString) - } + /** + * Builds a set with the specified string from [this] and + * [fallbackStrings] will be added to the set too, when they're safe to add. + */ + fun String.withFallback(fallbackStrings: Array): Set = + buildSet { + val string = this@withFallback.lowercase() + add(string) + + // Treat NBSP and other whitespace characters the same. + if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() }) - // Fixes NBSP and other whitespace characters not being recognized if the user writes space instead. - if (string.isBlank()) add(" ") + // add fallback strings if needed + for (string in fallbackStrings) { + val lowercase = string.lowercase() + if (!lowercase.any { it in localChars } && lowercase !in localStrings) { + add(lowercase) + } - // add additional strings if needed - for (additional in additionals) { - val lowercase = additional.lowercase() - if (!lowercase.any { it in localChars } && lowercase !in localStrings) { - add(lowercase) + // Treat NBSP and other whitespace characters the same. 
+ if (string.isBlank()) addAll(WHITE_SPACES.map { it.toString() }) } } - } - return NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) - .withPlusSign(setOf('+')) - .withDecimalSeparator(DecimalFormatSymbols::getDecimalSeparator.fromLocalWithFallBack()) - .withGroupingSeparator(DecimalFormatSymbols::getGroupingSeparator.fromLocalWithFallBack()) - .withExponentSeparator(DecimalFormatSymbols::getExponentSeparator.fromLocalWithFallBack()) - .withMinusSign(DecimalFormatSymbols::getMinusSign.fromLocalWithFallBack()) - .withInfinity(DecimalFormatSymbols::getInfinity.fromLocalWithFallBack(*INFINITIES)) - .withNaN(DecimalFormatSymbols::getNaN.fromLocalWithFallBack(*NANS)) - } + NumberFormatSymbols.fromDecimalFormatSymbols(localDecimalFormatSymbols) + .withPlusSign( + setOf('+'), + ).withDecimalSeparator( + localDecimalFormatSymbols.decimalSeparator.withFallback(DECIMAL_SEPARATORS), + ).withGroupingSeparator( + localDecimalFormatSymbols.groupingSeparator.withFallback(GROUPING_SEPARATORS), + ).withExponentSeparator( + localDecimalFormatSymbols.exponentSeparator.withFallback(EXPONENTS), + ).withMinusSign( + localDecimalFormatSymbols.minusSign.withFallback(MINUS_SIGNS), + ).withInfinity( + localDecimalFormatSymbols.infinity.withFallback(INFINITIES), + ).withNaN( + localDecimalFormatSymbols.naN.withFallback(NANS), + ) + } /** Fallback method for parsing doubles. */ private fun String.parseToDoubleOrNullFallback(): Double? = @@ -152,7 +153,7 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? = null) in NANS -> Double.NaN else -> { - // not thread safe; must be created here + // NumberFormat is not thread safe; must be created in the function body val numberFormat = NumberFormat.getInstance(locale) val parsePosition = ParsePosition(0) val result = numberFormat.parse(this, parsePosition)?.toDouble() @@ -235,4 +236,49 @@ public class FastDoubleParser(private val parserOptions: ParserOptions? 
= null) } return String(chars = ca, offset = offset, length = length).parseToDoubleOrNullFallback() } + + /** + * Here we store all possible decimal format symbols of all locales on the system. + * These will be used as fallbacks for the selected locale. + * They are only added by [withFallback] if they don't interfere with symbols already in the provided [locale] + * (so ',' is not added as grouping separator if ',' is already the locale's decimal separator). + */ + internal companion object { + private val allDecimalFormatSymbols by lazy { + Locale.getAvailableLocales().map { DecimalFormatSymbols.getInstance(it) } + } + val MINUS_SIGNS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.minusSign }.toCharArray() + } + val INFINITIES by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.infinity } + .plus(arrayOf("∞", "inf", "infinity", "infty")) + .toTypedArray() + } + val PLUS_INFINITIES by lazy { INFINITIES.map { "+$it" }.toTypedArray() } + val MINUS_INFINITIES by lazy { + INFINITIES.flatMap { inf -> MINUS_SIGNS.map { min -> min + inf } }.toTypedArray() + } + val NANS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.naN } + .plus(arrayOf("nan", "na", "n/a")) + .toTypedArray() + } + val WHITE_SPACES = charArrayOf(' ', '\u00A0', '\u2009', '\u202F', '\t') + val GROUPING_SEPARATORS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.groupingSeparator } + .plus(arrayOf('\'', '˙', *WHITE_SPACES.toTypedArray())) + .toCharArray() + } + val DECIMAL_SEPARATORS by lazy { + allDecimalFormatSymbols.flatMapTo(mutableSetOf()) { + listOfNotNull(it.decimalSeparator, it.monetaryDecimalSeparator) + }.plus(arrayOf('·', '⎖')) + .toCharArray() + } + val EXPONENTS by lazy { + allDecimalFormatSymbols.mapNotNullTo(mutableSetOf()) { it.exponentSeparator }.toTypedArray() + } + val numberFormatSymbolsCache = mutableMapOf() + } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt
b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 6d0f06f72f..553fac5961 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -184,10 +184,10 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - // uses fallback to ROOT locale + // uses fallback mechanism columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) // -------------------------------------------------------------------------------- @@ -210,10 +210,10 @@ class ParserTests { columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - // uses fallback to ROOT locale + // uses fallback mechanism columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) // -------------------------------------------------------------------------------- @@ -222,33 +222,141 @@ class ParserTests { columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertTo() shouldBe columnOf(12.345, 67.89) columnComma.convertTo() shouldBe columnOf(12.345, 
67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertTo() shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(12.345, 67.89) columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12345.0, 67890.0) columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(12.345, 67890.0) - // uses fallback to ROOT locale + // uses fallback mechanism columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) - // uses fallback to ROOT locale + // uses fallback mechanism columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(12.345, 67.89) } finally { Locale.setDefault(systemLocale) } } + @Test + fun `converting String to Double in different locales with NBSP grouping`() { + val systemLocale = Locale.getDefault() + try { + // Test 45 behaviour combinations: + + // 3 source columns + val columnDot = columnOf("123 456.789", "0 987 654.321") + val columnComma = columnOf("123 456,789", "0 987 654,321") + val columnMixed = columnOf( + "123 456.789", + "0'987 654,321", // note the use of two different thousands grouping characters + ) + // * + // (3 locales as converting parameter + original converting + original converting to nullable) + val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() + // uses dot as decimal separator, comma as grouping separator + val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + // uses comma as decimal separator, NBSP as grouping separator + val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") + // * + // 3 system locales + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + + // -------------------------------------------------------------------------------- + + 
Locale.setDefault(Locale.forLanguageTag("en-US")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654_321.0) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + + // -------------------------------------------------------------------------------- + + Locale.setDefault(Locale.forLanguageTag("ru-RU")) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + 
columnMixed.convertTo() shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleNotDefined) shouldBe columnOf(123_456.789, 987_654.321) + + columnDot.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654.321) + // parses correctly but may be surprising + columnComma.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456_789.0, 987_654_321.0) + columnMixed.convertToDouble(parsingLocaleUsesDot) shouldBe columnOf(123_456.789, 987_654_321.0) + + // uses fallback mechanism + columnDot.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + columnComma.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + // uses fallback mechanism + columnMixed.convertToDouble(parsingLocaleUsesComma) shouldBe columnOf(123_456.789, 987_654.321) + } finally { + Locale.setDefault(systemLocale) + } + } + @Test fun `converting String to Double in different locales with comma grouping`() { val systemLocale = Locale.getDefault() @@ -258,7 +366,10 @@ class ParserTests { // 3 source columns val columnDot = columnOf("123,456.789", "0,987,654.321") val columnComma = columnOf("123.456,789", "0.987.654,321") - val columnMixed = columnOf("123,456.789", "0.987.654,321") + val columnMixed = columnOf( + "123,456.789", + "0'987.654,321", // note the use of two different thousands grouping characters + ) // * // (3 locales as converting parameter + original converting + original converting to nullable) val parsingLocaleNotDefined: Locale? 
= null // takes parserOptions.locale ?: Locale.getDefault() From 1d12cba1a3e768abeb589b8ca3ba51262379cd72 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 11 Feb 2025 13:53:50 +0100 Subject: [PATCH 5/6] updated docs regarding double parsing --- docs/StardustDocs/topics/convert.md | 2 +- docs/StardustDocs/topics/parse.md | 71 +++++++++++++++++++++++++++-- 2 files changed, 68 insertions(+), 5 deletions(-) diff --git a/docs/StardustDocs/topics/convert.md b/docs/StardustDocs/topics/convert.md index bd7e2088e5..6cbe4e8e5f 100644 --- a/docs/StardustDocs/topics/convert.md +++ b/docs/StardustDocs/topics/convert.md @@ -44,7 +44,7 @@ df.convert { name }.asFrame { it.add("fullName") { "$firstName $lastName" } } * `Int` (and `Char`) * `Long` * `Float` -* `Double` +* `Double` (See [parsing doubles](parse.md#parsing-doubles) for `String` to `Double` conversion) * `BigDecimal` * `BigInteger` * `LocalDateTime` (kotlinx.datetime and java.time) diff --git a/docs/StardustDocs/topics/parse.md b/docs/StardustDocs/topics/parse.md index a8dbd5806e..30e70fd1f4 100644 --- a/docs/StardustDocs/topics/parse.md +++ b/docs/StardustDocs/topics/parse.md @@ -5,6 +5,10 @@ Returns a [`DataFrame`](DataFrame.md) in which the given `String` columns are pa This is a special case of the [convert](convert.md) operation. +This parsing operation is sometimes executed implicitly, for example, when [reading from CSV](read.md) or +[type converting from `String` columns](convert.md). +You can recognize this by the `locale` or `parserOptions` arguments in these functions. 
+ ```kotlin @@ -25,6 +29,8 @@ df.parse { age and weight } +### Parsing Order + `parse` tries to parse every `String` column into one of supported types in the following order: * `Int` * `Long` @@ -34,16 +40,30 @@ df.parse { age and weight } * `Duration` (`kotlin.time` and `java.time`) * `LocalTime` (`java.time`) * `URL` (`java.net`) -* `Double` (with optional locale settings) +* [`Double` (with optional locale settings)](#parsing-doubles) * `Boolean` * `BigDecimal` * `JSON` (arrays and objects) +### Parser Options + +DataFrame supports multiple parser options that can be used to customize the parsing behavior. +These can be supplied to the `parse` function (or any other function that can implicitly parse `Strings`) +as an argument: + Available parser options: -* `locale: Locale` is used to parse doubles +* `locale: Locale` is used to [parse doubles](#parsing-doubles) + * Default locale is `Locale.getDefault()` * `dateTimePattern: String` is used to parse date and time * `dateTimeFormatter: DateTimeFormatter` is used to parse date and time -* `nullStrings: List` is used to treat particular strings as `null` value. Default null strings are **"null"** and **"NULL"** +* `nullStrings: List` is used to treat particular strings as `null` value + * Default null strings are **"null"** and **"NULL"** + * When [reading from CSV](read.md), we include even more defaults, like **""**, and **"NA"**. 
+ See the KDocs there for the exact details +* `skipTypes: Set` types that should be skipped during parsing + * Empty set by default; parsing can result in any supported type +* `useFastDoubleParser: Boolean` is used to enable or disable the [new fast double parser](#parsing-doubles) + * Enabled by default @@ -54,8 +74,13 @@ df.parse(options = ParserOptions(locale = Locale.CHINA, dateTimeFormatter = Date +### Global Parser Options + You can also set global parser options that will be used by default in [`read`](read.md), [`convert`](convert.md), -and `parse` operations: +and other `parse` operations. +These can be seen as a global fallback for the `parserOptions` argument. + +For example, to change the locale to French and add a custom date-time pattern: @@ -64,4 +89,42 @@ DataFrame.parser.locale = Locale.FRANCE DataFrame.parser.addDateTimePattern("dd.MM.uuuu HH:mm:ss") ``` +This means that the locale being used by the parser is defined as: + +↪ The locale given as function argument directly, or in `parserOptions`, if it is not `null`, else + +    ↪ The locale set by `DataFrame.parser.locale = ...`, if it is not `null`, else + +        ↪ `Locale.getDefault()`, which is the system's default locale that can be changed with `Locale.setDefault()`. + +### Parsing Doubles + +DataFrame has a new fast and powerful double parser enabled by default. +It is based on [the FastDoubleParser library](https://github.com/wrandelshofer/FastDoubleParser) for its +high performance and configurability +(in the future, we might expand this support to `Float`, `BigDecimal`, and `BigInteger` as well). + +The parser is locale-aware; it will use the locale set by the [parser options](#parser-options) to parse the doubles. +It also has a fallback mechanism built in, meaning it can recognize characters from +all other locales (and some from [Wikipedia](https://en.wikipedia.org/wiki/Decimal_separator)) +and parse them correctly as long as they don't conflict with the current locale. 
+ +For example, if your locale uses ',' as decimal separator, it will not recognize ',' as thousands separator, but it will +recognize ''', ' ', '٬', '_', ' ', etc. as such. +The same holds for characters like "e", "inf", "×10^", "NaN", etc. (ignoring case). + +This means you can safely parse `"123'456 789,012.345×10^6"` with a US locale but not `"1.234,5"`. + +Aside from this, DataFrame also explicitly recognizes "∞", "inf", "infinity", and "infty" as `Double.POSITIVE_INFINITY` +(as well as their negative counterparts), "nan", "na", and "n/a" as `Double.NaN`, +and all forms of whitespace are treated equally. + +If `FastDoubleParser` fails to parse a `String` as `Double`, DataFrame will try +to parse it using the standard `NumberFormat.parse()` function as a last resort. + +If you experience any issues with the new parser, you can turn it off by setting +`useFastDoubleParser = false`, which will use the old `NumberFormat.parse()` function instead. + +Please [report](https://github.com/Kotlin/dataframe/issues) any issues you encounter. + From 08570f91ead6287aff7fcaeea4d5d6c3da130cd9 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 12 Feb 2025 14:26:36 +0100 Subject: [PATCH 6/6] small clarification of parsing docs --- docs/StardustDocs/topics/parse.md | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/docs/StardustDocs/topics/parse.md b/docs/StardustDocs/topics/parse.md index 30e70fd1f4..7a4460b071 100644 --- a/docs/StardustDocs/topics/parse.md +++ b/docs/StardustDocs/topics/parse.md @@ -49,21 +49,26 @@ df.parse { age and weight } DataFrame supports multiple parser options that can be used to customize the parsing behavior. These can be supplied to the `parse` function (or any other function that can implicitly parse `Strings`) -as an argument: +as an argument. + +For each option you don't supply (or supply `null`) DataFrame will take the value from the +[Global Parser Options](#global-parser-options). 
Available parser options: * `locale: Locale` is used to [parse doubles](#parsing-doubles) - * Default locale is `Locale.getDefault()` + * Global default locale is `Locale.getDefault()` * `dateTimePattern: String` is used to parse date and time + * Global default supports ISO (local) date-time * `dateTimeFormatter: DateTimeFormatter` is used to parse date and time + * Is derived from `dateTimePattern` and/or `locale` if `null` * `nullStrings: List` is used to treat particular strings as `null` value - * Default null strings are **"null"** and **"NULL"** + * Global default null strings are **"null"** and **"NULL"** * When [reading from CSV](read.md), we include even more defaults, like **""**, and **"NA"**. See the KDocs there for the exact details * `skipTypes: Set` types that should be skipped during parsing - * Empty set by default; parsing can result in any supported type + * Empty set by global default; parsing can result in any supported type * `useFastDoubleParser: Boolean` is used to enable or disable the [new fast double parser](#parsing-doubles) - * Enabled by default + * Enabled by global default @@ -76,11 +81,12 @@ df.parse(options = ParserOptions(locale = Locale.CHINA, dateTimeFormatter = Date ### Global Parser Options -You can also set global parser options that will be used by default in [`read`](read.md), [`convert`](convert.md), -and other `parse` operations. -These can be seen as a global fallback for the `parserOptions` argument. +As mentioned before, you can change the default global parser options that will be used by [`read`](read.md), +[`convert`](convert.md), and other `parse` operations. +Whenever you don't explicitly provide [parser options](#parser-options) to a function call, +DataFrame will use these global options instead. 
-For example, to change the locale to French and add a custom date-time pattern: +For example, to change the locale to French and add a custom date-time pattern for all following DataFrame calls, do: @@ -89,7 +95,7 @@ DataFrame.parser.locale = Locale.FRANCE DataFrame.parser.addDateTimePattern("dd.MM.uuuu HH:mm:ss") ``` -This means that the locale being used by the parser is defined as: +For `locale`, this means that the one being used by the parser is defined as: ↪ The locale given as function argument directly, or in `parserOptions`, if it is not `null`, else @@ -104,7 +110,8 @@ It is based on [the FastDoubleParser library](https://github.com/wrandelshofer/F high performance and configurability (in the future, we might expand this support to `Float`, `BigDecimal`, and `BigInteger` as well). -The parser is locale-aware; it will use the locale set by the [parser options](#parser-options) to parse the doubles. +The parser is locale-aware; it will use the locale set by the +[(global)](#global-parser-options) [parser options](#parser-options) to parse the doubles. It also has a fallback mechanism built in, meaning it can recognize characters from all other locales (and some from [Wikipedia](https://en.wikipedia.org/wiki/Decimal_separator)) and parse them correctly as long as they don't conflict with the current locale.