Skip to content

Commit 4301051

Browse files
committed
POSIX Double parsing per column
1 parent 7572dbf commit 4301051

File tree

4 files changed

+155
-26
lines changed

4 files changed

+155
-26
lines changed

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.RowValueExpression
1616
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
1717
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
1818
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
19+
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
1920
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
2021
import org.jetbrains.kotlinx.dataframe.impl.api.convertRowColumnImpl
2122
import org.jetbrains.kotlinx.dataframe.impl.api.convertToTypeImpl
@@ -125,6 +126,48 @@ public fun <T : Any> DataColumn<T?>.convertToString(): DataColumn<String?> = con
125126
public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = convertTo()
126127
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()
127128

129+
/**
 * Parses a [String] column to [Double], taking the locale's number format into account.
 *
 * Every value is trimmed before parsing.
 *
 * If [locale] is given, its number format is used and a parsing failure is propagated to the caller.
 * If [locale] is `null`, the default converter is tried first (it uses the current system locale);
 * if the column cannot be parsed that way, the whole column is parsed again with the POSIX
 * (`C.UTF-8`) number format.
 *
 * @param locale locale whose number format is used for parsing, or `null` for "system locale, then POSIX".
 * @return a column of parsed [Double] values.
 * @throws TypeConversionException if a value cannot be parsed (after the POSIX fallback when [locale] is `null`).
 */
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> {
    // Parses the whole column with the converter for the given locale (null = default converter).
    fun parseWith(parserLocale: Locale?): DataColumn<Double> {
        @Suppress("UNCHECKED_CAST")
        val converter = Parsers.getDoubleConverter(parserLocale) as (String) -> Double?
        return map { converter(it.trim()) ?: error("Can't convert `$it` to Double") }
    }

    // An explicit locale is authoritative: no fallback.
    if (locale != null) return parseWith(locale)

    val posixLocale = Locale.forLanguageTag("C.UTF-8")
    // A failed conversion may surface either as TypeConversionException (thrown by the converter)
    // or as IllegalStateException (raised by error() when the converter returns null).
    // Both must trigger the POSIX fallback, exactly as in the nullable overload.
    return try {
        parseWith(null)
    } catch (e: TypeConversionException) {
        parseWith(posixLocale)
    } catch (e: IllegalStateException) {
        parseWith(posixLocale)
    }
}
149+
150+
/**
 * Parses a nullable [String] column to [Double], taking the locale's number format into account.
 *
 * `null` values are kept as `null`; every non-null value is trimmed before parsing.
 *
 * If [locale] is given, its number format is used and a parsing failure is propagated to the caller.
 * If [locale] is `null`, the default converter is tried first (it uses the current system locale);
 * if the column cannot be parsed that way, the whole column is parsed again with the POSIX
 * (`C.UTF-8`) number format.
 *
 * @param locale locale whose number format is used for parsing, or `null` for "system locale, then POSIX".
 * @return a column of parsed [Double] values, with `null` wherever the source value was `null`.
 * @throws TypeConversionException if a value cannot be parsed (after the POSIX fallback when [locale] is `null`).
 */
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
    // Parses the whole column with the converter for the given locale (null = default converter).
    fun parseWith(parserLocale: Locale?): DataColumn<Double?> {
        @Suppress("UNCHECKED_CAST")
        val converter = Parsers.getDoubleConverter(parserLocale) as (String) -> Double?
        return map { it?.let { converter(it.trim()) ?: error("Can't convert `$it` to Double") } }
    }

    // An explicit locale is authoritative: no fallback.
    if (locale != null) return parseWith(locale)

    val posixLocale = Locale.forLanguageTag("C.UTF-8")
    // A failed conversion may surface either as TypeConversionException (thrown by the converter)
    // or as IllegalStateException (raised by error() when the converter returns null).
    // Catching only IllegalStateException would skip the POSIX fallback for the former,
    // so both are handled, exactly as in the non-nullable overload.
    return try {
        parseWith(null)
    } catch (e: TypeConversionException) {
        parseWith(posixLocale)
    } catch (e: IllegalStateException) {
        parseWith(posixLocale)
    }
}
170+
128171
@JvmName("convertToFloatFromT")
129172
public fun <T : Any> DataColumn<T>.convertToFloat(): DataColumn<Float> = convertTo()
130173
public fun <T : Any> DataColumn<T?>.convertToFloat(): DataColumn<Float?> = convertTo()

core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -171,23 +171,18 @@ internal object Parsers : GlobalParserOptions {
171171
return null
172172
}
173173

174-
private val posixNumberFormat = NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))
175-
176-
private fun String.parseDouble(userNumberFormat: NumberFormat) =
174+
/**
 * Parses the receiver as a [Double] using [format].
 *
 * The literals `NaN`, `Inf`, `-Inf`, `Infinity` and `-Infinity` are recognised case-insensitively
 * before [format] is consulted.
 *
 * @param format number format used for every other input.
 * @return the parsed value, or `null` if [format] cannot consume the whole string.
 */
private fun String.parseDouble(format: NumberFormat) =
    // Locale.ROOT keeps the keyword match independent of the default locale: with a Turkish
    // default locale "inf" upper-cases to "İNF" and would not be recognised otherwise.
    when (uppercase(Locale.ROOT)) {
        "NAN" -> Double.NaN
        "INF" -> Double.POSITIVE_INFINITY
        "-INF" -> Double.NEGATIVE_INFINITY
        "INFINITY" -> Double.POSITIVE_INFINITY
        "-INFINITY" -> Double.NEGATIVE_INFINITY
        else -> {
            val parsePosition = ParsePosition(0)
            val result: Double? = format.parse(this, parsePosition)?.toDouble()
            // NumberFormat stops at the first character it does not understand;
            // a partially consumed string counts as a parsing failure.
            if (parsePosition.index != this.length) null else result
        }
    }
193188

@@ -199,6 +194,12 @@ internal object Parsers : GlobalParserOptions {
199194
inline fun <reified T : Any> stringParserWithOptions(noinline body: (ParserOptions?) -> ((String) -> T?)) =
200195
StringParserWithFormat(typeOf<T>(), body)
201196

197+
// String-to-Double parser: uses the locale from the parser options when present,
// otherwise the default locale at the time the options are applied.
private val parserToDoubleWithOptions = stringParserWithOptions { options ->
    val effectiveLocale = options?.locale ?: Locale.getDefault()
    val numberFormat = NumberFormat.getInstance(effectiveLocale)
    val parse: (String) -> Double? = { text -> text.parseDouble(numberFormat) }
    parse
}
202+
202203
private val parsersOrder = listOf(
203204
stringParser { it.toIntOrNull() },
204205
stringParser { it.toLongOrNull() },
@@ -231,12 +232,12 @@ internal object Parsers : GlobalParserOptions {
231232

232233
stringParser { it.toUrlOrNull() },
233234

234-
stringParserWithOptions { options ->
235+
// Double, with explicit number format or taken from current locale
236+
parserToDoubleWithOptions,
237+
238+
// Double, with POSIX format
239+
stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },
235240

236-
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
237-
val parser = { it: String -> it.parseDouble(numberFormat) }
238-
parser
239-
},
240241
stringParser { it.toBooleanOrNull() },
241242
stringParser { it.toBigDecimalOrNull() },
242243

@@ -271,6 +272,13 @@ internal object Parsers : GlobalParserOptions {
271272
) else null
272273
return parser.applyOptions(options)
273274
}
275+
276+
/**
 * Builds a String-to-Double converter from [parserToDoubleWithOptions].
 *
 * @param locale locale to put into the parser options; when `null`, no options are passed.
 */
internal fun getDoubleConverter(locale: Locale? = null): TypeConverter {
    val options = locale?.let { ParserOptions(locale = it) }
    return parserToDoubleWithOptions.toConverter(options)
}
274282
}
275283

276284
internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColumn<*> {

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,27 @@ class CsvTests {
104104
assertColumnType("quality", Int::class)
105105
}
106106

107+
@Test
fun `read standard CSV with floats when user has alternative locale`() {
    // The default locale is process-wide state: remember it and restore it afterwards.
    val savedLocale = Locale.getDefault()
    try {
        Locale.setDefault(Locale.forLanguageTag("ru-RU"))
        val schema = DataFrame.readCSV(wineCsv, delimiter = ';').schema()

        fun assertColumnType(columnName: String, kClass: KClass<*>) {
            val col = schema.columns[columnName]
            col.shouldNotBeNull()
            col.type.classifier shouldBe kClass
        }

        // Column types must still be inferred as numbers under the ru-RU default locale.
        listOf(
            "citric acid" to Double::class,
            "alcohol" to Double::class,
            "quality" to Int::class,
        ).forEach { (columnName, kClass) -> assertColumnType(columnName, kClass) }
    } finally {
        Locale.setDefault(savedLocale)
    }
}
127+
107128
@Test
108129
fun `read with custom header`() {
109130
val header = ('A'..'K').map { it.toString() }

core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt

Lines changed: 68 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
package org.jetbrains.kotlinx.dataframe.io
22

3+
import io.kotest.assertions.throwables.shouldThrow
34
import io.kotest.matchers.shouldBe
45
import kotlinx.datetime.LocalDateTime
56
import org.jetbrains.kotlinx.dataframe.DataColumn
67
import org.jetbrains.kotlinx.dataframe.DataFrame
78
import org.jetbrains.kotlinx.dataframe.api.cast
89
import org.jetbrains.kotlinx.dataframe.api.columnOf
910
import org.jetbrains.kotlinx.dataframe.api.convertTo
11+
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
1012
import org.jetbrains.kotlinx.dataframe.api.parse
1113
import org.jetbrains.kotlinx.dataframe.api.parser
1214
import org.jetbrains.kotlinx.dataframe.api.tryParse
@@ -77,18 +79,73 @@ class ParserTests {
7779
fun `converting String to Double in different locales`() {
7880
val currentLocale = Locale.getDefault()
7981
try {
80-
val stringValues = listOf("1", "2.3", "4,5")
81-
val stringColumn = DataColumn.createValueColumn("nums", stringValues, typeOf<String>())
82-
Locale.setDefault(Locale.forLanguageTag("ru-RU"))
83-
// Use comma as local decimal separator and dot as fallback default (as it is used in POSIX/C.UTF-8)
84-
stringColumn.convertTo<Double>().shouldBe(
85-
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf<Double>())
86-
)
82+
// Test 36 behaviour combinations:
83+
84+
// 3 source columns
85+
val columnDot = columnOf("12.345", "67.890")
86+
val columnComma = columnOf("12,345", "67,890")
87+
val columnMixed = columnOf("12.345", "67,890")
88+
// *
89+
// (3 locales as converting parameter + original converting)
90+
val parsingLocaleNotDefined: Locale? = null
91+
val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US")
92+
val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU")
93+
// *
94+
// 3 system locales
95+
96+
Locale.setDefault(Locale.forLanguageTag("C.UTF-8"))
97+
98+
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
99+
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
100+
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
101+
102+
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
103+
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
104+
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))
105+
106+
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
107+
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
108+
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))
109+
110+
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
111+
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
112+
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }
113+
87114
Locale.setDefault(Locale.forLanguageTag("en-US"))
88-
// Use dot as local decimal separator. Comma is ignored (as it is group separator in this locale).
89-
stringColumn.convertTo<Double>().shouldBe(
90-
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf<Double>())
91-
)
115+
116+
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
117+
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
118+
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
119+
120+
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
121+
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
122+
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))
123+
124+
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
125+
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
126+
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))
127+
128+
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
129+
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
130+
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }
131+
132+
Locale.setDefault(Locale.forLanguageTag("ru-RU"))
133+
134+
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
135+
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
136+
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))
137+
138+
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
139+
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
140+
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))
141+
142+
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
143+
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
144+
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))
145+
146+
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
147+
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
148+
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }
92149
} finally {
93150
Locale.setDefault(currentLocale)
94151
}

0 commit comments

Comments
 (0)