Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions core/api/core.api
Original file line number Diff line number Diff line change
Expand Up @@ -3547,8 +3547,12 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParseKt {
public static synthetic fun parse$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;[Lorg/jetbrains/kotlinx/dataframe/columns/ColumnReference;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun parseAnyFrameNullable (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun parseAnyFrameNullable$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun parseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun parseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun tryParse (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun tryParse$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static final fun tryParseChar (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
public static synthetic fun tryParseChar$default (Lorg/jetbrains/kotlinx/dataframe/DataColumn;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataColumn;
}

public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import java.time.format.DateTimeFormatter
import java.util.Locale
import kotlin.reflect.KProperty
import kotlin.reflect.KType
import kotlin.reflect.typeOf
import kotlin.uuid.ExperimentalUuidApi
import kotlin.uuid.Uuid

Expand Down Expand Up @@ -312,6 +313,28 @@ public class ParserOptions(
* @return a new column with parsed values */
public fun DataColumn<String?>.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options)

/**
 * Attempts to parse this column of chars into a column of another type.
 *
 * Every char is first turned into a string (`null` stays `null`), after which the resulting column is
 * handed to the `String?` overload of [tryParse]. The parsers in [Parsers] are tried in order until one of
 * them manages to parse every value of the column; a parser that fails on any value is abandoned in favor
 * of the next one. If all the others fail, the final parser returns strings.
 *
 * `Char` is always added to the skipped types, as the goal is to parse away from `Char`.
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped as well.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled)
 * @return a new column with parsed values
 */
@JvmName("tryParseChar")
public fun DataColumn<Char?>.tryParse(options: ParserOptions? = null): DataColumn<*> {
    // start from the caller's skip types, falling back to the ones of `DataFrame.parser`
    val inheritedSkipTypes = options?.skipTypes ?: DataFrame.parser.skipTypes
    val baseOptions = options ?: ParserOptions()
    // exclude Char on top of whatever was skipped already
    val optionsWithoutChar = baseOptions.copy(skipTypes = inheritedSkipTypes + typeOf<Char>())

    val asStrings = map { char -> char?.toString() }
    return asStrings.tryParse(optionsWithoutChar)
}

public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T> =
parse(options) {
colsAtAnyDepth().filter { !it.isColumnGroup() }
Expand All @@ -335,6 +358,23 @@ public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T>
public fun DataColumn<String?>.parse(options: ParserOptions? = null): DataColumn<*> =
tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") }

/**
 * Parses this column of chars, by way of strings, into a column of another type.
 *
 * Delegates to [tryParse]: each parser in [Parsers] is run in order until one is found that parses
 * all values in the column successfully. Whenever a parser fails on a value, the next parser is tried.
 *
 * If every parser fails, the column is returned as `String`; this can never fail.
 *
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @return a new column with parsed values
 */
@JvmName("parseChar")
public fun DataColumn<Char?>.parse(options: ParserOptions? = null): DataColumn<*> {
    // no need to throw an exception, as Char can always be parsed as String
    return tryParse(options)
}

/**
 * Parses every frame held in this column by calling `parse` on it; `null` entries stay `null`.
 *
 * @param options parser options handed to the `parse` call of each frame
 * @return a column containing the parsed frames
 */
@JvmName("parseAnyFrameNullable")
public fun DataColumn<AnyFrame?>.parse(options: ParserOptions? = null): DataColumn<AnyFrame?> {
    return map { frame -> frame?.parse(options) }
}
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n

Char::class -> when (toClass) {
Int::class -> convert<Char> { it.code }
else -> null

else -> // convert char to string and then to target type
getConverter(typeOf<String>(), to, options)?.let { stringConverter ->
convert<Char> {
stringConverter(it.toString())
}
}
}

Int::class -> when (toClass) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -716,29 +716,24 @@ internal fun <T> DataFrame<T>.parseImpl(options: ParserOptions?, columns: Column
when {
// when a frame column is requested to be parsed,
// parse each value/frame column at any depth inside each DataFrame in the frame column
col.isFrameColumn() -> {
col.isFrameColumn() ->
col.map {
it.parseImpl(options) {
colsAtAnyDepth().filter { !it.isColumnGroup() }
}
}
}

// when a column group is requested to be parsed,
// parse each column in the group
col.isColumnGroup() -> {
col.isColumnGroup() ->
col.parseImpl(options) { all() }
.asColumnGroup(col.name())
.asDataColumn()
}

// Base case, parse the column if it's a `String?` column
col.isSubtypeOf<String?>() -> {
col.isSubtypeOf<String?>() ->
col.cast<String?>().tryParseImpl(options)
}

else -> {
col
}
else -> col
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package org.jetbrains.kotlinx.dataframe.api
import io.kotest.assertions.throwables.shouldNotThrow
import io.kotest.assertions.throwables.shouldThrow
import io.kotest.matchers.shouldBe
import io.kotest.matchers.shouldNotBe
import kotlinx.datetime.Clock
import kotlinx.datetime.Instant
import kotlinx.datetime.LocalTime
Expand Down Expand Up @@ -69,6 +70,20 @@ class ConvertTests {
@Test
fun `convert string to enum`() {
    // converting a standalone column
    val convertedColumn = columnOf("A", "B").convertTo<EnumClass>()
    convertedColumn shouldBe columnOf(EnumClass.A, EnumClass.B)

    // converting the same values as a named column inside a DataFrame
    val df = dataFrameOf(columnOf("A", "B") named "colA")
    val convertedInFrame = df.convert("colA").to<EnumClass>().getColumn("colA")
    convertedInFrame shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA")
}

@Test
fun `convert char to enum`() {
    // conversion path: Char -> String -> Enum
    val convertedColumn = columnOf('A', 'B').convertTo<EnumClass>()
    convertedColumn shouldBe columnOf(EnumClass.A, EnumClass.B)

    // the same conversion, applied to a named column inside a DataFrame
    val df = dataFrameOf(columnOf('A', 'B') named "colA")
    val convertedInFrame = df.convert("colA").to<EnumClass>().getColumn("colA")
    convertedInFrame shouldBe columnOf(EnumClass.A, EnumClass.B).named("colA")
}

@JvmInline
Expand Down Expand Up @@ -199,6 +214,15 @@ class ConvertTests {
val col = columnOf(65, 66)
col.convertTo<Char>() shouldBe columnOf('A', 'B')
col.convertTo<Char>().convertTo<Int>() shouldBe col

// this means
columnOf('1', '2').convertToInt() shouldNotBe columnOf(1, 2)
columnOf('1', '2').convertToInt() shouldBe columnOf(49, 50)

// but
columnOf('1', '2').convertToString().convertToInt() shouldBe columnOf(1, 2)
// or
columnOf('1', '2').parse() shouldBe columnOf(1, 2)
}

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,22 @@ import kotlin.time.Instant as StdlibInstant
import kotlinx.datetime.Instant as DeprecatedInstant

class ParseTests {

@Test
fun `parse to chars`() {
    val letters = columnOf('a', 'b', 'c')

    // both entry points on the Char column are expected to yield a column equal to the input
    letters.parse() shouldBe letters
    letters.tryParse() shouldBe letters

    // converting to String first and parsing is expected to arrive at the same Char column
    val lettersAsStrings = letters.convertToString()
    lettersAsStrings.parse() shouldBe letters
}

@Test
fun `parse chars to int`() {
    val digits = columnOf('1', '2', '3')

    // digit chars are expected to be parsed as the numbers they spell
    val parsed = digits.parse()
    parsed shouldBe columnOf(1, 2, 3)

    val tryParsed = digits.tryParse()
    tryParsed shouldBe columnOf(1, 2, 3)
}

@Test
fun parseDate() {
val currentLocale = Locale.getDefault()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ class ParserTests {
DataFrame.parser.resetToDefault()
}

@Test
fun `parse to Char`() {
    // a column of single-character strings is expected to be parsed into a Char column
    val col by columnOf("a", "b")
    val parsedType = col.parse().type()
    parsedType shouldBe typeOf<Char>()
}

@Test(expected = IllegalStateException::class)
fun `parse should throw`() {
val col by columnOf("a", "bc")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public class ConverterScope(public val fromType: KType, public val toSchema: Col
* df.convertTo<SomeSchema> {
* // defines how to convert Int? -> String
* convert<Int?>().with { it?.toString() ?: "No input given" }
* // defines how to convert String -> SomeType
* // defines how to convert String/Char -> SomeType
* parser { SomeType(it) }
* // fill missing column `sum` with expression `a+b`
* fill { sum }.with { a + b }
Expand Down Expand Up @@ -102,6 +102,10 @@ public fun <T, C> ConvertToFill<T, C>.with(expr: RowExpression<T, C>) {

/**
 * Registers [parser] as the way to turn `String` values into the given type [C].
 *
 * Shorthand for `convert<String>().with { }`.
 *
 * When no converter is defined for `Char` values, this converter is used for them as well.
 */
public inline fun <reified C> ConvertSchemaDsl<*>.parser(noinline parser: (String) -> C): Unit {
    convert<String>().with(parser)
}
Expand Down
40 changes: 38 additions & 2 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import org.jetbrains.kotlinx.dataframe.impl.api.StringParser
import org.jetbrains.kotlinx.dataframe.impl.api.parseImpl
import org.jetbrains.kotlinx.dataframe.impl.api.tryParseImpl
import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser
import org.jetbrains.kotlinx.dataframe.typeClass
import org.jetbrains.kotlinx.dataframe.util.DEPRECATED_ACCESS_API
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS
import org.jetbrains.kotlinx.dataframe.util.PARSER_OPTIONS_COPY
Expand Down Expand Up @@ -302,6 +301,23 @@ public class ParserOptions(
/** @include [tryParseImpl] */
public fun DataColumn<String?>.tryParse(options: ParserOptions? = null): DataColumn<*> = tryParseImpl(options)

/**
 * Attempts to parse this column of chars into a column of another type.
 *
 * Every char is first turned into a string (`null` stays `null`), after which the parsers in [Parsers]
 * are tried in order until one of them manages to parse every value of the column. A parser that fails
 * on any value is abandoned in favor of the next one. If all the others fail, the final parser returns
 * strings.
 *
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
 *
 * NOTE(review): [options] is passed on untouched, so the `Char` parser is not excluded here;
 * presumably the result can be a `Char` column again — confirm this is intended.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if no valid parser is found (unlikely, unless the `String` parser is disabled)
 * @return a new column with parsed values
 */
@JvmName("tryParseChar")
public fun DataColumn<Char?>.tryParse(options: ParserOptions? = null): DataColumn<*> {
    val asStrings = map { char -> char?.toString() }
    return asStrings.tryParseImpl(options)
}

public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T> =
parse(options) {
colsAtAnyDepth().filter { !it.isColumnGroup() }
Expand All @@ -323,7 +339,27 @@ public fun <T> DataFrame<T>.parse(options: ParserOptions? = null): DataFrame<T>
* @return a new column with parsed values
*/
public fun DataColumn<String?>.parse(options: ParserOptions? = null): DataColumn<*> =
tryParse(options).also { if (it.typeClass == String::class) error("Can't guess column type") }
tryParse(options).also { if (it.isSubtypeOf<String?>()) error("Can't guess column type") }

/**
 * Parses this column of chars, by way of strings, into a column of another type.
 *
 * Each parser in [Parsers] is run in order until one is found that parses all values in the column
 * successfully. Whenever a parser fails on a value, the next parser is tried.
 *
 * If the parsed column is still a `Char?` or `String?` column, an [IllegalStateException] is thrown.
 * Use [tryParse] instead if you don't want this exception to be thrown.
 *
 * Parsers that are [covered by][StringParser.coveredBy] other parsers are skipped.
 *
 * @param options options for parsing, like providing a locale or a custom date-time formatter
 * @throws IllegalStateException if the column type could not be guessed
 * @return a new column with parsed values
 */
@JvmName("parseChar")
public fun DataColumn<Char?>.parse(options: ParserOptions? = null): DataColumn<*> {
    val parsed = map { char -> char?.toString() }.tryParse(options)
    // a result that is still Char or String means no more specific type was found
    if (parsed.isSubtypeOf<Char?>() || parsed.isSubtypeOf<String?>()) error("Can't guess column type")
    return parsed
}

@JvmName("parseAnyFrameNullable")
public fun DataColumn<AnyFrame?>.parse(options: ParserOptions? = null): DataColumn<AnyFrame?> =
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ internal fun getConverter(from: KType, to: KType, options: ParserOptions? = null

/** A conversion function: takes any non-null value and returns the converted value, which may be `null`. */
internal typealias TypeConverter = (Any) -> Any?

/** A [TypeConverter] that returns its input unchanged. */
private val TypeConverterIdentity: TypeConverter = { it }

internal fun Any.convertTo(type: KType): Any? {
val clazz = javaClass.kotlin
if (clazz.isSubclassOf(type.jvmErasure)) return this
Expand All @@ -242,6 +244,7 @@ internal inline fun <T> convert(crossinline converter: (T) -> Any?): TypeConvert

private enum class DummyEnum

@Suppress("UNCHECKED_CAST")
internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? {
if (from.arguments.isNotEmpty() || to.arguments.isNotEmpty()) return null
if (from.isMarkedNullable) {
Expand All @@ -250,25 +253,24 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n
}
val fromClass = from.jvmErasure
val toClass = to.jvmErasure
return when {
fromClass == toClass -> TypeConverterIdentity

if (fromClass == toClass) return { it }

if (toClass.isValue) {
val constructor =
toClass.primaryConstructor ?: error("Value type $toClass doesn't have primary constructor")
val underlyingType = constructor.parameters.single().type
val converter = getConverter(from, underlyingType)
?: throw TypeConverterNotFoundException(from, underlyingType, null)
return convert<Any> {
val converted = converter(it)
if (converted == null && !underlyingType.isMarkedNullable) {
throw TypeConversionException(it, from, underlyingType, null)
toClass.isValue -> {
val constructor =
toClass.primaryConstructor ?: error("Value type $toClass doesn't have primary constructor")
val underlyingType = constructor.parameters.single().type
val converter = getConverter(from, underlyingType)
?: throw TypeConverterNotFoundException(from, underlyingType, null)
return convert<Any> {
val converted = converter(it)
if (converted == null && !underlyingType.isMarkedNullable) {
throw TypeConversionException(it, from, underlyingType, null)
}
constructor.call(converted)
}
constructor.call(converted)
}
}

return when {
fromClass == String::class -> {
val parser = Parsers[to.withNullability(false)]
when {
Expand Down Expand Up @@ -369,7 +371,13 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n

Char::class -> when (toClass) {
Int::class -> convert<Char> { it.code }
else -> null

else -> // convert char to string and then to target type
getConverter(typeOf<String>(), to, options)?.let { stringConverter ->
convert<Char> {
stringConverter(it.toString())
}
}
}

Int::class -> when (toClass) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,10 @@ import org.jetbrains.kotlinx.dataframe.schema.ColumnSchema
import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
import org.jetbrains.kotlinx.dataframe.size
import kotlin.reflect.KType
import kotlin.reflect.full.isSubtypeOf
import kotlin.reflect.full.withNullability
import kotlin.reflect.jvm.jvmErasure
import kotlin.reflect.typeOf

private val logger = KotlinLogging.logger {}

Expand Down Expand Up @@ -144,6 +146,25 @@ internal fun AnyFrame.convertToImpl(
val from = originalColumn.type()
val to = targetSchema.type
val converter = dsl.getConverter(from, targetSchema)
?: run {
// Special case for Char columns:
// If there is no explicit Char converter,
// check if we have any converters for String -> target
// if so, we can convert Char -> String -> target
// this allows `parser {}` to work both for Strings and Chars :)

if (!from.isSubtypeOf(typeOf<Char?>())) return@run null

val stringConverter = dsl.getConverter(
fromType = typeOf<String>().withNullability(from.isMarkedNullable),
toSchema = targetSchema,
) ?: return@run null

Converter(
transform = { stringConverter.transform(this, (it as Char?)?.toString()) },
skipNulls = stringConverter.skipNulls,
)
}

val convertedColumn = if (converter != null) {
val nullsAllowed = to.isMarkedNullable
Expand Down
Loading
Loading