Merged · Changes from 4 commits
2 changes: 2 additions & 0 deletions core/api/core.api
@@ -4462,7 +4462,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/TakeKt {
}

public final class org/jetbrains/kotlinx/dataframe/api/ToDataFrameKt {
public static final fun toDataFrame (Ljava/util/List;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrame (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static synthetic fun toDataFrame$default (Ljava/util/List;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameAnyColumn (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameColumnPathAnyNullable (Ljava/lang/Iterable;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
public static final fun toDataFrameColumnPathAnyNullable (Ljava/util/Map;)Lorg/jetbrains/kotlinx/dataframe/DataFrame;
@@ -2525,4 +2525,4 @@ public fun <T, C> Convert<T, List<List<C>>>.toDataFrames(containsColumns: Boolea
* @return A new [DataColumn] with the values converted to [DataFrame].
*/
public fun <T> DataColumn<List<List<T>>>.toDataFrames(containsColumns: Boolean = false): DataColumn<AnyFrame> =
map { it.toDataFrame(containsColumns) }
map { it.toDataFrame(containsColumns = containsColumns) }
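For illustration, a rough sketch of calling toDataFrames on a column of nested lists; the column contents, names, and imports below are assumptions, not taken from this PR:

import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.toDataFrames

fun main() {
    // Each cell holds row-major nested lists: the first inner list is the header, the rest are rows.
    val cells = columnOf(
        listOf(listOf("name", "age"), listOf("Alice", "23")),
        listOf(listOf("name", "age"), listOf("Bob", "42")),
    )
    // Converts every cell to a DataFrame, yielding a DataColumn<AnyFrame>.
    val frames = cells.toDataFrames(containsColumns = false)
    println(frames[0])
}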
@@ -256,3 +256,49 @@ public fun Map<ColumnPath, Iterable<Any?>>.toDataFrame(): AnyFrame =
}.toDataFrameFromPairs<Unit>()

// endregion

/**
* Converts a list of lists into a [DataFrame].
*
* By default, treats each inner list as a row. If [header] is not provided, the first inner list becomes the header (column names), and the remaining lists are treated as data.
*
* With [containsColumns] = `true`, interprets each inner list as a column.
* If [header] is not provided, the first element of each list is used as its column name, and the remaining elements as its values.
*
* @param T The type of elements contained in the nested lists.
* @param containsColumns If `true`, treats each nested list as a column.
* Otherwise, each nested list is a row.
* Defaults to `false`.
* @param header Overrides extraction of column names from the lists; when provided, all values are treated as data instead.
* @return A [DataFrame] containing the data from the nested list structure.
* Returns an empty [DataFrame] if the input is empty or invalid.
*/
@Refine
@Interpretable("ValuesListsToDataFrame")
public fun <T> List<List<T>>.toDataFrame(header: List<String>?, containsColumns: Boolean = false): AnyFrame =
when {
containsColumns -> {
mapIndexedNotNull { index, list ->
if (list.isEmpty()) return@mapIndexedNotNull null
val name = header?.get(index) ?: list[0].toString()
val values = if (header == null) list.drop(1) else list
createColumnGuessingType(name, values)
}.toDataFrame()
}

isEmpty() -> DataFrame.Empty

else -> {
val data = if (header == null) drop(1) else this
(header ?: get(0).map { it.toString() }).mapIndexed { colIndex, name ->
val values = data.map { row ->
if (row.size <= colIndex) {
null
} else {
row[colIndex]
}
}
createColumnGuessingType(name, values)
}.toDataFrame()
}
}
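For illustration, a minimal sketch of how the new overload can be called in both orientations; the values and variable names below are made up, not from the PR:

import org.jetbrains.kotlinx.dataframe.api.toDataFrame

fun main() {
    // Row-major: without an explicit header, the first inner list names the columns.
    val rows = listOf(
        listOf("name", "age"),
        listOf("Alice", "23"),
        listOf("Bob", "42"),
    )
    val byRows = rows.toDataFrame(header = null) // columns: name, age; 2 rows

    // Row-major with an explicit header: every inner list is data.
    val byRowsWithHeader = rows.drop(1).toDataFrame(header = listOf("name", "age"))

    // Column-major: each inner list is one column, its first element being the column name.
    val cols = listOf(
        listOf("name", "Alice", "Bob"),
        listOf("age", "23", "42"),
    )
    val byCols = cols.toDataFrame(header = null, containsColumns = true)

    println(byRows)
}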
48 changes: 6 additions & 42 deletions core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/common.kt
@@ -4,10 +4,10 @@ import org.apache.commons.io.input.BOMInputStream
import org.jetbrains.kotlinx.dataframe.AnyFrame
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.toDataFrame
import org.jetbrains.kotlinx.dataframe.impl.columns.createColumnGuessingType
import org.jetbrains.kotlinx.dataframe.util.IS_URL
import org.jetbrains.kotlinx.dataframe.util.IS_URL_IMPORT
import org.jetbrains.kotlinx.dataframe.util.IS_URL_REPLACE
import org.jetbrains.kotlinx.dataframe.util.LISTS_TO_DATAFRAME_MIGRATION
import java.io.File
import java.io.InputStream
import java.net.HttpURLConnection
@@ -45,48 +45,12 @@ public fun catchHttpResponse(url: URL, body: (InputStream) -> AnyFrame): AnyFram
}
}

/**
* Converts a list of lists into a [DataFrame].
*
* By default, treats the first inner list as a header (column names), and the remaining lists as rows.
* If [containsColumns] is `true`, interprets each inner list as a column,
* where the first element is used as the column name, and the remaining elements as values.
*
* @param T The type of elements contained in the nested lists.
* @param containsColumns If `true`, treats each nested list as a column with its first element as the column name.
* Otherwise, the first list is treated as the header.
* Defaults to `false`.
* @return A [DataFrame] containing the data from the nested list structure.
* Returns an empty [DataFrame] if the input is empty or invalid.
*/
@Deprecated(
LISTS_TO_DATAFRAME_MIGRATION,
ReplaceWith("this.toDataFrame(header = null, containsColumns)", "org.jetbrains.kotlinx.dataframe.api.toDataFrame"),
)
public fun <T> List<List<T>>.toDataFrame(containsColumns: Boolean = false): AnyFrame =
when {
containsColumns -> {
mapNotNull {
if (it.isEmpty()) return@mapNotNull null
val name = it[0].toString()
val values = it.drop(1)
createColumnGuessingType(name, values)
}.toDataFrame()
}

isEmpty() -> DataFrame.Empty

else -> {
val header = get(0).map { it.toString() }
val data = drop(1)
header.mapIndexed { colIndex, name ->
val values = data.map { row ->
if (row.size <= colIndex) {
null
} else {
row[colIndex]
}
}
createColumnGuessingType(name, values)
}.toDataFrame()
}
}
toDataFrame(header = null, containsColumns)
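For illustration, the call-site change the ReplaceWith above suggests; the nested list is a made-up example:

import org.jetbrains.kotlinx.dataframe.api.toDataFrame

fun main() {
    val nested = listOf(
        listOf("name", "age"),
        listOf("Alice", "23"),
    )
    // Old io-package call (now deprecated): nested.toDataFrame(containsColumns = false)
    // New api-package overload with the header parameter:
    val df = nested.toDataFrame(header = null, containsColumns = false)
    println(df)
}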

@Deprecated(
message = IS_URL,
@@ -257,6 +257,9 @@ internal const val GET_ROWS_RANGE_REPLACE = "df().getRows(indices)"
internal const val GET_ROW_OR_NULL_REPLACE = "df().getRowOrNull(index)"
internal const val COPY_REPLACE = "columns().toDataFrame().cast()"

internal const val LISTS_TO_DATAFRAME_MIGRATION =
    "Function moved from the io package to the api package, and a new `header` parameter was introduced. $MESSAGE_1_1"

// endregion

// region keep across releases
@@ -700,4 +700,92 @@ class CreateDataFrameTests {
val df = list.toDataFrame(maxDepth = 2)
df["map"].type() shouldBe typeOf<Map<String, Int>>()
}

@Test
fun `parsing row-major lines into structured dataframe`() {
// I think finding data in such a format will be rare, so we need an optional header parameter.
val lines = buildList {
addAll(listOf("stamp", "header", "data"))
repeat(33) { row ->
add("stamp $row")
add("header $row")
add("data $row")
}
}

val df = lines.chunked(3).toDataFrame()

df.columnNames() shouldBe listOf("stamp", "header", "data")
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
df.rowsCount() shouldBe 33
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")
}

@Test
fun `parsing srt lines into structured dataframe`() {
// *.srt subtitle file format
val lines = buildList {
repeat(33) { row ->
add("stamp $row")
add("header $row")
add("data $row")
add("\n")
}
}

val df = lines.chunked(4).map { it.dropLast(1) }.toDataFrame(header = listOf("stamp", "header", "data"))

df.columnNames() shouldBe listOf("stamp", "header", "data")
df.columnTypes() shouldBe listOf(typeOf<String>(), typeOf<String>(), typeOf<String>())
df.rowsCount() shouldBe 33
df[0].values() shouldBe listOf("stamp 0", "header 0", "data 0")

// A different approach; I think the dropLast one above is better
lines.chunked(4)
.toDataFrame(header = listOf("stamp", "header", "data", "whitespace"))
.remove("whitespace") shouldBe df
}

@Test
fun `parsing column-major lines into structured dataframe`() {
val lines = buildList {
repeat(4) { col ->
repeat(5) { row ->
add("data$col $row")
}
add("\n")
}
}

val header = List(4) { "col $it" }
val df = lines
.chunked(6)
.map { it.dropLast(1) }
.toDataFrame(header = header, containsColumns = true)
df.columnNames() shouldBe header
df.columnTypes() shouldBe List(4) { typeOf<String>() }
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
}

@Test
fun `parsing column-major lines with header into structured dataframe`() {
val lines = buildList {
repeat(4) { col ->
add("col $col")
repeat(5) { row ->
add("data$col $row")
}
add("\n")
}
}

val header = List(4) { "col $it" }
val df = lines
.chunked(7)
.map { it.dropLast(1) }
.toDataFrame(header = null, containsColumns = true)
df.columnNames() shouldBe header
df.columnTypes() shouldBe List(4) { typeOf<String>() }
df["col 0"].values() shouldBe listOf("data0 0", "data0 1", "data0 2", "data0 3", "data0 4")
}
}
@@ -425,4 +425,22 @@ class Create : TestBase() {
val df = files.toDataFrame(columnName = "data")
// SampleEnd
}

@Test
@TransformDataFrameExpressions
fun toDataFrameLists() {
// SampleStart
val lines = """
1
00:00:05,000 --> 00:00:07,500
This is the first subtitle.

2
00:00:08,000 --> 00:00:10,250
This is the second subtitle.
""".trimIndent().lines()

lines.chunked(4) { it.take(3) }.toDataFrame(header = listOf("n", "timestamp", "text"))
// SampleEnd
}
}