-
Notifications
You must be signed in to change notification settings - Fork 76
Compare DataFrames #1556
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
CarloMariaProietti
wants to merge
18
commits into
Kotlin:master
Choose a base branch
from
CarloMariaProietti:compare_df
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+379
−0
Open
Compare DataFrames #1556
Changes from all commits
Commits
Show all changes
18 commits
Select commit
Hold shift + click to select a range
9580248
length of edit script is correct, working on path
CarloMariaProietti eeee3f3
trying
CarloMariaProietti 5fd2bfa
this is working but snake before f.r.e.
CarloMariaProietti fccaaf6
cleaning
CarloMariaProietti bcc41e0
cleaning
CarloMariaProietti 51e2b23
refining logic
CarloMariaProietti f4b21e3
algorythm works good with strings, next step is swithcing to dataFram…
CarloMariaProietti ce46f72
pull master
CarloMariaProietti b25c712
cleaning
CarloMariaProietti 026ffe6
tests
CarloMariaProietti 2de3ad1
improve logic
CarloMariaProietti 352bf45
improving comments
CarloMariaProietti 0db34d8
Update ValueColumn.kt
CarloMariaProietti 3387489
works fine with df
CarloMariaProietti e052646
compareImpl
CarloMariaProietti 4db19bf
compare is ready to use
CarloMariaProietti b4be510
ready for review
CarloMariaProietti e7b648f
pull
CarloMariaProietti File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
196 changes: 196 additions & 0 deletions
196
core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/compareDataFrames.kt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,196 @@ | ||
| package org.jetbrains.kotlinx.dataframe.impl.api | ||
|
|
||
| import org.jetbrains.kotlinx.dataframe.DataFrame | ||
| import org.jetbrains.kotlinx.dataframe.api.DataRowSchema | ||
| import org.jetbrains.kotlinx.dataframe.api.concat | ||
| import org.jetbrains.kotlinx.dataframe.api.dataFrameOf | ||
| import org.jetbrains.kotlinx.dataframe.api.emptyDataFrame | ||
| import org.jetbrains.kotlinx.dataframe.nrow | ||
|
|
||
| /** | ||
| * One row of the comparison produced by compareDataFramesImpl: a single removal or insertion. | ||
| * compareDataFramesImpl builds removals as (x - 1, "dfA", true, null, null) | ||
| * and insertions as (y - 1, "dfB", null, true, x - 1), where (x, y) is a point of the edit path. | ||
| */ | ||
| internal class ComparisonDescription( | ||
| // index of the affected row inside the df named by 'of' | ||
| val rowAtIndex: Int, | ||
| // which df 'rowAtIndex' refers to: "dfA" for removals, "dfB" for insertions | ||
| val of: String, | ||
| // true for a removal, null otherwise | ||
| val wasRemoved: Boolean?, | ||
| // true for an insertion, null otherwise | ||
| val wasInserted: Boolean?, | ||
| // insertions only: index of the row of dfA after which the row is inserted, null for removals | ||
| val afterRow: Int?, | ||
| ) : DataRowSchema | ||
|
|
||
| /** | ||
| * Builds a DataFrame describing how dfB differs from dfA, one row per removed or inserted row. | ||
| * The rows form an edit script whose commands must be read as being executed simultaneously: | ||
| * every index refers to a position in the original dfA or dfB. | ||
| */ | ||
| internal fun <T> compareDataFramesImpl(dfA: DataFrame<T>, dfB: DataFrame<T>): DataFrame<ComparisonDescription> { | ||
| // Myers difference algorithm gives the path through the edit graph as consecutive (x, y) points | ||
| val editPath = myersDifferenceAlgorithmImpl(dfA, dfB) | ||
| var description = emptyDataFrame<ComparisonDescription>() | ||
| for ((previous, current) in editPath.zipWithNext()) { | ||
| val advancedInA = previous.first + 1 == current.first | ||
| val advancedInB = previous.second + 1 == current.second | ||
| if (advancedInA && !advancedInB) { | ||
| // only x moved: row at index 'x-1' of dfA was removed | ||
| val removal = ComparisonDescription(current.first - 1, "dfA", true, null, null) | ||
| description = description.concat(dataFrameOf(removal)) | ||
| } else if (advancedInB && !advancedInA) { | ||
| // only y moved: row at index 'y-1' of dfB was inserted after row in position 'x-1' of dfA | ||
| val insertion = ComparisonDescription(current.second - 1, "dfB", null, true, current.first - 1) | ||
| description = description.concat(dataFrameOf(insertion)) | ||
| } | ||
| // both moved: diagonal edge, the rows are equal and nothing is reported | ||
| } | ||
| return description | ||
| } | ||
|
|
||
| /** | ||
| * Myers difference algorithm for two dfs with the same schema. | ||
| * Returns an optimal path from origin to (N,M) in the edit graph, as a list of (x, y) points. | ||
| * N is dfA.nrow, M is dfB.nrow. | ||
| * Knowing this path is knowing the differences between dfA and dfB | ||
| * and the shortest edit script to get B from A. | ||
| * The worst-case cost of the algorithm is O( (N+M)D ), where D is the length of the shortest edit script. | ||
| * Note that the matrix allocated here has N+M+1 rows of 2(N+M)+1 ints each. | ||
| * | ||
| * The idea of the algorithm is the following: try to cross the edit graph making 'd' non-diagonal moves, | ||
| * increase 'd' until you succeed. | ||
| * Non-diagonal moves make the edit script longer, while diagonal moves do not. | ||
| * | ||
| * snake: non-diagonal edge and then a possibly empty sequence of diagonal edges | ||
| * D-path: a path starting at (0,0) that has exactly D non-diagonal edges | ||
| * | ||
| * If both dfs are empty, the returned path is the single point (0,0). | ||
| */ | ||
| internal fun <T> myersDifferenceAlgorithmImpl(dfA: DataFrame<T>, dfB: DataFrame<T>): List<Pair<Int, Int>> { | ||
| val sumOfLength = dfA.nrow + dfB.nrow | ||
| // Two empty dfs: the edit graph is the single point (0,0). | ||
| // Handled up front because each row of 'v' would hold a single cell, | ||
| // so the fictitious write at index '1 + normalizer' below would be out of bounds. | ||
| if (sumOfLength == 0) { | ||
| return listOf(Pair(0, 0)) | ||
| } | ||
| // Return value, filled by recoursivePathFill once the edit graph has been crossed | ||
| val path = mutableListOf<Pair<Int, Int>>() | ||
| // matrix containing the endpoint of the furthest reaching D-path ending in diagonal k | ||
| // for each d-k couple of interest | ||
| val v = arrayListOf<IntArray>() | ||
| repeat(sumOfLength + 1) { | ||
| v.add(IntArray(sumOfLength * 2 + 1)) | ||
| } | ||
| var isOver = false | ||
| // starting the algorithm | ||
| // 0 position is -(M+N) position in the alg's paper -> need to normalize each access to v | ||
| val normalizer = sumOfLength | ||
| v[0][1 + normalizer] = 0 // fictitious | ||
| // d is the number of non-diagonal edges; | ||
| // the value of d at which the edit graph is crossed is the length of the shortest edit script | ||
| var d = 0 | ||
| while (d <= sumOfLength && !isOver) { | ||
| for (k in -d..d step 2) { | ||
| // Each furthest reaching D-path ending in diagonal k | ||
| // is built by exploiting the furthest reaching (D-1)-path ending in k-1 or (exclusive or) k+1. | ||
| // Row d already holds the (D-1) endpoints: they were copied there during the previous iteration. | ||
| var x = if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { | ||
| v[d][k + 1 + normalizer] | ||
| } else { | ||
| v[d][k - 1 + normalizer] + 1 | ||
| } | ||
| var y = x - k | ||
| // follow diagonal edges for as long as the rows are equal | ||
| while (x < dfA.nrow && y < dfB.nrow && dfA[x] == dfB[y]) { | ||
| x += 1 | ||
| y += 1 | ||
| } | ||
| v[d][k + normalizer] = x | ||
| // need this data in the next iteration | ||
| if (d < sumOfLength) { | ||
| v[d + 1][k + normalizer] = x | ||
| } | ||
| // Edit graph was fully crossed | ||
| if (x >= dfA.nrow && y >= dfB.nrow) { | ||
| isOver = true | ||
| recoursivePathFill(path, v, d, k, normalizer, dfA, dfB) | ||
| break | ||
| } | ||
| } | ||
| // try with a longer edit script | ||
| d++ | ||
| } | ||
| return path.toList() | ||
| } | ||
|
|
||
| /** | ||
| * Walks backwards from the endpoint stored in v[d][k + normalizer] and appends to 'path' | ||
| * every point met on the way, then recurses with d - 1 on the diagonal this endpoint was built from. | ||
| * When d reaches 0 the remaining diagonal points down to the origin are appended | ||
| * and 'path' is reversed in place, so that it reads from (0,0) to the final endpoint. | ||
| * | ||
| * 'path' is mutated by this function; 'v' is only read. | ||
| * dfA and dfB are used to re-check row equality while walking a snake. | ||
| */ | ||
| internal fun <T> recoursivePathFill( | ||
| path: MutableList<Pair<Int, Int>>, | ||
| v: ArrayList<IntArray>, | ||
| d: Int, | ||
| k: Int, | ||
| normalizer: Int, | ||
| dfA: DataFrame<T>, | ||
| dfB: DataFrame<T>, | ||
| ) { | ||
| // Add the current endpoint to the path | ||
| val xCurrent = v[d][k + normalizer] | ||
| val yCurrent = xCurrent - k | ||
| path.add(Pair(xCurrent, yCurrent)) | ||
| // Look for the endpoint the current one was built from, it is represented by kPrev. | ||
| // It will be an argument of the next recursive step. | ||
| // Moreover, the points composing the snake that precedes the current endpoint must be added (it may be empty). | ||
| if (d > 0) { | ||
| var kPrev: Int? = null | ||
| var xSnake: Int? = null | ||
| // same choice between diagonals k+1 and k-1 as in the forward pass of myersDifferenceAlgorithmImpl | ||
| if (k == -d || k != d && v[d][k - 1 + normalizer] < v[d][k + 1 + normalizer]) { | ||
| // built from diagonal k+1: x is unchanged | ||
| kPrev = k + 1 | ||
| xSnake = v[d - 1][kPrev + normalizer] | ||
| } else { | ||
| // built from diagonal k-1: x is increased by one | ||
| kPrev = k - 1 | ||
| xSnake = v[d - 1][kPrev + normalizer] + 1 | ||
| } | ||
| // (xSnake, ySnake) is the first point on diagonal k, right after the non-diagonal edge | ||
| var ySnake = xSnake - k | ||
| val snake = mutableListOf<Pair<Int, Int>>() | ||
| do { | ||
| // inserting at the head keeps the snake ordered from the most recent point to the oldest one | ||
| snake.add(0, Pair(xSnake, ySnake)) | ||
| if (xSnake == xCurrent && ySnake == yCurrent) { | ||
| if (snake.isNotEmpty()) { | ||
| // the head is the current endpoint, which is already in the path | ||
| snake.removeFirst() | ||
| for (e in snake) { | ||
| path.add(e) | ||
| } | ||
| } | ||
| recoursivePathFill(path, v, d - 1, kPrev, normalizer, dfA, dfB) | ||
| return | ||
| } | ||
| // advance along the diagonal while the rows are equal | ||
| if (xSnake < dfA.nrow && | ||
| ySnake < dfB.nrow && | ||
| xSnake >= 0 && | ||
| ySnake >= 0 && | ||
| dfA[xSnake] == dfB[ySnake] | ||
| ) { | ||
| xSnake += 1 | ||
| ySnake += 1 | ||
| } | ||
| } | ||
| while (xSnake <= xCurrent && ySnake <= yCurrent) | ||
| } | ||
| // Base step. | ||
| // Diagonal edges from the furthest reaching point with d=0 back to the origin may still need to be added. | ||
| // Moreover, the path is reversed so that it can be read from left to right correctly | ||
| if (d == 0) { | ||
| if (path.last().first != 0 && path.last().second != 0) { | ||
| val last = path.last() | ||
| var x = last.first - 1 | ||
| var y = last.second - 1 | ||
| while (x >= 0 && y >= 0) { | ||
| path.add(Pair(x, y)) | ||
| x -= 1 | ||
| y -= 1 | ||
| } | ||
| } | ||
| path.reverse() | ||
| return | ||
| } | ||
| } | ||
183 changes: 183 additions & 0 deletions
183
core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/compareDataFrames.kt
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,183 @@ | ||
| package org.jetbrains.kotlinx.dataframe.api | ||
|
|
||
| import io.kotest.matchers.shouldBe | ||
| import org.jetbrains.kotlinx.dataframe.impl.api.ComparisonDescription | ||
| import org.jetbrains.kotlinx.dataframe.impl.api.compareDataFramesImpl | ||
| import org.jetbrains.kotlinx.dataframe.impl.api.myersDifferenceAlgorithmImpl | ||
| import org.junit.Test | ||
| import kotlin.Pair | ||
|
|
||
| // Row type of the dfs built in these tests: one Int column and one String column | ||
| private class SchemaForThisTest(val integer: Int, val string: String) : DataRowSchema | ||
|
|
||
| class CompareDataFramesTest { | ||
|
|
||
| // region compareDataFramesImpl: checks the rows describing removals and insertions | ||
|
|
||
| @Test | ||
| fun `Need both to delete and insert rows, preserving some rows`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(2, "c"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(2, "c"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(2, "c"), | ||
| ) | ||
| val expected = dataFrameOf( | ||
| ComparisonDescription(0, "dfA", true, null, null), | ||
| ComparisonDescription(1, "dfA", true, null, null), | ||
| ComparisonDescription(1, "dfB", null, true, 2), | ||
| ComparisonDescription(5, "dfA", true, null, null), | ||
| ComparisonDescription(5, "dfB", null, true, 6), | ||
| ) | ||
| compareDataFramesImpl(before, after) shouldBe expected | ||
| } | ||
|
|
||
| @Test | ||
| fun `need to do nothing`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| // equal dfs: no description rows at all | ||
| compareDataFramesImpl(before, after) shouldBe emptyDataFrame() | ||
| } | ||
|
|
||
| @Test | ||
| fun `need to remove each row of dfA and insert each row of dfB`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(2, "c"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(3, "d"), | ||
| SchemaForThisTest(4, "e"), | ||
| SchemaForThisTest(5, "f"), | ||
| ) | ||
| val expected = dataFrameOf( | ||
| ComparisonDescription(0, "dfA", true, null, null), | ||
| ComparisonDescription(1, "dfA", true, null, null), | ||
| ComparisonDescription(2, "dfA", true, null, null), | ||
| ComparisonDescription(0, "dfB", null, true, 2), | ||
| ComparisonDescription(1, "dfB", null, true, 2), | ||
| ComparisonDescription(2, "dfB", null, true, 2), | ||
| ) | ||
| compareDataFramesImpl(before, after) shouldBe expected | ||
| } | ||
|
|
||
| // endregion | ||
|
|
||
| // region myersDifferenceAlgorithmImpl: checks the (x, y) points of the path through the edit graph | ||
|
|
||
| @Test | ||
| fun `Need both to delete and insert rows, preserving some rows, Myers algorithm`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(2, "c"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(2, "c"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(2, "c"), | ||
| ) | ||
| myersDifferenceAlgorithmImpl(before, after) shouldBe listOf( | ||
| 0 to 0, | ||
| 1 to 0, | ||
| 2 to 0, | ||
| 3 to 1, | ||
| 3 to 2, | ||
| 4 to 3, | ||
| 5 to 4, | ||
| 6 to 4, | ||
| 7 to 5, | ||
| 7 to 6, | ||
| ) | ||
| } | ||
|
|
||
| @Test | ||
| fun `need to do nothing, Myers algorithm`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(0, "a"), | ||
| ) | ||
| // equal dfs: the path is made of diagonal edges only | ||
| myersDifferenceAlgorithmImpl(before, after) shouldBe listOf(0 to 0, 1 to 1, 2 to 2, 3 to 3) | ||
| } | ||
|
|
||
| @Test | ||
| fun `need to remove each row of dfA and insert each row of dfB, Myers Algorithm`() { | ||
| val before = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(2, "c"), | ||
| ) | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(3, "d"), | ||
| SchemaForThisTest(4, "e"), | ||
| SchemaForThisTest(5, "f"), | ||
| ) | ||
| myersDifferenceAlgorithmImpl(before, after) shouldBe listOf( | ||
| 0 to 0, | ||
| 1 to 0, | ||
| 2 to 0, | ||
| 3 to 0, | ||
| 3 to 1, | ||
| 3 to 2, | ||
| 3 to 3, | ||
| ) | ||
| } | ||
|
|
||
| @Test | ||
| fun `need to add each row, Myers algorithm`() { | ||
| val before = emptyDataFrame<SchemaForThisTest>() | ||
| val after = dataFrameOf( | ||
| SchemaForThisTest(0, "a"), | ||
| SchemaForThisTest(1, "b"), | ||
| SchemaForThisTest(2, "c"), | ||
| ) | ||
| // empty dfA: x never moves, only y does | ||
| myersDifferenceAlgorithmImpl(before, after) shouldBe listOf(0 to 0, 0 to 1, 0 to 2, 0 to 3) | ||
| } | ||
|
|
||
| // endregion | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
data schemas are created with
`@DataSchema`, not with `: DataRowSchema`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you for reviewing! I added
`@DataSchema`; however, with the current implementation `: DataRowSchema` is still necessary because of lines 41-42.