-
Notifications
You must be signed in to change notification settings - Fork 70
fix test failures on 1.12, avoid race condition in multithreaded partitioned writes #582
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
5af9cfd
86c6005
58c1a9e
c3dffb6
b387e5c
62c6a99
ac66cf7
fb22434
75c06ec
29a04a6
7622b09
58470ae
f68a332
db35d62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -142,55 +142,58 @@ function arrowvector( | |
| kw..., | ||
| ) | ||
| id = x.encoding.id | ||
| # XXX This is a race condition if two workers hit this block at the same time, then they'll create | ||
| # distinct locks | ||
| if !haskey(de, id) | ||
| de[id] = Lockable(x.encoding) | ||
| else | ||
| encodinglockable = de[id] | ||
| Base.@lock encodinglockable begin | ||
| encoding = encodinglockable.value | ||
| # in this case, we just need to check if any values in our local pool need to be delta dictionary serialized | |
| deltas = setdiff(x.encoding, encoding) | ||
| if !isempty(deltas) | ||
| ET = indextype(encoding) | ||
| if length(deltas) + length(encoding) > typemax(ET) | ||
| error( | ||
| "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", | ||
| ) | ||
| end | ||
| data = arrowvector( | ||
| deltas, | ||
| i, | ||
| nl, | ||
| fi, | ||
| de, | ||
| ded, | ||
| nothing; | ||
| dictencode=dictencodenested, | ||
| dictencodenested=dictencodenested, | ||
| dictencoding=true, | ||
| kw..., | ||
| return x | ||
| end | ||
|
|
||
| encodinglockable = de[id] | ||
| Base.@lock encodinglockable begin | ||
| encoding = encodinglockable.value | ||
| # in this case, we just need to check if any values in our local pool need to be delta dictionary serialized | |
| deltas = setdiff(x.encoding, encoding) | ||
| if !isempty(deltas) | ||
| ET = indextype(encoding) | ||
| if length(deltas) + length(encoding) > typemax(ET) | ||
| error( | ||
| "fatal error serializing dict encoded column with ref index type of $ET; subsequent record batch unique values resulted in $(length(deltas) + length(encoding)) unique values, which exceeds possible index values in $ET", | ||
| ) | ||
| push!( | ||
| ded, | ||
| DictEncoding{eltype(data),ET,typeof(data)}( | ||
| id, | ||
| data, | ||
| false, | ||
| getmetadata(data), | ||
| ), | ||
| end | ||
| data = arrowvector( | ||
| deltas, | ||
| i, | ||
| nl, | ||
| fi, | ||
| de, | ||
| ded, | ||
| nothing; | ||
| dictencode=dictencodenested, | ||
| dictencodenested=dictencodenested, | ||
| dictencoding=true, | ||
| kw..., | ||
| ) | ||
| push!( | ||
| ded, | ||
| DictEncoding{eltype(data),ET,typeof(data)}( | ||
| id, | ||
| data, | ||
| false, | ||
| getmetadata(data), | ||
| ), | ||
| ) | ||
| if typeof(encoding.data) <: ChainedVector | ||
| append!(encoding.data, data) | ||
| else | ||
| data2 = ChainedVector([encoding.data, data]) | ||
| encoding = DictEncoding{eltype(data2),ET,typeof(data2)}( | ||
| id, | ||
| data2, | ||
| false, | ||
| getmetadata(encoding), | ||
| ) | ||
| if typeof(encoding.data) <: ChainedVector | ||
| append!(encoding.data, data) | ||
| else | ||
| data2 = ChainedVector([encoding.data, data]) | ||
| encoding = DictEncoding{eltype(data2),ET,typeof(data2)}( | ||
| id, | ||
| data2, | ||
| false, | ||
| getmetadata(encoding), | ||
| ) | ||
| de[id] = Lockable(encoding) | ||
| end | ||
| de[id] = Lockable(encoding, encodinglockable.lock) | ||
| end | ||
| end | ||
| end | ||
|
|
@@ -215,6 +218,8 @@ function arrowvector( | |
| x = x.data | ||
| len = length(x) | ||
| validity = ValidityBitmap(x) | ||
| # XXX This is a race condition if two workers hit this block at the same time, then they'll create | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @quinnj I think there is a race condition baked into the current architecture that can't be addressed without a very large refactoring. The current architecture creates the locks on a worker thread if they don't already exist, which means that threads are competing for the creation of the initial lock. The locks should be created before any tasks are spawned. |
||
| # distinct locks | ||
| if !haskey(de, id) | ||
| # dict encoding doesn't exist yet, so create for 1st time | ||
| if DataAPI.refarray(x) === x || DataAPI.refpool(x) === nothing | ||
|
|
@@ -326,7 +331,7 @@ function arrowvector( | |
| false, | ||
| getmetadata(encoding), | ||
| ) | ||
| de[id] = Lockable(encoding) | ||
| de[id] = Lockable(encoding, encodinglockable.lock) | ||
| end | ||
| end | ||
| end | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,12 +28,16 @@ using DataAPI | |
| using FilePathsBase | ||
| using DataFrames | ||
| import Random: randstring | ||
| using TestSetExtensions: ExtendedTestSet | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Given how long the Arrow tests take, it's useful to have some indication of progress so that we can tell if tests have hung. ExtendedTestSet shows a progress indicator as the tests run. (We also get colored diffs of arrays when tests fail, which is nice.) |
||
|
|
||
| # this formulation tests the loaded ArrowTypes, even if it's not the dev version | ||
| # within the mono-repo | ||
| include(joinpath(dirname(pathof(ArrowTypes)), "../test/tests.jl")) | ||
| include(joinpath(dirname(pathof(Arrow)), "../test/testtables.jl")) | ||
| include(joinpath(dirname(pathof(Arrow)), "../test/testappend.jl")) | ||
| include(joinpath(dirname(pathof(Arrow)), "../test/integrationtest.jl")) | ||
| include(joinpath(dirname(pathof(Arrow)), "../test/dates.jl")) | ||
|
|
||
| include(joinpath(@__DIR__, "testtables.jl")) | ||
| include(joinpath(@__DIR__, "testappend.jl")) | ||
| include(joinpath(@__DIR__, "integrationtest.jl")) | ||
| include(joinpath(@__DIR__, "dates.jl")) | ||
|
|
||
| struct CustomStruct | ||
| x::Int | ||
|
|
@@ -45,7 +49,7 @@ struct CustomStruct2{sym} | |
| x::Int | ||
| end | ||
|
|
||
| @testset "Arrow" begin | ||
| @testset ExtendedTestSet "Arrow" begin | ||
| @testset "table roundtrips" begin | ||
| for case in testtables | ||
| testtable(case...) | ||
|
|
@@ -381,6 +385,8 @@ end | |
| end | ||
|
|
||
| @testset "# 126" begin | ||
| # XXX This test also captures a race condition in multithreaded | ||
| # writes of dictionary encoded arrays | ||
| t = Tables.partitioner(( | ||
| (a=Arrow.toarrowvector(PooledArray([1, 2, 3])),), | ||
| (a=Arrow.toarrowvector(PooledArray([1, 2, 3, 4])),), | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.