-
Notifications
You must be signed in to change notification settings - Fork 36
Open
Description
Hi @ihnorton, sorry to bother once more, but I think I found a couple of bugs in conjunction with indexing.
Setup:
Details
# +
import json
import tiledb
import numpy as np
import pandas as pd
import random
# -
import json
# Test table, serialized as JSON that maps each column name to a
# {row label: value} object. Row labels are "0"-"5", "8" and "9"; the "GQ"
# column has no entry for row label "1".
test_records_json = '{"chrom":{"0":"chr1","1":"chr1","2":"chr2","3":"chr2","4":"chr1","5":"chr1","8":"chr1","9":"chr1"},"log10_len":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":0,"9":0},"start":{"0":10108,"1":10108,"2":10108,"3":10108,"4":10108,"5":10108,"8":10143,"9":10143},"end":{"0":10114,"1":10114,"2":10114,"3":10114,"4":10114,"5":10114,"8":10144,"9":10144},"ref":{"0":"AACCCT","1":"AACCCT","2":"AACCCT","3":"AACCCT","4":"AACCCT","5":"AACCCT","8":"T","9":"T"},"alt":{"0":"A","1":"A","2":"A","3":"A","4":"A","5":"A","8":"C","9":"C"},"sample_id":{"0":"A","1":"B","2":"C","3":"D","4":"E","5":"F","8":"A","9":"B"},"GT":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":1,"9":1},"GQ":{"0":79,"2":60,"3":99,"4":26,"5":62,"8":22,"9":65},"DP":{"0":12,"1":9,"2":39,"3":26,"4":9,"5":9,"8":35,"9":34}}'
test_df = pd.DataFrame.from_records(json.loads(test_records_json))
test_df
# Location of the array that the rest of the script creates and queries.
output_path = "test.tdb"
# The default TileDB context is passed explicitly to the calls below.
ctx = tiledb.default_ctx()
ctx
# +
# Five-dimensional domain: chrom, log10_len, start, alt, sample_id.
# The three bytes-typed dimensions (chrom, alt, sample_id) are declared with
# domain=(None, None); the integer dimensions span 0..max of their dtype.
genotype_domain = tiledb.Domain(
    tiledb.Dim(name="chrom", domain=(None, None), tile=1, dtype=np.bytes_, ctx=ctx),
    tiledb.Dim(
        name="log10_len",
        domain=(0, np.iinfo(np.int8).max),
        tile=1,
        dtype=np.int8,
        ctx=ctx,
    ),
    tiledb.Dim(
        name="start",
        domain=(0, np.iinfo(np.int32).max),
        tile=100000,
        dtype=np.int32,
        ctx=ctx,
    ),
    tiledb.Dim(name="alt", domain=(None, None), tile=None, dtype=np.bytes_, ctx=ctx),
    # "end" is not a dimension here; it is declared as an attribute below.
    # tiledb.Dim(name="end", domain=(1, np.iinfo(np.int32).max), dtype=np.int32, ctx=ctx),
    tiledb.Dim(name="sample_id", domain=(None, None), tile=None, dtype=np.bytes_, ctx=ctx),
    ctx=ctx,
)

# Filter pipelines: Zstd alone for the string attribute, byte-shuffle followed
# by Zstd for the integer attributes.
string_filters = tiledb.FilterList([tiledb.ZstdFilter(level=-1)])
int_filters = tiledb.FilterList(
    [tiledb.ByteShuffleFilter(), tiledb.ZstdFilter(level=-1)]
)

# Attributes stored per cell; only GQ and DP are nullable.
attrs = [
    tiledb.Attr(name="end", dtype="int32", var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name="ref", dtype="S", nullable=False, filters=string_filters),
    tiledb.Attr(name="GT", dtype="int8", var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name="GQ", dtype="int32", var=False, nullable=True, filters=int_filters),
    tiledb.Attr(name="DP", dtype="int32", var=False, nullable=True, filters=int_filters),
]
# -
# Sparse array schema over the genotype domain and its attributes, with
# cells laid out in Hilbert order. Capacity is left unset (see the
# commented-out argument).
schema = tiledb.ArraySchema(
    ctx=ctx,
    domain=genotype_domain,
    attrs=attrs,
    sparse=True,
    cell_order="hilbert",
    # capacity=10000,
)
schema
# Remove any array that already exists at output_path, then create a new,
# empty sparse array there from the schema.
if tiledb.array_exists(output_path):
    print("Deleting array at '%s'..." % output_path)
    # Imported here because this is the only place shutil is needed.
    import shutil

    shutil.rmtree(output_path)

# Creation is unconditional: it runs whether or not an old array was deleted.
print("Creating array at '%s'..." % output_path)
tiledb.array.SparseArray.create(output_path, schema, ctx=ctx)
# Cast the attribute columns before ingestion. GQ and DP use pandas'
# nullable "Int32" extension dtype; the other columns use plain NumPy dtypes.
attr_dtypes = {
    "end": "int32",
    "ref": "S",
    "GT": "int8",
    "GQ": "Int32",
    "DP": "Int32",
}
typed_df = test_df.astype(attr_dtypes)

# Append the rows to the array at output_path.
tiledb.from_dataframe(output_path, typed_df, sparse=True, mode="append")

# Read the array back as a DataFrame, then open a read-only handle `A`.
tiledb.open_dataframe(output_path, use_arrow=True)
A = tiledb.open(output_path, ctx=ctx, mode="r")
A
Now my trials:
# Indexing experiments on the array handle `A`. The comment above each
# expression records the outcome the reporter observed for it.
# works (full read through the query object's .df indexer)
A.query(use_arrow=True, coords=True).df[:]
# works (plain indexing with a single string label)
A["chr2"]
# +
# The two expressions below are left commented out because running them
# takes the kernel down; un-comment to reproduce.
# kernel breaks with `realloc(): invalid pointer`
# A.df["chr2"]
# kernel breaks with `realloc(): invalid pointer`
# A.query(use_arrow=True, coords=True).df["chr2"]
# -
# empty result, although plain A["chr2"] above works
A.multi_index["chr2"]
# works (plain indexing with a string label and an integer)
A["chr1", 0]
# works (multi_index over the full range)
A.multi_index[:]
# empty result, although the test data has rows with log10_len == 0
A.multi_index[:, 0]
# empty result
A.multi_index["chr2", 0]
# ---------------------------------------------------------------------------
# IndexError Traceback (most recent call last)
# <ipython-input-22-20675d6deb0c> in <module>
# 12
# 13 # IndexError: invalid index type: <class 'list'>
# ---> 14 A[[("chr1", 0), ("chr1", 1),]]
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.SparseArrayImpl.__getitem__()
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.SparseArrayImpl.subarray()
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.index_domain_subarray()
#
# IndexError: invalid index type: <class 'list'>
A[[
("chr1", 0),
("chr1", 1),
]]
- Is there a way to keep the Python kernel from crashing, even when tiledb is given invalid input?
- How to do key-based indexing, e.g. `A[[("chr1", 0), ("chr1", 1),]]`?
- How to do `multi_index` indexing with a DataFrame as the return type?
Metadata
Metadata
Assignees
Labels
No labels