Skip to content

Indexing bugs #594

@Hoeze

Description

@Hoeze

Hi @ihnorton, sorry to bother you once more, but I think I found a couple of bugs related to indexing.

Setup:

Details
# +
import json

import tiledb
import numpy as np
import pandas as pd
import random
# -

import json
# Test fixture: an inline JSON document keyed by column name and then by row
# label, parsed with json.loads and handed to pd.DataFrame.from_records.
# Row labels run "0".."9" with "6" and "7" absent; "GQ" has no entry for
# label "1" (the only missing value in the fixture).
test_df = pd.DataFrame.from_records(json.loads('{"chrom":{"0":"chr1","1":"chr1","2":"chr2","3":"chr2","4":"chr1","5":"chr1","8":"chr1","9":"chr1"},"log10_len":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":0,"9":0},"start":{"0":10108,"1":10108,"2":10108,"3":10108,"4":10108,"5":10108,"8":10143,"9":10143},"end":{"0":10114,"1":10114,"2":10114,"3":10114,"4":10114,"5":10114,"8":10144,"9":10144},"ref":{"0":"AACCCT","1":"AACCCT","2":"AACCCT","3":"AACCCT","4":"AACCCT","5":"AACCCT","8":"T","9":"T"},"alt":{"0":"A","1":"A","2":"A","3":"A","4":"A","5":"A","8":"C","9":"C"},"sample_id":{"0":"A","1":"B","2":"C","3":"D","4":"E","5":"F","8":"A","9":"B"},"GT":{"0":1,"1":1,"2":1,"3":1,"4":1,"5":1,"8":1,"9":1},"GQ":{"0":79,"2":60,"3":99,"4":26,"5":62,"8":22,"9":65},"DP":{"0":12,"1":9,"2":39,"3":26,"4":9,"5":9,"8":35,"9":34}}'))
# Bare expression: presumably shown as cell output when run in a notebook.
test_df

# Relative path of the TileDB array that the rest of the script creates,
# writes to and reads from.
output_path="test.tdb"

# Context object that is passed explicitly (ctx=ctx) to the dimension, domain,
# schema, create and open calls below.
ctx = tiledb.default_ctx()
ctx

# +
# Five-dimensional domain: chrom, log10_len, start, alt, sample_id.
# The string dimensions (chrom, alt, sample_id) use dtype=np.bytes_ with an
# unbounded domain=(None, None); the integer dimensions span 0..max of their
# dtype. Note that "chrom" is given tile=1 while the other two string
# dimensions use tile=None.
genotype_domain = tiledb.Domain(
    tiledb.Dim(name="chrom", domain=(None,None), tile=1, dtype=np.bytes_, ctx=ctx),
    tiledb.Dim(name="log10_len", domain=(0, np.iinfo(np.int8).max), tile=1, dtype=np.int8, ctx=ctx),
    tiledb.Dim(name="start", domain=(0, np.iinfo(np.int32).max), tile=100000, dtype=np.int32, ctx=ctx),
    tiledb.Dim(name="alt", domain=(None,None), tile=None, dtype=np.bytes_, ctx=ctx),
    # "end" is disabled as a dimension here; it is stored as an attribute below.
#     tiledb.Dim(name="end", domain=(1, np.iinfo(np.int32).max), dtype=np.int32, ctx=ctx),
    tiledb.Dim(name="sample_id", domain=(None,None), tile=None, dtype=np.bytes_, ctx=ctx),
    ctx=ctx,
)

# Filter pipelines: Zstd (level=-1) for strings; byte-shuffle followed by
# Zstd (level=-1) for integers.
string_filters = tiledb.FilterList([tiledb.ZstdFilter(level=-1),])
int_filters = tiledb.FilterList([tiledb.ByteShuffleFilter(), tiledb.ZstdFilter(level=-1),])
# Attributes (non-dimension columns). Only GQ and DP are declared nullable.
attrs = [
    tiledb.Attr(name='end', dtype='int32', var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name='ref', dtype='S', nullable=False, filters=string_filters),
    tiledb.Attr(name='GT', dtype='int8', var=False, nullable=False, filters=int_filters),
    tiledb.Attr(name='GQ', dtype='int32', var=False, nullable=True, filters=int_filters),
    tiledb.Attr(name='DP', dtype='int32', var=False, nullable=True, filters=int_filters),
]
# -

# Sparse array schema combining the domain and attributes defined above,
# using Hilbert cell order. The capacity setting is left commented out, so
# the library default applies.
schema = tiledb.ArraySchema(
    domain=genotype_domain,
    attrs=attrs,
    sparse=True,
    cell_order="hilbert",
#     capacity=10000,
    ctx=ctx,
)
# Bare expression: presumably shown as cell output when run in a notebook.
schema

# Start from a clean slate: if an array already exists at output_path
# (e.g. from a previous run), delete its directory tree before re-creating it.
if tiledb.array_exists(output_path):
    print("Deleting array at '%s'..." % output_path)
    import shutil
    shutil.rmtree(output_path)
print("Creating array at '%s'..." % output_path)
# Create the (empty) sparse array on disk from the schema defined above.
tiledb.array.SparseArray.create(output_path, schema, ctx=ctx)

# Write test_df into the array created above. The frame is first cast so the
# attribute columns match the attribute dtypes in the schema; GQ and DP use
# pandas' nullable integer dtype 'Int32' (capital I), matching the attributes
# declared nullable=True.
# mode="append" is presumably used because the array already exists at this
# point — confirm against the tiledb.from_pandas documentation.
tiledb.from_dataframe(
    output_path,
    test_df.astype({
        'end': 'int32',
        'ref': 'S',
        'GT': 'int8',
        'GQ': 'Int32',
        'DP': 'Int32',
    }),
    sparse=True,
    mode="append"
)

# Read the whole array back as a DataFrame (result is not assigned; presumably
# shown as cell output when run in a notebook).
tiledb.open_dataframe(output_path, use_arrow=True)

# Open the array in read mode; `A` is the handle used by all indexing
# experiments that follow.
A = tiledb.open(output_path, ctx=ctx, mode='r')
A

Here is what I tried:

# Indexing experiments against the array handle `A`. The short comment above
# each statement ("works", "empty result", ...) is the reporter's observed
# outcome for that statement.

# works
A.query(use_arrow=True, coords=True).df[:]

# works
A["chr2"]

# +
# The two `.df["chr2"]` reads below are commented out because they crash the
# kernel instead of raising a Python exception.
# kernel breaks with `realloc(): invalid pointer`
# A.df["chr2"]
# kernel breaks with `realloc(): invalid pointer`
# A.query(use_arrow=True, coords=True).df["chr2"]
# -

# NOTE(review): test_df contains two rows with chrom == "chr2", and A["chr2"]
# above works, so an empty result here looks unexpected.
# empty result
A.multi_index["chr2"]

# works
A["chr1", 0]

# works
A.multi_index[:]

# NOTE(review): test_df contains two rows with log10_len == 0, so an empty
# result here looks unexpected if the second index selects log10_len == 0.
# empty result
A.multi_index[:, 0]

# NOTE(review): in test_df both "chr2" rows have log10_len == 1, so if this
# selects chrom == "chr2" and log10_len == 0 an empty result may be correct —
# confirm.
# empty result
A.multi_index["chr2", 0]

# Traceback observed for the list-of-tuples selection at the end of this cell:
# ---------------------------------------------------------------------------
# IndexError                                Traceback (most recent call last)
# <ipython-input-22-20675d6deb0c> in <module>
#      12 
#      13 # IndexError: invalid index type: <class 'list'>
# ---> 14 A[[("chr1", 0), ("chr1", 1),]]
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.SparseArrayImpl.__getitem__()
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.SparseArrayImpl.subarray()
#
# tiledb/libtiledb.pyx in tiledb.libtiledb.index_domain_subarray()
#
# IndexError: invalid index type: <class 'list'>
A[[
    ("chr1", 0), 
    ("chr1", 1),
]]
  • Is there a way to keep the Python kernel from crashing, even when TileDB is given invalid input?
  • How do I do key-based indexing, e.g. A[[("chr1", 0), ("chr1", 1),]]?
  • How do I use multi_index indexing with a DataFrame as the return type?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions