Addition of from_numpy Support for mode={ingest,schema_only,append} (#1185)

nguyenv · web-flow · commit 353f2d1c5724 · 2022-07-08T08:11:51.000-05:00
* Addition of `from_numpy` Support for `mode={ingest,schema_only,append}`

* Addition of `start_idx` Parameter for `from_numpy`

* Only Call `nonempty_domain` In Append Mode
diff --git a/HISTORY.md b/HISTORY.md
@@ -9,6 +9,7 @@
 ## API Changes
 * Support `QueryCondition` for dense arrays [#1198](https://github.com/TileDB-Inc/TileDB-Py/pull/1198)
 * Querying dense array with `[:]` returns shape that matches nonempty domain, consistent with `.df[:]` and `.multi_index[:]` [#1199](https://github.com/TileDB-Inc/TileDB-Py/pull/1199)
+* Addition of `from_numpy` support for `mode={ingest,schema_only,append}` [#1185](https://github.com/TileDB-Inc/TileDB-Py/pull/1185)
 
 # TileDB-Py 0.16.1 Release Notes
 
diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py
@@ -405,7 +405,6 @@ def from_pandas(uri: str, dataframe: "pd.DataFrame", **kwargs):
 
     :param uri: URI for new TileDB array
     :param dataframe: pandas DataFrame
-    :param mode: Creation mode, one of 'ingest' (default), 'schema_only', 'append'
 
     :Keyword Arguments:
 
@@ -416,7 +415,7 @@ def from_pandas(uri: str, dataframe: "pd.DataFrame", **kwargs):
                           which `tiledb.read_csv` checks for in order to correctly read a file batchwise.
         * **index_dims** (``List[str]``) -- List of column name(s) to use as dimension(s) in TileDB array schema. This is the recommended way to create dimensions.
         * **allows_duplicates** - Generated schema should allow duplicates
-        * **mode** - (default ``ingest``), Ingestion mode: ``ingest``, ``schema_only``, ``append``
+        * **mode** - Creation mode, one of 'ingest' (default), 'schema_only', 'append'
         * **attr_filters** - FilterList to apply to Attributes: FilterList or Dict[str -> FilterList] for any attribute(s). Unspecified attributes will use default.
         * **dim_filters** - FilterList to apply to Dimensions: FilterList or Dict[str -> FilterList] for any dimensions(s). Unspecified dimensions will use default.
         * **offsets_filters** - FilterList to apply to all offsets
@@ -457,7 +456,7 @@ def _from_pandas(uri, dataframe, tiledb_args):
     mode = tiledb_args.get("mode", "ingest")
 
     if mode != "append" and tiledb.array_exists(uri):
-        raise TileDBError("Array URI '{}' already exists!".format(uri))
+        raise TileDBError(f"Array URI '{uri}' already exists!")
 
     sparse = tiledb_args["sparse"]
     index_dims = tiledb_args.get("index_dims") or ()
@@ -476,7 +475,7 @@ def _from_pandas(uri, dataframe, tiledb_args):
                     "Cannot append to dense array without 'row_start_idx'"
                 )
         elif mode != "ingest":
-            raise TileDBError("Invalid mode specified ('{}')".format(mode))
+            raise TileDBError(f"Invalid mode specified ('{mode}')")
 
     # TODO: disentangle the full_domain logic
     full_domain = tiledb_args.get("full_domain", False)
@@ -696,7 +695,7 @@ def from_csv(uri: str, csv_file: Union[str, List[str]], **kwargs):
         * **sparse** - (default True) Create sparse schema
         * **index_dims** (``List[str]``) -- List of column name(s) to use as dimension(s) in TileDB array schema. This is the recommended way to create dimensions. (note: the Pandas ``read_csv`` argument ``index_col`` will be passed through if provided, which results in indexes that will be converted to dimnesions by default; however ``index_dims`` is preferred).
         * **allows_duplicates** - Generated schema should allow duplicates
-        * **mode** - (default ``ingest``), Ingestion mode: ``ingest``, ``schema_only``, ``append``
+        * **mode** - Creation mode, one of 'ingest' (default), 'schema_only', 'append'
         * **attr_filters** - FilterList to apply to Attributes: FilterList or Dict[str -> FilterList] for any attribute(s). Unspecified attributes will use default.
         * **dim_filters** - FilterList to apply to Dimensions: FilterList or Dict[str -> FilterList] for any dimensions(s). Unspecified dimensions will use default.
         * **offsets_filters** - FilterList to apply to all offsets
diff --git a/tiledb/highlevel.py b/tiledb/highlevel.py
@@ -77,6 +77,14 @@ def from_numpy(uri, array, config=None, ctx=None, **kwargs):
     :raises TypeError: cannot convert ``uri`` to unicode string
     :raises: :py:exc:`tiledb.TileDBError`
 
+    :Keyword Arguments:
+
+        * **full_domain** - Dimensions should be created with full range of the dtype (default: False)
+        * **mode** - Creation mode, one of 'ingest' (default), 'schema_only', 'append'
+        * **append_dim** - The dimension along which the Numpy array is appended (default: 0).
+        * **start_idx** - The starting index to append to. By default, append to the end of the existing data.
+        * **timestamp** - Write the TileDB array at a specific timestamp.
+
     **Example:**
 
     >>> import tiledb, numpy as np, tempfile
diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
@@ -11,6 +11,7 @@ import html
 import sys
 import warnings
 from collections import OrderedDict
+from collections.abc import Sequence
 
 from .ctx import default_ctx
 from .filter import FilterList
@@ -240,13 +241,48 @@ def schema_like_numpy(array, ctx=None, **kw):
     tiling = regularize_tiling(kw.pop('tile', None), array.ndim)
 
     attr_name = kw.pop('attr_name', '')
-    dim_dtype = kw.pop('dim_dtype', np.uint64)
+    dim_dtype = kw.pop('dim_dtype', np.dtype("uint64"))
+    full_domain = kw.pop('full_domain', False)
     dims = []
+
     for (dim_num,d) in enumerate(range(array.ndim)):
         # support smaller tile extents by kw
         # domain is based on full shape
         tile_extent = tiling[d] if tiling else array.shape[d]
-        domain = (0, array.shape[d] - 1)
+        if full_domain:
+            if dim_dtype not in (np.bytes_, np.str_):
+                # Use the full type domain, deferring to the constructor
+                dtype_min, dtype_max = dtype_range(dim_dtype)
+                dim_max = dtype_max
+                if dim_dtype.kind == "M":
+                    date_unit = np.datetime_data(dim_dtype)[0]
+                    dim_min = np.datetime64(dtype_min, date_unit)
+                    tile_max = np.iinfo(np.uint64).max - tile_extent
+                    if np.uint64(dtype_max - dtype_min) > tile_max:
+                        dim_max = np.datetime64(dtype_max - tile_extent, date_unit)
+                else:
+                    dim_min = dtype_min
+
+                if np.issubdtype(dim_dtype, np.integer):
+                    tile_max = np.iinfo(np.uint64).max - tile_extent
+                    if np.uint64(dtype_max - dtype_min) > tile_max:
+                        dim_max = dtype_max - tile_extent
+                domain = (dim_min, dim_max)
+            else:
+                domain = (None, None)
+
+            if np.issubdtype(dim_dtype, np.integer) or dim_dtype.kind == "M":
+                # we can't make a tile larger than the dimension range or lower than 1
+                tile_extent = max(1, min(tile_extent, np.uint64(dim_max - dim_min)))
+            elif np.issubdim_dtype(dim_dtype, np.floating):
+                # this difference can be inf
+                with np.errstate(over="ignore"):
+                    dim_range = dim_max - dim_min
+                if dim_range < tile_extent:
+                    tile_extent = np.ceil(dim_range)
+        else:
+            domain = (0, array.shape[d] - 1)
+
         dims.append(Dim(domain=domain, tile=tile_extent, dtype=dim_dtype, ctx=ctx))
 
     var = False
@@ -4240,7 +4276,7 @@ cdef class DenseArrayImpl(Array):
     def __init__(self, *args, **kw):
         super().__init__(*args, **kw)
         if self.schema.sparse:
-            raise ValueError("Array at {} is not a dense array".format(self.uri))
+            raise ValueError(f"Array at {self.uri} is not a dense array")
         return
 
     @staticmethod
@@ -4250,20 +4286,38 @@ cdef class DenseArrayImpl(Array):
         """
         if not ctx:
             ctx = default_ctx()
+        
+        mode = kw.pop("mode", "ingest")
+        timestamp = kw.pop("timestamp", None)
 
-        # pop the write timestamp before creating schema
-        timestamp = kw.pop('timestamp', None)
-
-        schema = schema_like_numpy(array, ctx=ctx, **kw)
-        Array.create(uri, schema)
+        if mode not in ("ingest", "schema_only", "append"):
+            raise TileDBError(f"Invalid mode specified ('{mode}')")
 
+        if mode in ("ingest", "schema_only"):
+            try:
+                with Array.load_typed(uri):
+                    raise TileDBError(f"Array URI '{uri}' already exists!")
+            except TileDBError:
+                pass
+        
+        if mode == "append":
+            kw["append_dim"] = kw.get("append_dim", 0)
+            if ArraySchema.load(uri).sparse:
+                raise TileDBError("Cannot append to sparse array")
+
+        if mode in ("ingest", "schema_only"):
+            schema = schema_like_numpy(array, ctx=ctx, **kw)
+            Array.create(uri, schema)
+
+        if mode in ("ingest", "append"):
+            kw["mode"] = mode
+            with DenseArray(uri, mode='w', ctx=ctx, timestamp=timestamp) as arr:
+                # <TODO> probably need better typecheck here
+                if array.dtype == object:
+                    arr[:] = array
+                else:
+                    arr.write_direct(np.ascontiguousarray(array), **kw)
 
-        with DenseArray(uri, mode='w', ctx=ctx, timestamp=timestamp) as arr:
-            # <TODO> probably need better typecheck here
-            if array.dtype == object:
-                arr[:] = array
-            else:
-                arr.write_direct(np.ascontiguousarray(array))
         return DenseArray(uri, mode='r', ctx=ctx)
 
     def __len__(self):
@@ -4687,7 +4741,7 @@ cdef class DenseArrayImpl(Array):
             return array.astype(dtype)
         return array
 
-    def write_direct(self, np.ndarray array not None):
+    def write_direct(self, np.ndarray array not None, **kw):
         """
         Write directly to given array attribute with minimal checks,
         assumes that the numpy array is the same shape as the array's domain
@@ -4698,6 +4752,10 @@ cdef class DenseArrayImpl(Array):
         :raises: :py:exc:`tiledb.TileDBError`
 
         """
+        append_dim = kw.pop("append_dim", None)
+        mode = kw.pop("mode", "ingest")
+        start_idx = kw.pop("start_idx", None)
+
         if not self.isopen or self.mode != 'w':
             raise TileDBError("DenseArray is not opened for writing")
         if self.schema.nattr != 1:
@@ -4715,6 +4773,7 @@ cdef class DenseArrayImpl(Array):
 
         cdef void* buff_ptr = np.PyArray_DATA(array)
         cdef uint64_t buff_size = array.nbytes
+        cdef np.ndarray subarray = np.zeros(2*array.ndim, np.uint64)
 
         use_global_order = self.ctx.config().get("py.use_global_order_1d_write", False) == "true"
 
@@ -4733,13 +4792,69 @@ cdef class DenseArrayImpl(Array):
             rc = tiledb_query_set_layout(ctx_ptr, query_ptr, layout)
             if rc != TILEDB_OK:
                 _raise_ctx_err(ctx_ptr, rc)
-            rc = tiledb_query_set_buffer(ctx_ptr, query_ptr, attr_name_ptr, buff_ptr, &buff_size)
+
+            range_start_idx = start_idx or 0
+            for n in range(array.ndim):
+                subarray[n*2] = range_start_idx
+                subarray[n*2 + 1] = array.shape[n] + range_start_idx - 1
+
+            if mode == "append":
+                with Array.load_typed(self.uri) as A:
+                    ned = A.nonempty_domain()
+
+                if array.ndim <= append_dim:
+                    raise IndexError("`append_dim` out of range")
+                
+                if array.ndim != len(ned):
+                    raise ValueError(
+                        "The number of dimension of the TileDB array and "
+                        "Numpy array to append do not match"
+                    )
+
+                for n in range(array.ndim): 
+                    if n == append_dim:
+                        if start_idx is not None:
+                            range_start_idx = start_idx 
+                            range_end_idx = array.shape[n] + start_idx -1
+                        else:
+                            range_start_idx = ned[n][1] + 1
+                            range_end_idx = array.shape[n] + ned[n][1]
+
+                        subarray[n*2] = range_start_idx
+                        subarray[n*2 + 1] = range_end_idx
+                    else:
+                        if array.shape[n] != ned[n][1] - ned[n][0] + 1:
+                            raise ValueError(
+                                "The input Numpy array must be of the same "
+                                "shape as the TileDB array, exluding the "
+                                "`append_dim`, but the Numpy array at index "
+                                f"{n} has {array.shape[n]} dimension(s) and "
+                                f"the TileDB array has {ned[n][1]-ned[n][0]}."
+                            )
+            
+            rc = tiledb_query_set_subarray(
+                    ctx_ptr, 
+                    query_ptr, 
+                    <void*>np.PyArray_DATA(subarray)
+            )
             if rc != TILEDB_OK:
                 _raise_ctx_err(ctx_ptr, rc)
+
+            rc = tiledb_query_set_buffer(
+                    ctx_ptr, 
+                    query_ptr, 
+                    attr_name_ptr, 
+                    buff_ptr, 
+                    &buff_size
+            )
+            if rc != TILEDB_OK:
+                _raise_ctx_err(ctx_ptr, rc)
+
             with nogil:
                 rc = tiledb_query_submit(ctx_ptr, query_ptr)
             if rc != TILEDB_OK:
                 _raise_ctx_err(ctx_ptr, rc)
+
             with nogil:
                 rc = tiledb_query_finalize(ctx_ptr, query_ptr)
             if rc != TILEDB_OK:
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py