Skip to content

Commit 4dfa0e9

Browse files
committed
0.0.188
1 parent e8c8dac commit 4dfa0e9

File tree

5 files changed

+277
-21
lines changed

5 files changed

+277
-21
lines changed

orso/compute/compiled.pyx

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ cimport numpy as cnp
3030
from numpy cimport ndarray
3131
from libc.stdint cimport int32_t, int64_t
3232
from cpython.dict cimport PyDict_GetItem
33+
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
3334

3435
cnp.import_array()
3536

@@ -174,3 +175,139 @@ def process_table(table, row_factory, int max_chunksize) -> list:
174175
rows[i] = row_factory(row)
175176
i += 1
176177
return rows
178+
179+
180+
181+
# cython: language_level=3
182+
# cython: boundscheck=False
183+
# cython: wraparound=False
184+
# cython: nonecheck=False
185+
# cython: cdivision=True
186+
# cython: initializedcheck=False
187+
# cython: infer_types=True
188+
189+
import pyarrow
190+
cimport cython
191+
from libc.stdint cimport int64_t, uint8_t, int32_t
192+
from libc.stdint cimport int32_t, int64_t, uint8_t, uint64_t, uintptr_t
193+
from cpython.tuple cimport PyTuple_New, PyTuple_SET_ITEM
194+
195+
# cython: language_level=3
196+
# cython: boundscheck=False
197+
# cython: wraparound=False
198+
# cython: nonecheck=False
199+
# cython: cdivision=True
200+
# cython: initializedcheck=False
201+
# cython: infer_types=True
202+
203+
import pyarrow
204+
import struct
205+
cimport cython
206+
from libc.stdint cimport int32_t, int64_t, uint8_t
207+
208+
cpdef list _process_table(table, object row_factory, int max_chunksize):
    """
    Converts a PyArrow table into a list of transformed rows.

    Parameters:
        table: PyArrow Table
            The input table to process.
        row_factory: function
            A function applied to each row (receives a list of cell values).
        max_chunksize: int
            The batch size to process at a time.

    Returns:
        A list of transformed rows, one entry per row of the input table.
    """
    cdef list result = []
    cdef Py_ssize_t num_cols = table.num_columns
    cdef Py_ssize_t row_idx, col_idx
    cdef Py_ssize_t batch_num_rows
    cdef object chunk, buffers
    cdef const uint8_t* validity
    cdef const int32_t* int_offsets
    cdef const char* data
    cdef const char* num_data
    cdef const uint8_t* bool_data
    cdef Py_ssize_t str_start, str_end
    cdef Py_ssize_t item_size
    cdef bytes value
    cdef list row_values
    cdef uint8_t null_mask
    cdef Py_ssize_t byte_offset, bit_index

    for batch in table.to_batches(max_chunksize):
        batch_cols = batch.columns
        batch_num_rows = batch.num_rows

        # Preallocate row storage for this batch.
        batch_result = [None] * batch_num_rows

        for row_idx in range(batch_num_rows):
            # Build each row in a mutable list. PyTuple_SET_ITEM must never
            # be applied to a list object — it corrupts reference counts.
            row_values = [None] * num_cols

            # Bit-packed offsets are needed by both the validity check and
            # the boolean branch, so compute them unconditionally (the old
            # code left them stale when a column had no validity buffer).
            byte_offset = row_idx // 8
            bit_index = row_idx % 8

            for col_idx in range(num_cols):
                chunk = batch_cols[col_idx]
                buffers = chunk.buffers()

                # NOTE(review): buffer addresses are not adjusted for
                # chunk.offset — this assumes unsliced (zero-offset)
                # arrays; confirm against the callers.
                validity = <const uint8_t*><uintptr_t>buffers[0].address if buffers[0] else NULL

                if validity:
                    null_mask = validity[byte_offset] & (1 << bit_index)
                else:
                    null_mask = 1  # no validity buffer => all values valid

                if null_mask == 0:
                    continue  # NULL value: leave None in place

                if pyarrow.types.is_string(chunk.type) or pyarrow.types.is_binary(chunk.type):
                    int_offsets = <const int32_t*><uintptr_t>buffers[1].address
                    data = <const char*><uintptr_t>buffers[2].address if len(buffers) > 2 else NULL

                    str_start = int_offsets[row_idx]
                    str_end = int_offsets[row_idx + 1]

                    if str_start < str_end and data:
                        value = data[str_start:str_end]
                        row_values[col_idx] = value.decode()
                    else:
                        row_values[col_idx] = ""

                elif pyarrow.types.is_integer(chunk.type):
                    # Read the fixed-width value out of the data buffer.
                    num_data = <const char*><uintptr_t>buffers[1].address
                    item_size = chunk.type.bit_width // 8
                    value = num_data[row_idx * item_size:(row_idx + 1) * item_size]

                    if item_size == 8:  # int64
                        row_values[col_idx] = struct.unpack_from("<q", value, 0)[0]
                    elif item_size == 4:  # int32
                        row_values[col_idx] = struct.unpack_from("<i", value, 0)[0]
                    elif item_size == 2:  # int16
                        row_values[col_idx] = struct.unpack_from("<h", value, 0)[0]
                    elif item_size == 1:  # int8
                        row_values[col_idx] = struct.unpack_from("<b", value, 0)[0]

                elif pyarrow.types.is_floating(chunk.type):
                    # Fall back to PyArrow's scalar conversion for floats.
                    row_values[col_idx] = chunk[row_idx].as_py()

                elif pyarrow.types.is_boolean(chunk.type):
                    # Booleans are bit-packed in the data buffer.
                    bool_data = <const uint8_t*><uintptr_t>buffers[1].address if buffers[1] else NULL
                    row_values[col_idx] = bool((bool_data[byte_offset] & (1 << bit_index)) != 0)

                # Unsupported types are left as None.

            # Apply the caller's transformation and keep the row — the old
            # code printed the row and discarded it, always returning [].
            batch_result[row_idx] = row_factory(row_values)

        result.extend(batch_result)

    return result

orso/schema.py

Lines changed: 95 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@
5151
5252
"""
5353

54+
import re
5455
from collections import defaultdict
5556
from dataclasses import _MISSING_TYPE
5657
from dataclasses import asdict
@@ -123,6 +124,46 @@ def load(cls: Type["Expectation"], serialized: Union[Dict[str, Any], str]) -> "E
123124
)
124125

125126

127+
def _parse_type(type_str: str) -> Union[str, Tuple[str, Tuple[int, ...]]]:
128+
"""
129+
Parses a SQL type string into its base type and optional parameters.
130+
131+
Parameters:
132+
type_str (str): The type definition string (e.g., 'DECIMAL(10,2)', 'VARCHAR[255]', 'ARRAY<VARCHAR>').
133+
134+
Returns:
135+
Union[str, Tuple[str, Tuple[int, ...]]]:
136+
- Just the base type (e.g., "INTEGER", "TEXT").
137+
- A tuple with the base type and a tuple of integer parameters if applicable (e.g., ("DECIMAL", (10, 2))).
138+
"""
139+
140+
# Match ARRAY<TYPE>
141+
array_match = re.match(r"ARRAY<([\w\s]+)>", type_str)
142+
if array_match:
143+
return "ARRAY", (array_match.group(1),)
144+
145+
# Match DECIMAL(p,s)
146+
decimal_match = re.match(r"DECIMAL\((\d+),\s*(\d+)\)", type_str)
147+
if decimal_match:
148+
precision, scale = map(int, decimal_match.groups())
149+
return "DECIMAL", (precision, scale)
150+
151+
# Match VARCHAR[n]
152+
varchar_match = re.match(r"VARCHAR\[(\d+)\]", type_str)
153+
if varchar_match:
154+
length = int(varchar_match.group(1))
155+
return "VARCHAR", (length,)
156+
157+
# Match BLOB[n]
158+
blob_match = re.match(r"BLOB\[(\d+)\]", type_str)
159+
if blob_match:
160+
size = int(blob_match.group(1))
161+
return "BLOB", (size,)
162+
163+
# If no parameters, return base type as a string
164+
return type_str.upper()
165+
166+
126167
@dataclass(init=False)
127168
class FlatColumn:
128169
"""
@@ -134,12 +175,14 @@ class FlatColumn:
134175
name: str
135176
default: Optional[Any] = None
136177
type: OrsoTypes = OrsoTypes._MISSING_TYPE
178+
subtype: Optional[OrsoTypes] = None
137179
description: Optional[str] = None
138180
disposition: Optional[ColumnDisposition] = None
139181
aliases: Optional[List[str]] = field(default_factory=list) # type: ignore
140182
nullable: bool = True
141183
expectations: Optional[Expectation] = field(default_factory=list)
142184
identity: str = field(default_factory=random_string)
185+
length: Optional[int] = None
143186
precision: Optional[int] = None
144187
scale: Optional[int] = None
145188
origin: Optional[List[str]] = field(default_factory=list)
@@ -178,25 +221,58 @@ def __init__(self, **kwargs):
178221
# map literals to OrsoTypes
179222
if self.type.__class__ is not OrsoTypes:
180223
type_name = str(self.type).upper()
181-
if type_name in OrsoTypes.__members__:
182-
self.type = OrsoTypes[type_name]
183-
elif type_name == "LIST":
184-
warn("Column type LIST will be deprecated in a future version, use ARRAY instead.")
185-
self.type = OrsoTypes.ARRAY
186-
elif type_name == "NUMERIC":
187-
warn(
188-
"Column type NUMERIC will be deprecated in a future version, use DECIMAL, DOUBLE or INTEGER instead. Mapped to DOUBLE, this may not be compatible with all values NUMERIC was compatible with."
189-
)
190-
self.type = OrsoTypes.DOUBLE
191-
elif type_name == "BSON":
192-
warn("Column type BSON will be deprecated in a future version, use JSONB instead.")
193-
self.type = OrsoTypes.JSONB
194-
elif type_name == "STRING":
195-
raise ValueError(
196-
f"Unknown column type '{self.type}' for column '{self.name}'. Did you mean 'VARCHAR'?"
197-
)
198-
elif self.type != 0:
224+
parsed_types = _parse_type(type_name)
225+
if isinstance(parsed_types, str):
226+
if parsed_types == "ARRAY":
227+
warn("Column type ARRAY without subtype, defaulting to VARCHAR.")
228+
self.type = OrsoTypes.ARRAY
229+
self.subtype = OrsoTypes.VARCHAR
230+
elif parsed_types in OrsoTypes.__members__:
231+
self.type = OrsoTypes[parsed_types]
232+
elif parsed_types == "LIST":
233+
warn(
234+
"Column type LIST will be deprecated in a future version, use ARRAY instead."
235+
)
236+
self.type = OrsoTypes.ARRAY
237+
elif parsed_types == "NUMERIC":
238+
warn(
239+
"Column type NUMERIC will be deprecated in a future version, use DECIMAL, DOUBLE or INTEGER instead. Mapped to DOUBLE, this may not be compatible with all values NUMERIC was compatible with."
240+
)
241+
self.type = OrsoTypes.DOUBLE
242+
elif parsed_types == "BSON":
243+
warn(
244+
"Column type BSON will be deprecated in a future version, use JSONB instead."
245+
)
246+
self.type = OrsoTypes.JSONB
247+
elif parsed_types == "STRING":
248+
raise ValueError(
249+
f"Unknown column type '{self.type}' for column '{self.name}'. Did you mean 'VARCHAR'?"
250+
)
251+
elif type_name == "0":
252+
self.type = 0
253+
else:
254+
raise ValueError(f"Unknown column type '{self.type}' for column '{self.name}'.")
255+
elif parsed_types[0] == "ARRAY":
256+
subtype = parsed_types[1][0]
257+
if subtype in ("ARRAY", "LIST", "NUMERIC", "BSON", "STRING"):
258+
raise ValueError(f"Invalid subtype '{subtype}' for ARRAY type.")
259+
if subtype in OrsoTypes.__members__:
260+
self.type = OrsoTypes.ARRAY
261+
self.subtype = OrsoTypes[subtype]
262+
else:
263+
raise ValueError(f"Unknown column type '{subtype}' for column '{self.name}'.")
264+
elif parsed_types[0] == "DECIMAL":
265+
self.type = OrsoTypes.DECIMAL
266+
self.precision, self.scale = parsed_types[1]
267+
elif parsed_types[0] == "VARCHAR":
268+
self.type = OrsoTypes.VARCHAR
269+
self.length = parsed_types[1][0]
270+
elif parsed_types[0] == "BLOB":
271+
self.type = OrsoTypes.BLOB
272+
self.length = parsed_types[1][0]
273+
else:
199274
raise ValueError(f"Unknown column type '{self.type}' for column '{self.name}'.")
275+
# validate decimal properties
200276

201277
if self.type == OrsoTypes.DECIMAL and self.precision is None:
202278
from decimal import getcontext
@@ -271,6 +347,7 @@ def to_flatcolumn(self) -> "FlatColumn":
271347
aliases=self.aliases,
272348
identity=self.identity,
273349
type=self.type,
350+
subtype=self.subtype,
274351
nullable=self.nullable,
275352
scale=self.scale,
276353
precision=self.precision,

orso/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,5 @@
1010
# See the License for the specific language governing permissions and
1111
# limitations under the License.
1212

13-
__version__: str = "0.0.187"
13+
__version__: str = "0.0.188"
1414
__author__: str = "@joocer"

tests/test_schema.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,49 @@ def test_parsers():
335335
assert isinstance(parsed, str), type(parsed)
336336
assert parsed == "1718530754", parsed
337337

338-
338+
def test_type_name_parsing():
    # Parameter-free type names resolve directly to OrsoTypes members.
    for type_name, expected in (
        ("INTEGER", OrsoTypes.INTEGER),
        ("VARCHAR", OrsoTypes.VARCHAR),
        ("BLOB", OrsoTypes.BLOB),
        ("DOUBLE", OrsoTypes.DOUBLE),
        ("DECIMAL", OrsoTypes.DECIMAL),
        ("BOOLEAN", OrsoTypes.BOOLEAN),
        ("TIMESTAMP", OrsoTypes.TIMESTAMP),
    ):
        column = FlatColumn(name="col", type=type_name)
        assert column.type == expected, column.type
        if type_name in ("VARCHAR", "BLOB"):
            # Unparameterized sized types carry no length.
            assert column.length is None

    # A bare ARRAY defaults its element subtype to VARCHAR.
    column = FlatColumn(name="col", type="ARRAY")
    assert column.type == OrsoTypes.ARRAY, column.type
    assert column.subtype == OrsoTypes.VARCHAR, column.subtype

    # ARRAY<...> carries an explicit element subtype.
    for subtype_name, expected_subtype in (
        ("INTEGER", OrsoTypes.INTEGER),
        ("VARCHAR", OrsoTypes.VARCHAR),
    ):
        column = FlatColumn(name="col", type=f"ARRAY<{subtype_name}>")
        assert column.type == OrsoTypes.ARRAY, column.type
        assert column.subtype == expected_subtype, column.subtype

    # Malformed or unsupported array subtypes are rejected.
    for bad_type in ("ARRAY<A", "ARRAY<BIT>", "ARRAY<ARRAY>"):
        with pytest.raises(ValueError):
            FlatColumn(name="col", type=bad_type)

    # DECIMAL(p,s) captures precision and scale.
    column = FlatColumn(name="col", type="DECIMAL(10,2)")
    assert column.type == OrsoTypes.DECIMAL, column.type
    assert column.precision == 10, column.precision
    assert column.scale == 2, column.scale

    # VARCHAR[n] and BLOB[n] capture a length.
    for sized_type in ("VARCHAR", "BLOB"):
        column = FlatColumn(name="col", type=f"{sized_type}[12]")
        assert column.type == OrsoTypes[sized_type], column.type
        assert column.length == 12, column.length
380+
339381
if __name__ == "__main__":  # pragma: nocover
340382
from tests import run_tests
341383

tests/test_schema_columns.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ def test_column_type_mapping():
135135
assert fc.type == OrsoTypes.DOUBLE, fc.type
136136

137137
fc = FlatColumn(name="athled", type=0)
138-
assert fc.type == 0
138+
assert fc.type == 0, fc.type
139139

140140
with pytest.raises(ValueError):
141141
FlatColumn(name="able", type="LEFT")

0 commit comments

Comments
 (0)