Skip to content

Commit ca17bc4

Browse files
committed
feat: add column bytes
1 parent c782d83 commit ca17bc4

File tree

3 files changed

+142
-21
lines changed

3 files changed

+142
-21
lines changed

opteryx_catalog/catalog/dataset.py

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,7 @@ def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
955955
"file_hist_infos": [],
956956
"min_displays": [],
957957
"max_displays": [],
958+
"uncompressed_bytes": 0,
958959
}
959960

960961
total_rows = 0
@@ -1007,6 +1008,7 @@ def _decode_minmax(v):
10071008
xv = ent.get("max_values") or []
10081009
mv_disp = ent.get("min_values_display") or []
10091010
xv_disp = ent.get("max_values_display") or []
1011+
col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []
10101012

10111013
for cname, cidx in col_to_idx.items():
10121014
# nulls
@@ -1091,6 +1093,12 @@ def _decode_display(v):
10911093
except Exception:
10921094
pass
10931095

1096+
# uncompressed bytes for this column (sum across files)
1097+
try:
1098+
stats[cname]["uncompressed_bytes"] += int((col_sizes or [0])[cidx])
1099+
except Exception:
1100+
pass
1101+
10941102
# Build results per column
10951103
results: dict[str, dict] = {}
10961104
for cname, cidx in col_to_idx.items():
@@ -1212,6 +1220,7 @@ def _decode_display(v):
12121220
"min": global_min,
12131221
"max": global_max,
12141222
"null_count": s["null_count"],
1223+
"uncompressed_bytes": s["uncompressed_bytes"],
12151224
"approx_cardinality": approx_cardinality,
12161225
"distribution": distribution,
12171226
}
@@ -1229,24 +1238,6 @@ def _decode_display(v):
12291238
except Exception:
12301239
is_text = False
12311240

1232-
def _int_to_prefix(v, max_chars=16):
1233-
try:
1234-
if not isinstance(v, int):
1235-
return None
1236-
if v == 0:
1237-
return None
1238-
blen = (v.bit_length() + 7) // 8
1239-
blen = max(blen, 1)
1240-
b = v.to_bytes(blen, "big")
1241-
b = b.strip(b"\x00")
1242-
if not b:
1243-
return None
1244-
s = b.decode("utf-8", errors="replace")
1245-
s = s.rstrip("\x00")
1246-
return s[:16]
1247-
except Exception:
1248-
return None
1249-
12501241
if is_text:
12511242
# Use only textual display values collected from manifests.
12521243
# Decode bytes and strip truncation marker (0xFF) if present.
@@ -1325,8 +1316,9 @@ def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional
13251316
import pyarrow as pa
13261317
import pyarrow.parquet as pq
13271318

1328-
prev_table = pq.read_table(pa.BufferReader(prev_data))
1329-
prev_rows = prev_table.to_pylist()
1319+
# the manifest is a parquet file, read into a pyarrow Table
1320+
prev_manifest = pq.read_table(pa.BufferReader(prev_data))
1321+
prev_rows = prev_manifest.to_pylist()
13301322
except Exception:
13311323
prev_rows = []
13321324

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "opteryx-catalog"
7-
version = "0.4.19"
7+
version = "0.4.20"
88
description = "Opteryx Cloud Catalog"
99
readme = { file = "README.md", content-type = "text/markdown" }
1010
authors = [
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
import io
2+
3+
import pyarrow as pa
4+
import pyarrow.parquet as pq
5+
6+
from opteryx_catalog.catalog.dataset import SimpleDataset
7+
from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
8+
9+
10+
class _MemInput:
11+
def __init__(self, data: bytes):
12+
self._data = data
13+
14+
def open(self):
15+
# Provide a file-like BytesIO which .read() returns the bytes
16+
return io.BytesIO(self._data)
17+
18+
19+
class _MemIO:
    """Minimal IO layer resolving paths against an in-memory dict."""

    def __init__(self, mapping: dict):
        self._mapping = mapping

    def new_input(self, path: str):
        """Look up *path* and wrap its bytes in a ``_MemInput``."""
        blob = self._mapping[path]
        return _MemInput(blob)
25+
26+
27+
def _build_manifest_bytes():
    """Serialize a two-entry parquet manifest (two columns per file) to bytes."""
    int_list = pa.list_(pa.int64())
    str_list = pa.list_(pa.string())

    # Manifest column name -> (arrow type, values for the two file entries).
    columns = {
        "file_path": (pa.string(), ["f1.parquet", "f2.parquet"]),
        "file_format": (pa.string(), ["parquet", "parquet"]),
        "record_count": (pa.int64(), [10, 20]),
        "file_size_in_bytes": (pa.int64(), [100, 200]),
        "uncompressed_size_in_bytes": (pa.int64(), [1000, 2000]),
        "column_uncompressed_sizes_in_bytes": (int_list, [[100, 400], [300, 200]]),
        "null_counts": (int_list, [[0, 0], [0, 0]]),
        "min_k_hashes": (int_list, [[1, 2], [1]]),
        "histogram_counts": (int_list, [[1, 2], [3, 4]]),
        "histogram_bins": (pa.int64(), [32, 32]),
        "min_values": (int_list, [[10, 20], [5, 30]]),
        "max_values": (int_list, [[100, 400], [300, 200]]),
        "min_values_display": (str_list, [[None, None], [None, None]]),
        "max_values_display": (str_list, [[None, None], [None, None]]),
    }

    schema = pa.schema([(name, typ) for name, (typ, _) in columns.items()])
    arrays = [pa.array(values, type=typ) for typ, values in columns.values()]
    table = pa.Table.from_arrays(arrays, schema=schema)

    buf = io.BytesIO()
    pq.write_table(table, buf)
    return buf.getvalue()
88+
89+
90+
def test_describe_includes_uncompressed_bytes():
    """describe() should sum per-column uncompressed sizes across manifest files."""
    manifest_path = "mem://manifest"
    manifest_bytes = _build_manifest_bytes()

    meta = DatasetMetadata(
        dataset_identifier="tests_temp.test",
        location="mem://",
        schema=None,
        properties={},
    )

    # Two-column schema so describe() can map column names -> manifest indices.
    meta.schemas.append(
        {"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]}
    )
    meta.current_schema_id = "s1"

    # Current snapshot references the in-memory manifest built above.
    meta.snapshots.append(
        Snapshot(snapshot_id=1, timestamp_ms=1, manifest_list=manifest_path)
    )
    meta.current_snapshot_id = 1

    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
    # Inject the in-memory IO mapping in place of real storage.
    ds.io = _MemIO({manifest_path: manifest_bytes})

    desc = ds.describe()

    # 'a': 100 + 300 bytes across the two files; 'b': 400 + 200.
    for name, expected in (("a", 400), ("b", 600)):
        assert name in desc
        assert desc[name]["uncompressed_bytes"] == expected

0 commit comments

Comments
 (0)