Skip to content

Commit 26fbade

Browse files
Copilot and joocer committed
Add use_dictionary parameter for parquet dictionary encoding
Co-authored-by: joocer <1688479+joocer@users.noreply.github.com>
1 parent ca54e77 commit 26fbade

File tree

2 files changed

+105
-3
lines changed

2 files changed

+105
-3
lines changed

mabel/data/writers/internals/blob_writer.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,12 +33,14 @@ def __init__(
3333
schema: Optional[RelationSchema] = None,
3434
parquet_row_group_size: int = 5000,
3535
sort_by: Optional[Union[str, List]] = None,
36+
use_dictionary: Optional[Union[bool, List[str]]] = None,
3637
**kwargs,
3738
):
3839
self.format = format
3940
self.maximum_blob_size = blob_size
4041
self.parquet_row_group_size = parquet_row_group_size
4142
self.sort_by = sort_by
43+
self.use_dictionary = use_dictionary
4244

4345
if format not in SUPPORTED_FORMATS_ALGORITHMS:
4446
raise ValueError(
@@ -172,9 +174,10 @@ def commit(self):
172174
pytable = pytable.sort_by(sort_spec)
173175

174176
tempfile = io.BytesIO()
175-
pyarrow.parquet.write_table(
176-
pytable, where=tempfile, row_group_size=self.parquet_row_group_size
177-
)
177+
write_kwargs = {"row_group_size": self.parquet_row_group_size}
178+
if self.use_dictionary is not None:
179+
write_kwargs["use_dictionary"] = self.use_dictionary
180+
pyarrow.parquet.write_table(pytable, where=tempfile, **write_kwargs)
178181

179182
tempfile.seek(0)
180183
write_buffer = tempfile.read()

tests/test_writer_parquet_features.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,105 @@ def test_parquet_sorting_single_column_list():
237237
shutil.rmtree("_temp_sort_single_list", ignore_errors=True)
238238

239239

240+
def test_parquet_dictionary_encoding_all():
    """Test that use_dictionary=True enables dictionary encoding for all columns."""
    shutil.rmtree("_temp_dict_all", ignore_errors=True)
    try:
        w = BatchWriter(
            inner_writer=DiskWriter,
            dataset="_temp_dict_all",
            format="parquet",
            # utcnow() is deprecated since Python 3.12; use an aware UTC datetime.
            date=datetime.datetime.now(datetime.timezone.utc).date(),
            schema=[
                {"name": "id", "type": "INTEGER"},
                {"name": "category", "type": "VARCHAR"},
            ],
            use_dictionary=True,  # enable dictionary encoding for all columns
        )

        # Repeated category values are a good fit for dictionary encoding.
        for i in range(100):
            w.append({"id": i, "category": f"category_{i % 5}"})

        w.finalize()

        # Read back and verify the records round-trip intact.
        r = Reader(inner_reader=DiskReader, dataset="_temp_dict_all")
        records = list(r)
        assert len(records) == 100, f"Expected 100 records, got {len(records)}"
    finally:
        # Clean up even when an assertion fails, so reruns start fresh.
        shutil.rmtree("_temp_dict_all", ignore_errors=True)
268+
269+
270+
def test_parquet_dictionary_encoding_disabled():
    """Test that use_dictionary=False disables dictionary encoding."""
    shutil.rmtree("_temp_dict_disabled", ignore_errors=True)
    try:
        w = BatchWriter(
            inner_writer=DiskWriter,
            dataset="_temp_dict_disabled",
            format="parquet",
            # utcnow() is deprecated since Python 3.12; use an aware UTC datetime.
            date=datetime.datetime.now(datetime.timezone.utc).date(),
            schema=[
                {"name": "id", "type": "INTEGER"},
                {"name": "category", "type": "VARCHAR"},
            ],
            use_dictionary=False,  # disable dictionary encoding
        )

        # Write records (same data shape as the enabled-encoding test).
        for i in range(100):
            w.append({"id": i, "category": f"category_{i % 5}"})

        w.finalize()

        # Read back and verify the records round-trip intact.
        r = Reader(inner_reader=DiskReader, dataset="_temp_dict_disabled")
        records = list(r)
        assert len(records) == 100, f"Expected 100 records, got {len(records)}"
    finally:
        # Clean up even when an assertion fails, so reruns start fresh.
        shutil.rmtree("_temp_dict_disabled", ignore_errors=True)
298+
299+
300+
def test_parquet_dictionary_encoding_specific_columns():
    """Test that use_dictionary can name specific columns for dictionary encoding."""
    shutil.rmtree("_temp_dict_specific", ignore_errors=True)
    try:
        w = BatchWriter(
            inner_writer=DiskWriter,
            dataset="_temp_dict_specific",
            format="parquet",
            # utcnow() is deprecated since Python 3.12; use an aware UTC datetime.
            date=datetime.datetime.now(datetime.timezone.utc).date(),
            schema=[
                {"name": "id", "type": "INTEGER"},
                {"name": "category", "type": "VARCHAR"},
                {"name": "value", "type": "VARCHAR"},
            ],
            use_dictionary=["category"],  # only encode 'category' with a dictionary
        )

        # 'category' repeats (dictionary-friendly); 'value' is unique per row.
        for i in range(100):
            w.append({
                "id": i,
                "category": f"category_{i % 5}",
                "value": f"unique_value_{i}",
            })

        w.finalize()

        # Read back and verify the records round-trip intact.
        r = Reader(inner_reader=DiskReader, dataset="_temp_dict_specific")
        records = list(r)
        assert len(records) == 100, f"Expected 100 records, got {len(records)}"

        # Spot-check the cyclic category values (i % 5 pattern).
        assert records[0]["category"] == "category_0"
        assert records[50]["category"] == "category_0"
    finally:
        # Clean up even when an assertion fails, so reruns start fresh.
        shutil.rmtree("_temp_dict_specific", ignore_errors=True)
337+
338+
240339
if __name__ == "__main__": # pragma: no cover
241340
from tests.helpers.runner import run_tests
242341

0 commit comments

Comments
 (0)