Skip to content

Commit e669b3b

Browse files
Merge pull request #34 from praekeltfoundation/manage_large_data_volumes
Manage large data volumes
2 parents 7de2810 + de7e82f commit e669b3b

File tree

8 files changed

+456
-422
lines changed

8 files changed

+456
-422
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
<img src="https://github.com/praekeltfoundation/rdw-ingestion-tools/workflows/lint/badge.svg" width="120" />
88
<img src="https://github.com/praekeltfoundation/rdw-ingestion-tools/workflows/release/badge.svg" width="145" />
99
<img src="https://github.com/praekeltfoundation/rdw-ingestion-tools/workflows/test/badge.svg" width="120" />
10-
<img src="https://img.shields.io/badge/version-2.0.5.dev0-orange" width="145" />
10+
<img src="https://img.shields.io/badge/version-2.0.5-orange" width="145" />
1111
<img src="https://img.shields.io/badge/license-MIT-blue" width="100" />
1212
</p>
1313
</div>

examples/rapidpro/contacts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from api.rapidpro import pyRapid
22

33
contacts = pyRapid().contacts.get_contacts(
4-
end_datetime="2023-01-01 01:00:50", start_datetime="2023-01-01 00:00:00"
4+
end_datetime="2025-10-16 00:00:00", start_datetime="2025-10-15 00:00:00"
55
)
66

77
print(contacts.collect())

examples/rapidpro/flowstarts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from api.rapidpro import pyRapid
22

33
flowstarts = pyRapid().flow_starts.get_flowstarts(
4-
end_datetime="2023-01-02 00:00:00", start_datetime="2023-01-01 00:00:00"
4+
end_datetime="2025-10-16 00:00:00", start_datetime="2025-10-15 00:00:00"
55
)
66

77
print(flowstarts.collect())

examples/rapidpro/runs.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from api.rapidpro import pyRapid
22

33
runs = pyRapid().runs.get_runs(
4-
end_datetime="2024-06-22 00:00:10", start_datetime="2024-06-22 00:00:00"
4+
end_datetime="2025-10-16 00:00:00", start_datetime="2025-10-15 00:00:00"
55
)
66

77
print(runs.collect())

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "rdw-ingestion-tools"
3-
version = "2.0.5.dev0"
3+
version = "2.0.5"
44
description = "A Python package for integrating third-party data to Reach Digital Health's AWS Data Lake."
55
authors = [
66
{name = "Schalk <schalk@reachdigitalhealth.org>"},
@@ -14,6 +14,7 @@ dependencies = [
1414
"boto3>=1.34.103",
1515
"httpx>=0.27.0",
1616
"httpx-retries>=0.4.2",
17+
"more-itertools>=10.8.0",
1718
"pandas>=2.2.2",
1819
"types-tqdm>=4.66.0.20240417",
1920
]

rdw_ingestion_tools/api/__init__.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import os
22
from collections.abc import Iterator
33

4+
from more_itertools import chunked
45
from pandas import DataFrame
56
from pandas import json_normalize as pd_json_normalize
67
from polars import (
@@ -71,9 +72,10 @@ def get_polars_schema(
7172
object_columns: list[str], data: list[dict[str, Object]]
7273
) -> dict[str, Object]:
7374
"""
74-
Creates a normalised LazyFrame and uses the schema to generate a schema
75-
dictionary using the column names.
76-
Columns that are `list` types need to be type `Object` before they can be cast
75+
Creates a normalised LazyFrame. Returns a schema dictionary using the
76+
LazyFrame's column names.
77+
78+
Note: Columns that are `list` types need to be type `Object` before they can be cast
7779
to string.
7880
All other column types can be cast directly to string using the schema generated.
7981
"""
@@ -94,17 +96,22 @@ def get_polars_schema(
9496
def concatenate_to_string_lazyframe(
9597
objs: list[dict] | dict[Never, Never] | list[Never] | Iterator,
9698
object_columns: list[str],
99+
batch_size: int = 2000,
97100
) -> LazyFrame:
98101
"""
99-
Flattens JSON data. Returns a LazyFrame with String columns.
102+
Flattens JSON data. Returns a LazyFrame with columns of type `String`.
100103
"""
101-
data = list(objs)
102-
103-
schema = get_polars_schema(data=data, object_columns=object_columns)
104-
lf = (
105-
json_normalize(data, separator="_", schema=schema)
106-
.lazy()
107-
.with_columns(col(Object).map_elements(lambda x: str(x), return_dtype=String))
108-
)
104+
lf = LazyFrame()
105+
106+
for data in chunked(objs, batch_size):
107+
schema = get_polars_schema(data=data, object_columns=object_columns)
108+
response_lf = (
109+
json_normalize(data, separator="_", schema=schema)
110+
.lazy()
111+
.with_columns(
112+
col(Object).map_elements(lambda x: str(x), return_dtype=String)
113+
)
114+
)
115+
lf = concat([lf, response_lf], how="diagonal")
109116

110117
return lf

tests/test_concatenation/test_concatenation.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,29 @@
1+
import pytest
12
from polars import LazyFrame, Object, String
23
from polars.testing import assert_frame_equal
34

45
from rdw_ingestion_tools.api import concatenate_to_string_lazyframe, get_polars_schema
56

67

78
def test_get_polars_schema_empty_data():
8-
"""
9-
Tests that schemas generated for empty responses are empty dictionaries.
10-
11-
"""
9+
"""Tests that schemas generated for empty responses are empty dictionaries."""
1210
schema = get_polars_schema(object_columns=[], data=[])
1311

1412
assert schema == {}
1513

1614

1715
def test_concatenate_to_string_lazyframe_empty_response():
18-
"""
19-
Tests that concatenate_to_string_lazyframe returns an empty LazyFrame for
16+
"""Tests that concatenate_to_string_lazyframe returns an empty LazyFrame for
2017
empty response data.
21-
2218
"""
2319
lf = concatenate_to_string_lazyframe(objs=[], object_columns=[])
2420

2521
assert_frame_equal(lf, LazyFrame(schema={}))
2622

2723

2824
def test_get_polars_schema_primitive_types():
29-
"""
30-
Schemas generated from response data use type `String`
25+
"""Tests that schemas generated from response data use type `String`
3126
for all primitive types.
32-
3327
"""
3428
data = [
3529
{
@@ -57,10 +51,8 @@ def test_get_polars_schema_primitive_types():
5751

5852

5953
def test_get_polars_schema_list_types():
60-
"""
61-
Tests that generated schemas from response data use type `Object`
54+
"""Tests that generated schemas from response data use type `Object`
6255
for list columns.
63-
6456
"""
6557
data = [{"col1": [1, 2, 3], "col2": [{"key": "value"}], "col3": False}]
6658

@@ -72,10 +64,8 @@ def test_get_polars_schema_list_types():
7264

7365

7466
def test_get_polars_schema_json_types():
75-
"""
76-
Tests that generated schemas from response data with JSON columns
67+
"""Tests that generated schemas from response data with JSON columns
7768
normalise the column names in the schema.
78-
7969
"""
8070
data = [{"col1": {"key": {"inner_key": "value"}}, "col2": {"key": "value"}}]
8171

@@ -86,18 +76,19 @@ def test_get_polars_schema_json_types():
8676
assert schema == expected_schema
8777

8878

89-
def test_concatenate_to_string_lazyframe():
90-
"""
91-
Tests that response data is concatenated and normalised into LazyFrames
79+
@pytest.mark.parametrize("batch_size", [1, 2])
80+
def test_concatenate_to_string_lazyframe(batch_size):
81+
"""Tests that response data is concatenated and normalised into LazyFrames
9282
with column type `String`.
93-
9483
"""
9584
data = [
9685
{"col1": 1, "col2": [1, 2, 3], "col3": {"key": "value1"}},
9786
{"col1": 2, "col2": [1, 2, 3], "col3": {"key": "value2"}},
9887
]
9988

100-
lf = concatenate_to_string_lazyframe(objs=data, object_columns=["col2"])
89+
lf = concatenate_to_string_lazyframe(
90+
objs=data, object_columns=["col2"], batch_size=batch_size
91+
)
10192

10293
expected_lf = LazyFrame(
10394
{
@@ -108,3 +99,27 @@ def test_concatenate_to_string_lazyframe():
10899
)
109100

110101
assert_frame_equal(lf, expected_lf)
102+
103+
104+
@pytest.mark.parametrize("batch_size", [1, 2, 3])
105+
def test_concatenate_to_string_lazyframe_uses_all_rows(batch_size):
106+
"""Tests that the key names in every JSON column are used."""
107+
data = [
108+
{"column1": {"key1": "1"}},
109+
{"column2": {"key1": "1"}},
110+
{"column2": {"key1": "1", "key2": "2"}},
111+
]
112+
113+
expected_lf = LazyFrame(
114+
{
115+
"column1_key1": ["1", None, None],
116+
"column2_key1": [None, "1", "1"],
117+
"column2_key2": [None, None, "2"],
118+
}
119+
)
120+
121+
lf = concatenate_to_string_lazyframe(
122+
objs=data, object_columns=[], batch_size=batch_size
123+
)
124+
125+
assert_frame_equal(lf, expected_lf)

0 commit comments

Comments (0)