Commit 6c52210

Optimize Genie Parse Result (#120)
1 parent 05f2a9d commit 6c52210

2 files changed: 112 additions, 9 deletions


src/databricks_ai_bridge/genie.py

Lines changed: 28 additions & 7 deletions
@@ -1,3 +1,4 @@
+import bisect
 import logging
 import time
 from dataclasses import dataclass
@@ -59,15 +60,35 @@ def _parse_query_result(resp) -> Union[str, pd.DataFrame]:

         rows.append(row)

-    query_result = pd.DataFrame(rows, columns=header).to_markdown()
-
+    dataframe = pd.DataFrame(rows, columns=header)
+    query_result = dataframe.to_markdown()
     tokens_used = _count_tokens(query_result)
-    while tokens_used > MAX_TOKENS_OF_DATA:
-        rows.pop()
-        query_result = pd.DataFrame(rows, columns=header).to_markdown()
-        tokens_used = _count_tokens(query_result)

-    return query_result.strip() if query_result else query_result
+    # If the full result fits, return it
+    if tokens_used <= MAX_TOKENS_OF_DATA:
+        return query_result.strip()
+
+    def is_too_big(n):
+        return _count_tokens(dataframe.iloc[:n].to_markdown()) > MAX_TOKENS_OF_DATA
+
+    # Use bisect_left to find the row cutoff that stays within the max token limit in O(log n) steps.
+    # True is the search target: bisect_left returns the first n for which is_too_big(n) is True.
+    cutoff = bisect.bisect_left(range(len(dataframe) + 1), True, key=is_too_big)
+
+    # Slice to the found limit
+    truncated_df = dataframe.iloc[:cutoff]
+
+    # Edge case: no rows fit within the token limit, so return an empty string
+    if len(truncated_df) == 0:
+        return ""
+
+    truncated_result = truncated_df.to_markdown()
+
+    # Double-check in case the cutoff overshot by one row
+    if _count_tokens(truncated_result) > MAX_TOKENS_OF_DATA:
+        truncated_result = truncated_df.iloc[:-1].to_markdown()
+
+    return truncated_result.strip()


 class Genie:
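
The new code swaps the old trimming loop, which dropped one row at a time and re-rendered the whole DataFrame after every drop (up to O(n) full renders), for a binary search over the row count. Because adding rows can only make the markdown longer, `is_too_big(n)` flips from False to True exactly once, so `bisect.bisect_left` can locate that flip point in about log2(n) renders. Below is a self-contained sketch of the same technique; the token counter and budget are illustrative stand-ins, not the library's real `_count_tokens` or `MAX_TOKENS_OF_DATA`:

```python
import bisect

import pandas as pd  # to_markdown() also requires the tabulate package

MAX_TOKENS_OF_DATA = 100  # illustrative budget, not the library's value


def count_tokens(text: str) -> int:
    # Stand-in tokenizer (~4 characters per token); the real _count_tokens
    # presumably wraps an actual tokenizer.
    return len(text) // 4


df = pd.DataFrame({"id": range(200), "name": ["x" * 10] * 200})


def is_too_big(n: int) -> bool:
    # Monotonic predicate: if n rows overflow the budget, so do n + 1,
    # since more rows can only lengthen the markdown.
    return count_tokens(df.iloc[:n].to_markdown()) > MAX_TOKENS_OF_DATA


# bisect_left with a key (Python 3.10+) finds the first n in 0..len(df)
# where is_too_big flips to True, using O(log n) renders instead of O(n).
first_too_big = bisect.bisect_left(range(len(df) + 1), True, key=is_too_big)

# first_too_big rows already overflow, so the largest fitting prefix is
# one row shorter (clamped at zero for the nothing-fits edge case).
fitting = df.iloc[: max(first_too_big - 1, 0)]
print(len(fitting), count_tokens(fitting.to_markdown()))
```

The committed code reaches the same answer slightly differently: it slices to the bisect result, then drops one more row if the render still overflows, and returns an empty string when nothing fits.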

tests/databricks_ai_bridge/test_genie.py

Lines changed: 84 additions & 2 deletions
@@ -1,4 +1,6 @@
-from datetime import datetime
+import random
+from datetime import datetime, timedelta
+from io import StringIO
 from unittest.mock import patch

 import pandas as pd
@@ -186,7 +188,7 @@ def test_parse_query_result_with_null_values():
     assert result == expected_df.to_markdown()


-def test_parse_query_result_trims_large_data():
+def test_parse_query_result_trims_data():
     # patch MAX_TOKENS_OF_DATA to 100 for this test
     with patch("databricks_ai_bridge.genie.MAX_TOKENS_OF_DATA", 100):
         resp = {
@@ -232,6 +234,86 @@ def test_parse_query_result_trims_large_data():
         assert _count_tokens(result) <= 100


+def markdown_to_dataframe(markdown_str: str) -> pd.DataFrame:
+    if markdown_str == "":
+        return pd.DataFrame()
+
+    lines = markdown_str.strip().splitlines()
+
+    # Remove Markdown separator row (2nd line)
+    lines = [line.strip().strip("|") for i, line in enumerate(lines) if i != 1]
+
+    # Re-join cleaned lines and parse
+    cleaned_markdown = "\n".join(lines)
+    df = pd.read_csv(StringIO(cleaned_markdown), sep="|")
+
+    # Strip whitespace from column names and values
+    df.columns = [col.strip() for col in df.columns]
+    df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
+
+    # Drop the first column
+    df = df.drop(columns=[df.columns[0]])
+
+    return df
+
+
+@pytest.mark.parametrize("max_tokens", [1, 100, 1000, 2000, 8000, 10000, 15000, 19000, 100000])
+def test_parse_query_result_trims_large_data(max_tokens):
+    """
+    Ensure _parse_query_result trims output to stay within token limits.
+    """
+    with patch("databricks_ai_bridge.genie.MAX_TOKENS_OF_DATA", max_tokens):
+        base_date = datetime(2023, 1, 1)
+        names = ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack"]
+
+        data_array = [
+            [
+                str(i + 1),
+                random.choice(names),
+                (base_date + timedelta(days=random.randint(0, 365))).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            ]
+            for i in range(1000)
+        ]
+
+        response = {
+            "manifest": {
+                "schema": {
+                    "columns": [
+                        {"name": "id", "type_name": "INT"},
+                        {"name": "name", "type_name": "STRING"},
+                        {"name": "created_at", "type_name": "TIMESTAMP"},
+                    ]
+                }
+            },
+            "result": {"data_array": data_array},
+        }
+
+        markdown_result = _parse_query_result(response)
+        result_df = markdown_to_dataframe(markdown_result)
+
+        expected_df = pd.DataFrame(
+            {
+                "id": [int(row[0]) for row in data_array],
+                "name": [row[1] for row in data_array],
+                "created_at": [
+                    datetime.strptime(row[2], "%Y-%m-%dT%H:%M:%SZ").date() for row in data_array
+                ],
+            }
+        )
+
+        expected_markdown = (
+            "" if len(result_df) == 0 else expected_df[: len(result_df)].to_markdown()
+        )
+        # Ensure result matches expected subset and respects token limit
+        assert markdown_result == expected_markdown
+        assert _count_tokens(markdown_result) <= max_tokens
+        # Ensure adding one more row would exceed token limit or we're at full length
+        next_row_exceeds = (
+            _count_tokens(expected_df.iloc[: len(result_df) + 1].to_markdown()) > max_tokens
+        )
+        assert len(result_df) == len(expected_df) or next_row_exceeds
+
+
 def test_poll_query_results_max_iterations(genie, mock_workspace_client):
     # patch MAX_ITERATIONS to 2 for this test and sleep to avoid delays
     with (
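
For reference, the round trip that the new `markdown_to_dataframe` test helper performs can be sketched in isolation. `DataFrame.to_markdown()` emits a pipe-delimited table whose second line is the `|---|` separator and whose first column is the rendered index; stripping those leaves something `pd.read_csv` can parse back. A minimal sketch with made-up values (the two-row table below is not the test's data):

```python
from io import StringIO

import pandas as pd

original = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
markdown = original.to_markdown()

# Drop the |---| separator (second line) and the outer pipes on each line.
lines = [
    line.strip().strip("|")
    for i, line in enumerate(markdown.splitlines())
    if i != 1
]

# What remains is pipe-separated values that read_csv can handle.
parsed = pd.read_csv(StringIO("\n".join(lines)), sep="|")
parsed.columns = [col.strip() for col in parsed.columns]

# The first parsed column is the index that to_markdown() rendered; drop it.
parsed = parsed.drop(columns=[parsed.columns[0]])

# Cells keep their column padding, so normalize before comparing.
assert parsed["id"].astype(int).tolist() == [1, 2]
assert parsed["name"].str.strip().tolist() == ["Alice", "Bob"]
```

The parametrized test uses the helper only to count surviving rows; its final asserts then pin down maximality: either every row survived, or rendering one more row would exceed `max_tokens`.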
