Commit b8d2beb

feat: adding conditional gleaning (#375)
* fix: improve caching and don't raise error for bad gather configs
* fix: improve caching and don't raise error for bad gather configs
* feat: adding conditional gleaning
1 parent 7071ade commit b8d2beb

File tree: 3 files changed (+63 -10 lines)


docetl/operations/utils/api.py

Lines changed: 32 additions & 0 deletions
@@ -239,6 +239,9 @@ def _cached_call_llm(
         )
 
         for rnd in range(num_gleaning_rounds):
+            # Break early if gleaning condition is not met
+            if not self.should_glean(gleaning_config, parsed_output):
+                break
             # Prepare validator prompt
             validator_prompt = strict_render(
                 gleaning_config["validation_prompt"],
@@ -963,3 +966,32 @@ def validate_output(self, operation: Dict, output: Dict, console: Console) -> bo
             console.log(f"[yellow]Output:[/yellow] {output}")
             return False
         return True
+
+    def should_glean(self, gleaning_config: Optional[Dict[str, Any]], output: Dict[str, Any]) -> bool:
+        """Determine whether to execute a gleaning round based on an optional conditional expression.
+
+        If ``gleaning_config`` contains an ``"if"`` key, its value is treated as a Python
+        boolean expression that is evaluated with the current ``output`` bound to the
+        name ``output`` using :func:`safe_eval`. When the expression evaluates to
+        ``True``, the gleaning round proceeds; if it evaluates to ``False`` (or raises an
+        exception), the gleaning loop terminates early.
+
+        If no ``"if"`` key is present, the method defaults to returning ``True`` so that
+        gleaning proceeds normally.
+        """
+        # No gleaning_config or no conditional -> always glean
+        if not gleaning_config or "if" not in gleaning_config:
+            return True
+
+        condition = gleaning_config.get("if")
+        if not isinstance(condition, str):
+            raise ValueError(f"Invalid gleaning condition (should be a string): {condition}")
+
+        try:
+            return safe_eval(condition, output)
+        except Exception as exc:
+            # If evaluation fails, default to not gleaning and log for visibility
+            self.runner.console.log(
+                f"[bold red]Error evaluating gleaning condition '{condition}': {exc}; skipping remaining gleaning rounds[/bold red]"
+            )
+            return False
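The gating logic above depends on DocETL's `safe_eval`, whose implementation is not part of this diff. A minimal self-contained sketch of the same behavior, with a hypothetical restricted-`eval` stand-in for `safe_eval` (the names `restricted_eval` and the exposed `len` helper are assumptions for illustration), looks like this:

```python
from typing import Any, Dict, Optional


def restricted_eval(expr: str, output: Dict[str, Any]) -> bool:
    """Hypothetical stand-in for DocETL's safe_eval: evaluate `expr` with
    builtins disabled and only `output` (plus `len`) in scope."""
    return bool(eval(expr, {"__builtins__": {}, "len": len}, {"output": output}))


def should_glean(gleaning_config: Optional[Dict[str, Any]], output: Dict[str, Any]) -> bool:
    """Mirror of the method in the diff: default to gleaning unless an "if"
    expression is present and evaluates falsy (or raises)."""
    if not gleaning_config or "if" not in gleaning_config:
        return True
    condition = gleaning_config["if"]
    if not isinstance(condition, str):
        raise ValueError(f"Invalid gleaning condition (should be a string): {condition}")
    try:
        return restricted_eval(condition, output)
    except Exception:
        # Evaluation error -> skip further gleaning, matching the diff's behavior
        return False


print(should_glean({"if": "len(output['summary']) < 10"}, {"summary": "short"}))  # True: 5 < 10
print(should_glean({"if": "len(output['summary']) < 10"}, {"summary": "a much longer summary"}))  # False
```

Note that a failing expression (for example, one referencing a missing output field) quietly disables further rounds rather than crashing the pipeline, while a non-string condition is treated as a configuration error and raised.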

docs/concepts/operators.md

Lines changed: 14 additions & 9 deletions
@@ -155,11 +155,8 @@ To enable gleaning, specify:
 
 - `validation_prompt`: Instructions for the LLM to evaluate and improve the output.
 - `num_rounds`: The maximum number of refinement iterations.
-<<<<<<< HEAD
 - `model` (optional): The model to use for the LLM executing the validation prompt. Defaults to the model specified for this operation. **Note that if the validator LLM determines the output needs to be improved, the final output will be generated by the model specified for this operation.**
-=======
-- `model` (optional): The model to use for the LLM executing the validation prompt. Defaults to the model specified for that operation.
->>>>>>> 070110d (docs: improve gleaning description)
+- `if` (optional): A Python boolean expression (evaluated with `safe_eval`) that refers to **fields in the current `output`**. If the expression evaluates to `False`, DocETL skips gleaning entirely.
 
 Example:
 
@@ -189,19 +186,27 @@ Example map operation (with a different model for the validation prompt):
     schema:
       insights_summary: "string"
     gleaning:
-      num_rounds: 2 # Will refine the output up to 2 times, if the judge LLM (gpt-4o-mini) suggests improvements
+      if: "len(output['insights_summary']) < 10" # Only refine if summary is too short
+      num_rounds: 2 # Will refine up to 2 times if needed
       model: gpt-4o-mini
       validation_prompt: |
         There should be at least 2 insights, and each insight should have at least 1 supporting action.
 ```
 
 !!! tip "Choosing a Different Model for Validation"
 
-<<<<<<< HEAD
-    In the example above, the `gpt-4o` model is used to generate the main outputs, while the `gpt-4o-mini` model is used only for the validation and refinement steps. This means the more powerful (and expensive) model produces the final output, but a less expensive model handles the iterative validation, helping to reduce costs without sacrificing output quality.
-=======
     You may want to use a different model for the validation prompt. For example, you can use a more powerful (and expensive) model for generating outputs, but a cheaper model for validation—especially if the validation only checks a single aspect. This approach helps reduce costs while still ensuring quality, since the final output is always produced by the more capable model.
->>>>>>> 070110d (docs: improve gleaning description)
+
+!!! tip "Conditional Gleaning"
+
+    You can also use the `if` field to conditionally skip gleaning. For example, to glean only when the output is too short:
+    ```yaml
+    gleaning:
+      if: "len(output['insights_summary']) < 10"
+      num_rounds: 2
+    ```
+
+    If the `if` expression evaluates to `False`, DocETL skips gleaning entirely; if the `if` field is absent, DocETL always gleans.
 
 ### How Gleaning Works
 
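Putting the documented `if` field together with `num_rounds`: the condition is re-checked before every round, so refinement stops as soon as the output satisfies it. A minimal sketch of this control flow, with a hypothetical `refine` callback standing in for the validator-and-refinement LLM calls and plain `eval` standing in for `safe_eval` (both assumptions, not DocETL's actual API):

```python
from typing import Any, Callable, Dict


def run_gleaning(output: Dict[str, Any], gleaning_config: Dict[str, Any],
                 refine: Callable[[Dict[str, Any]], Dict[str, Any]]) -> Dict[str, Any]:
    """Sketch of the documented behavior: glean up to num_rounds times,
    re-checking the optional "if" expression before each round."""
    for _ in range(gleaning_config.get("num_rounds", 1)):
        condition = gleaning_config.get("if")
        if condition is not None and not eval(  # stand-in for safe_eval
            condition, {"__builtins__": {}, "len": len}, {"output": output}
        ):
            break  # condition no longer holds -> stop refining
        output = refine(output)
    return output


# Hypothetical refiner: each round lengthens the summary a bit.
refine = lambda out: {"insights_summary": out["insights_summary"] + " more"}

config = {"if": "len(output['insights_summary']) < 10", "num_rounds": 2}
result = run_gleaning({"insights_summary": "hi"}, config, refine)
print(result["insights_summary"])  # -> hi more more
```

With the config above, an output that already satisfies the length check is returned untouched, paying for zero extra LLM calls; short outputs are refined until the condition fails or `num_rounds` is exhausted.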
tests/basic/test_basic_map.py

Lines changed: 17 additions & 1 deletion
@@ -505,4 +505,20 @@ def test_map_operation_calibration_with_larger_sample(simple_map_config, map_sam
     )
 
     # Verify that cost is greater than 0
-    assert cost > 0
+    assert cost > 0
+
+def test_should_glean_condition(api_wrapper):
+    """Unit-test the conditional gleaning logic on DSLRunner.api.should_glean."""
+
+    wrapper = api_wrapper.api  # APIWrapper instance attached to the runner
+
+    # Case 1: condition evaluates to True
+    gleaning_config = {"if": "output['flag'] == True"}
+    assert wrapper.should_glean(gleaning_config, {"flag": True}) is True
+
+    # Case 2: condition evaluates to False
+    assert wrapper.should_glean(gleaning_config, {"flag": False}) is False
+
+    # Case 3: No condition key -> default to True
+    assert wrapper.should_glean({}, {"flag": False}) is True
+    assert wrapper.should_glean(None, {"flag": False}) is True
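The new test covers the true/false/absent cases but not two edge cases visible in the implementation: a non-string condition raises `ValueError`, and an expression that raises during evaluation disables gleaning. A standalone sketch of those checks (using an illustrative reimplementation of `should_glean` with restricted `eval` in place of `safe_eval`, since the real method needs a runner instance):

```python
from typing import Any, Dict, Optional


def should_glean(gleaning_config: Optional[Dict[str, Any]], output: Dict[str, Any]) -> bool:
    # Illustrative reimplementation of the method under test.
    if not gleaning_config or "if" not in gleaning_config:
        return True
    condition = gleaning_config["if"]
    if not isinstance(condition, str):
        raise ValueError(f"Invalid gleaning condition (should be a string): {condition}")
    try:
        return bool(eval(condition, {"__builtins__": {}, "len": len}, {"output": output}))
    except Exception:
        return False


# Edge case 1: a non-string condition is a configuration error.
try:
    should_glean({"if": 123}, {})
    raised = False
except ValueError:
    raised = True
print(raised)  # True

# Edge case 2: an expression that raises (missing key) disables gleaning.
print(should_glean({"if": "output['missing'] > 0"}, {}))  # False
```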
