Bug/fix citation mapping (#33)

agamm · web-flow · commit 04f5ebcaa895 · 2025-08-11T13:20:10.000-05:00
* Fix citation window match in anthropic

* Fix json parsing

* Fix json fallback and tests
diff --git a/batchata/providers/anthropic/citation_mapper.py b/batchata/providers/anthropic/citation_mapper.py
@@ -170,7 +170,7 @@ def _is_field_relevant(citation_text: str, field_name: str, field_value: Any) ->
         return False
     
-    # Create a window around the value (50 chars before and after)
+    # Create a window around the value (250 chars before and after)
-    window_size = 50
+    window_size = 250
     start_pos = max(0, value_position - window_size)
     end_pos = min(len(citation_lower), value_position + window_size)
     text_window = citation_lower[start_pos:end_pos]
diff --git a/batchata/providers/anthropic/parse_results.py b/batchata/providers/anthropic/parse_results.py
@@ -163,30 +163,74 @@ def _parse_content(content: Any, job: Optional['Job']) -> Tuple[str, List[Tuple[
     return "".join(text_parts), citation_blocks
 
 
-def _extract_json_model(text: str, response_model: Type[BaseModel]) -> BaseModel | None:
-    """Extract JSON from text and parse into Pydantic model."""
-    try:
-        # First try to extract JSON from markdown code blocks
-        import re
-        code_block_pattern = r'```(?:json)?\s*\n([\s\S]*?)\n```'
-        match = re.search(code_block_pattern, text)
-        
+def _extract_json_model(text: str, response_model: Type[BaseModel]) -> BaseModel | Dict | None:
+    """Extract JSON from text and parse into Pydantic model.
+    
+    Returns:
+        - Pydantic model instance if validation succeeds
+        - Dict with raw JSON data if JSON parsing succeeds but Pydantic validation fails
+        - None if JSON extraction/parsing fails completely
+    """
+    import re
+    from pydantic import ValidationError
+    
+    json_str = None
+    
+    # Try multiple patterns to extract JSON
+    patterns = [
+        r'```json\s*([\s\S]*?)\s*```',  # More flexible: allows any whitespace
+        r'```(?:json)?\s*\n([\s\S]*?)\n```',  # Original pattern
+        r'```\s*([\s\S]*?)\s*```',  # Any code block
+    ]
+    
+    for i, pattern in enumerate(patterns):
+        match = re.search(pattern, text)
         if match:
-            json_str = match.group(1)
-        else:
-            # Fall back to finding JSON in text
-            start_idx = text.find('{')
-            end_idx = text.rfind('}') + 1
-            
-            if start_idx == -1 or end_idx <= start_idx:
-                return None
-            
-            json_str = text[start_idx:end_idx]
+            json_str = match.group(1).strip()
+            logger.debug(f"Extracted JSON using pattern {i+1}: {pattern}")
+            break
+    
+    if not json_str:
+        # Fall back to finding JSON object in text
+        start_idx = text.find('{')
+        end_idx = text.rfind('}') + 1
+        
+        if start_idx == -1 or end_idx <= start_idx:
+            logger.warning("No JSON structure found in response text")
+            return None
         
+        json_str = text[start_idx:end_idx]
+        logger.debug("Extracted JSON by finding braces")
+    
+    # Try to parse JSON
+    try:
         json_data = json.loads(json_str)
-        return response_model(**json_data)
-    except:
+        logger.debug(f"Successfully parsed JSON with keys: {list(json_data.keys())}")
+    except json.JSONDecodeError as e:
+        logger.error(f"JSON decode failed at position {e.pos}: {e.msg}")
+        logger.error(f"Attempted JSON string: {json_str[:200]}...")
         return None
+    
+    # Try to create Pydantic model
+    try:
+        model_instance = response_model(**json_data)
+        logger.debug(f"Successfully created {response_model.__name__} instance")
+        return model_instance
+    except ValidationError as e:
+        # Log validation errors but return the raw dict
+        error_details = []
+        for error in e.errors():
+            field = '.'.join(str(f) for f in error['loc'])
+            msg = error['msg']
+            error_details.append(f"{field}: {msg}")
+        
+        logger.warning(f"Pydantic validation failed for {response_model.__name__}: {'; '.join(error_details)}")
+        logger.warning(f"Returning raw JSON data instead: {list(json_data.keys())}")
+        return json_data  # Return the parsed JSON as dict
+    except Exception as e:
+        logger.error(f"Unexpected error creating {response_model.__name__}: {type(e).__name__}: {str(e)}")
+        logger.warning(f"Returning raw JSON data instead: {list(json_data.keys())}")
+        return json_data  # Return the parsed JSON as dict
 
 
 def _save_raw_response(result: Any, job_id: str, raw_files_dir: str) -> None:
diff --git a/tests/providers/anthropic/test_citation_mapper.py b/tests/providers/anthropic/test_citation_mapper.py
@@ -180,4 +180,36 @@ class ManyFieldsModel(BaseModel):
     assert len(mappings_few) == 1  # Only field1 mapped
     assert "field1" in mappings_few
     assert warning is not None
-    assert "field2, field3, field4" in warning  # Should mention unmapped fields
+    assert "field2, field3, field4" in warning  # Should mention unmapped fields
+
+
+def test_citation_window_size_prevents_false_positives():
+    """Test that a field maps only when its value appears in the citation near the field-name words (250-char window)."""
+    
+    class PropertyModel(BaseModel):
+        cap_rate: str
+        occupancy_rate: str
+    
+    # Create citation where field words appear far from actual values
+    padding = "x" * 300  # More than 250 char window
+    long_citation = (
+        f"This property analysis mentions cap rates in general. {padding}"
+        f"The actual occupancy rate is 95% according to lease data. {padding}"
+        f"Various other rate calculations are mentioned here."
+    )
+    
+    parsed = PropertyModel(
+        cap_rate="8.5%",     # This value doesn't appear in citation
+        occupancy_rate="95%" # This value DOES appear in citation
+    )
+    
+    citation = Citation(text="test", source="report.pdf", page=1)
+    citation_blocks = [(long_citation, citation)]
+    
+    mappings, warning = map_citations_to_fields(citation_blocks, parsed)
+    
+    # Should map occupancy_rate (value + field words nearby)
+    assert "occupancy_rate" in mappings
+    
+    # Should NOT map cap_rate (value "8.5%" not in citation, despite "cap" and "rate" words)
+    assert "cap_rate" not in mappings
diff --git a/tests/providers/anthropic/test_parse_results.py b/tests/providers/anthropic/test_parse_results.py
@@ -621,4 +621,33 @@ class LanguageInfo(BaseModel):
         assert citation2.text == "Python first appeared in 1991"
         assert citation2.source == "Programming Language Timeline"
         assert citation2.metadata['type'] == "historical_fact"
-        assert citation2.metadata['start_page_number'] == 15
+        assert citation2.metadata['start_page_number'] == 15
+    
+    def test_json_fallback_to_dict_on_pydantic_validation_error(self):
+        """Test that JSON parsing falls back to dict when Pydantic validation fails."""
+        from batchata.providers.anthropic.parse_results import _extract_json_model
+        
+        # Model with strict types that won't match the JSON response
+        class StrictModel(BaseModel):
+            cap_rate: float        # Expects float, gets string "7.00%"
+            occupancy: int         # Wrong field name (JSON has "occupancy_rate")
+            active: bool           # Missing from JSON
+        
+        json_response = '''
+        ```json
+        {
+          "cap_rate": "7.00%",
+          "occupancy_rate": 95,
+          "extra_field": "bonus"
+        }
+        ```
+        '''
+        
+        result = _extract_json_model(json_response, StrictModel)
+        
+        # Should return dict (fallback), not None
+        assert result is not None
+        assert isinstance(result, dict)
+        assert result["cap_rate"] == "7.00%"
+        assert result["occupancy_rate"] == 95
+        assert result["extra_field"] == "bonus"
diff --git a/uv.lock b/uv.lock