Skip to content

Commit 04f5ebc

Browse files
authored
Bug/fix citation mapping (#33)
* Fix citation window match in anthropic
* Fix json parsing
* Fix json fallback and tests
1 parent ec5de6d commit 04f5ebc

File tree

5 files changed

+129
-24
lines changed

5 files changed

+129
-24
lines changed

batchata/providers/anthropic/citation_mapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def _is_field_relevant(citation_text: str, field_name: str, field_value: Any) ->
170170
return False
171171

172172
# Create a window around the value (250 chars before and after)
173-
window_size = 50
173+
window_size = 250
174174
start_pos = max(0, value_position - window_size)
175175
end_pos = min(len(citation_lower), value_position + window_size)
176176
text_window = citation_lower[start_pos:end_pos]

batchata/providers/anthropic/parse_results.py

Lines changed: 64 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -163,30 +163,74 @@ def _parse_content(content: Any, job: Optional['Job']) -> Tuple[str, List[Tuple[
163163
return "".join(text_parts), citation_blocks
164164

165165

166-
def _extract_json_model(text: str, response_model: Type[BaseModel]) -> BaseModel | None:
167-
"""Extract JSON from text and parse into Pydantic model."""
168-
try:
169-
# First try to extract JSON from markdown code blocks
170-
import re
171-
code_block_pattern = r'```(?:json)?\s*\n([\s\S]*?)\n```'
172-
match = re.search(code_block_pattern, text)
173-
166+
def _extract_json_model(text: str, response_model: Type[BaseModel]) -> BaseModel | Dict | None:
167+
"""Extract JSON from text and parse into Pydantic model.
168+
169+
Returns:
170+
- Pydantic model instance if validation succeeds
171+
- Dict with raw JSON data if JSON parsing succeeds but Pydantic validation fails
172+
- None if JSON extraction/parsing fails completely
173+
"""
174+
import re
175+
from pydantic import ValidationError
176+
177+
json_str = None
178+
179+
# Try multiple patterns to extract JSON
180+
patterns = [
181+
r'```json\s*([\s\S]*?)\s*```', # More flexible: allows any whitespace
182+
r'```(?:json)?\s*\n([\s\S]*?)\n```', # Original pattern
183+
r'```\s*([\s\S]*?)\s*```', # Any code block
184+
]
185+
186+
for i, pattern in enumerate(patterns):
187+
match = re.search(pattern, text)
174188
if match:
175-
json_str = match.group(1)
176-
else:
177-
# Fall back to finding JSON in text
178-
start_idx = text.find('{')
179-
end_idx = text.rfind('}') + 1
180-
181-
if start_idx == -1 or end_idx <= start_idx:
182-
return None
183-
184-
json_str = text[start_idx:end_idx]
189+
json_str = match.group(1).strip()
190+
logger.debug(f"Extracted JSON using pattern {i+1}: {pattern}")
191+
break
192+
193+
if not json_str:
194+
# Fall back to finding JSON object in text
195+
start_idx = text.find('{')
196+
end_idx = text.rfind('}') + 1
197+
198+
if start_idx == -1 or end_idx <= start_idx:
199+
logger.warning("No JSON structure found in response text")
200+
return None
185201

202+
json_str = text[start_idx:end_idx]
203+
logger.debug("Extracted JSON by finding braces")
204+
205+
# Try to parse JSON
206+
try:
186207
json_data = json.loads(json_str)
187-
return response_model(**json_data)
188-
except:
208+
logger.debug(f"Successfully parsed JSON with keys: {list(json_data.keys())}")
209+
except json.JSONDecodeError as e:
210+
logger.error(f"JSON decode failed at position {e.pos}: {e.msg}")
211+
logger.error(f"Attempted JSON string: {json_str[:200]}...")
189212
return None
213+
214+
# Try to create Pydantic model
215+
try:
216+
model_instance = response_model(**json_data)
217+
logger.debug(f"Successfully created {response_model.__name__} instance")
218+
return model_instance
219+
except ValidationError as e:
220+
# Log validation errors but return the raw dict
221+
error_details = []
222+
for error in e.errors():
223+
field = '.'.join(str(f) for f in error['loc'])
224+
msg = error['msg']
225+
error_details.append(f"{field}: {msg}")
226+
227+
logger.warning(f"Pydantic validation failed for {response_model.__name__}: {'; '.join(error_details)}")
228+
logger.warning(f"Returning raw JSON data instead: {list(json_data.keys())}")
229+
return json_data # Return the parsed JSON as dict
230+
except Exception as e:
231+
logger.error(f"Unexpected error creating {response_model.__name__}: {type(e).__name__}: {str(e)}")
232+
logger.warning(f"Returning raw JSON data instead: {list(json_data.keys())}")
233+
return json_data # Return the parsed JSON as dict
190234

191235

192236
def _save_raw_response(result: Any, job_id: str, raw_files_dir: str) -> None:

tests/providers/anthropic/test_citation_mapper.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,36 @@ class ManyFieldsModel(BaseModel):
180180
assert len(mappings_few) == 1 # Only field1 mapped
181181
assert "field1" in mappings_few
182182
assert warning is not None
183-
assert "field2, field3, field4" in warning # Should mention unmapped fields
183+
assert "field2, field3, field4" in warning # Should mention unmapped fields
184+
185+
186+
def test_citation_window_size_prevents_false_positives():
187+
"""Test that window size (250 chars) prevents distant false matches."""
188+
189+
class PropertyModel(BaseModel):
190+
cap_rate: str
191+
occupancy_rate: str
192+
193+
# Create citation where field words appear far from actual values
194+
padding = "x" * 300 # More than 250 char window
195+
long_citation = (
196+
f"This property analysis mentions cap rates in general. {padding}"
197+
f"The actual occupancy rate is 95% according to lease data. {padding}"
198+
f"Various other rate calculations are mentioned here."
199+
)
200+
201+
parsed = PropertyModel(
202+
cap_rate="8.5%", # This value doesn't appear in citation
203+
occupancy_rate="95%" # This value DOES appear in citation
204+
)
205+
206+
citation = Citation(text="test", source="report.pdf", page=1)
207+
citation_blocks = [(long_citation, citation)]
208+
209+
mappings, warning = map_citations_to_fields(citation_blocks, parsed)
210+
211+
# Should map occupancy_rate (value + field words nearby)
212+
assert "occupancy_rate" in mappings
213+
214+
# Should NOT map cap_rate (value "8.5%" not in citation, despite "cap" and "rate" words)
215+
assert "cap_rate" not in mappings

tests/providers/anthropic/test_parse_results.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -621,4 +621,33 @@ class LanguageInfo(BaseModel):
621621
assert citation2.text == "Python first appeared in 1991"
622622
assert citation2.source == "Programming Language Timeline"
623623
assert citation2.metadata['type'] == "historical_fact"
624-
assert citation2.metadata['start_page_number'] == 15
624+
assert citation2.metadata['start_page_number'] == 15
625+
626+
def test_json_fallback_to_dict_on_pydantic_validation_error(self):
627+
"""Test that JSON parsing falls back to dict when Pydantic validation fails."""
628+
from batchata.providers.anthropic.parse_results import _extract_json_model
629+
630+
# Model with strict types that won't match the JSON response
631+
class StrictModel(BaseModel):
632+
cap_rate: float # Expects float, gets string "7.00%"
633+
occupancy: int # Wrong field name (JSON has "occupancy_rate")
634+
active: bool # Missing from JSON
635+
636+
json_response = '''
637+
```json
638+
{
639+
"cap_rate": "7.00%",
640+
"occupancy_rate": 95,
641+
"extra_field": "bonus"
642+
}
643+
```
644+
'''
645+
646+
result = _extract_json_model(json_response, StrictModel)
647+
648+
# Should return dict (fallback), not None
649+
assert result is not None
650+
assert isinstance(result, dict)
651+
assert result["cap_rate"] == "7.00%"
652+
assert result["occupancy_rate"] == 95
653+
assert result["extra_field"] == "bonus"

uv.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)