
Commit 3d5fbbe (1 parent: d790b51)

changes after the review: organize OCR files, change to gpt-5.1, update README, refactor sync/async parts

16 files changed: +610 −399 lines

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -43,3 +43,5 @@ build/
 # Custom
 *.db
 .ruff_cache
+ocr_parsing/files/results
+ocr_parsing/files/temp_files

README.md

Lines changed: 45 additions & 15 deletions
@@ -19,7 +19,20 @@ uv sync
 echo "OPENAI_API_KEY=your-key-here" > .env
 ```
 
-**Note**: Most examples use OpenAI's GPT-4o. Ensure your API key has appropriate permissions and sufficient quota.
+**Note**: Most examples use OpenAI's GPT-5.1. Ensure your API key has appropriate permissions and sufficient quota.
+
+## Learning Path
+
+**Recommended order for learning PydanticAI**:
+
+1. **[Direct Model Requests](direct_model_request/)** - Understand basic LLM API calls
+2. **[Temperature](temperature/)** - Understand model parameters
+3. **[Reasoning Effort](reasoning_effort/)** - See how reasoning effort can change the model's output
+4. **[Basic Sentiment](basic_sentiment/)** - Learn structured outputs with Pydantic
+5. **[Dynamic Classification](dynamic_classification/)** - Runtime schema generation
+6. **[Bielik](bielik_example/)** - Local models and tools
+7. **[History Processor](history_processor/)** - Multi-turn conversations
+8. **[OCR Parsing](ocr_parsing_demo/)** - Complex real-world document processing
 
 ## Examples Overview
 
@@ -194,6 +207,36 @@ Most examples use PydanticAI's `Agent` class, which wraps an LLM with:
 - Output type schemas for structured responses
 - Async/await support for concurrent requests
 
+### Tools
+
+Since these are examples, most of them are fairly basic. However, it is easy to add a tool to a given agent. Let's look at the **[OCR Parsing](ocr_parsing/)** code.
+
+Currently the agent does all the work itself - it classifies the document, runs the OCR, and parses the output in the same way for every document. But what if we'd like different behavior based on the document type?
+
+```python
+from pydantic_ai import Agent, RunContext
+
+# MyDeps is the dependencies dataclass defined elsewhere in the example
+from my_schemas import OCRInvoiceOutput, ReportOcrOutput
+
+# The Agent acts as a router, deciding which tool to call
+# based on the document's visual or textual cues.
+agent = Agent(
+    'openai:gpt-5.1',
+    system_prompt="Analyze the document and use the appropriate tool for parsing.",
+)
+
+@agent.tool
+async def parse_invoice(ctx: RunContext[MyDeps], data: bytes) -> OCRInvoiceOutput:
+    """Use this tool when the document is identified as an Invoice."""
+    # Your specialized OCR & validation logic here
+    return await ctx.deps.ocr_service.process(data, schema=OCRInvoiceOutput)
+
+@agent.tool
+async def parse_report(ctx: RunContext[MyDeps], data: bytes) -> ReportOcrOutput:
+    """Use this tool when the document is a multi-page Annual Report."""
+    # Custom logic for complex reports
+    return await ctx.deps.ocr_service.process(data, schema=ReportOcrOutput)
+```
+
 ### Structured Outputs
 
 Examples show how to enforce type safety using Pydantic `BaseModel`:
@@ -265,7 +308,6 @@ Bielik example shows alternative to cloud APIs:
 │   ├── 1_basic_ocr_demo.py
 │   ├── 2_ocr_with_structured_output.py
 │   ├── 3_ocr_validation.py
-│   ├── shared_fns.py
 │   ├── README.md
 │   ├── files/
 │   │   ├── samples/   # Sample PDF documents
@@ -275,18 +317,6 @@ Bielik example shows alternative to cloud APIs:
 └── README.md
 ```
 
-## Learning Path
-
-**Recommended order for learning PydanticAI**:
-
-1. **[Direct Model Requests](direct_model_request/)** - Understand basic LLM API calls
-2. **[Basic Sentiment](basic_sentiment/)** - Learn structured outputs with Pydantic
-3. **[Temperature](temperature/)** - Understand model parameters
-4. **[Dynamic Classification](dynamic_classification/)** - Runtime schema generation
-5. **[Bielik](bielik_example/)** - Local models and tools
-6. **[History Processor](history_processor/)** - Multi-turn conversations
-7. **[OCR Parsing](ocr_parsing_demo/)** - Complex real-world document processing
-
 ## Common Issues & Troubleshooting
 
 ### API Key Issues
@@ -309,7 +339,7 @@ Bielik example shows alternative to cloud APIs:
 
 - **poppler not found**: Install via your package manager (brew/apt/choco)
 - **PDF conversion fails**: Ensure PDF is valid and readable
-- **Rate limiting**: Reduce semaphore value in `shared_fns.py`
+- **Rate limiting**: Reduce semaphore value in `ocr_parsing/shared_fns.py`
 
 See individual example READMEs for specific setup requirements.

history_processor/1_basic_history_handling.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@
 def main() -> None:
     """Run basic history inspection example."""
     # Create a basic agent
-    agent = Agent(model="openai:gpt-4o", system_prompt="Be a helpful assistant")
+    agent = Agent(model="openai:gpt-5.1", system_prompt="Be a helpful assistant")
 
     # Run a single inference
     prompt = "Tell me a funny joke. Respond in plain text."

history_processor/2_continuous_history.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 def main() -> None:
     """Run multi-turn conversation example."""
     # Create agent
-    agent = Agent(model="openai:gpt-4o", system_prompt="Be a helpful assistant")
+    agent = Agent(model="openai:gpt-5.1", system_prompt="Be a helpful assistant")
 
     # First turn: Agent generates a joke
     prompt_1 = "Provide a really, really funny joke. Respond in plain text."

history_processor/3_history_usage.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@
 def main() -> None:
     """Run multi-turn conversation with persistence example."""
     # Create agent
-    agent = Agent(model="openai:gpt-4o", system_prompt="Be a helpful assistant")
+    agent = Agent(model="openai:gpt-5.1", system_prompt="Be a helpful assistant")
 
     # Turn 1: Get initial motto
     log.info("\n=== Turn 1 ===")

history_processor/4_history_filtering.py

Lines changed: 2 additions & 2 deletions
@@ -59,13 +59,13 @@ def main() -> None:
 
     # Example 1: Summarize only user messages
     log.info("\n=== Filtering: User Messages Only ===")
-    agent_user = Agent("openai:gpt-4o", history_processors=[user_message_filter])
+    agent_user = Agent("openai:gpt-5.1", history_processors=[user_message_filter])
     result_1 = agent_user.run_sync("Please summarize the whole chat history until now.", message_history=history)
     log.info(f"Summary (user messages only):\n{result_1.output}")
 
     # Example 2: Attempt to filter only model messages (will fail)
     log.info("\n=== Filtering: Model Messages Only ===")
-    agent_model = Agent("openai:gpt-4o", history_processors=[model_message_filter])
+    agent_model = Agent("openai:gpt-5.1", history_processors=[model_message_filter])
     try:
         result_2 = agent_model.run_sync("Please summarize the whole chat history until now.", message_history=history)
         log.info(f"Summary (model messages only):\n{result_2.output}")
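The `user_message_filter` passed as a history processor above can be sketched in isolation. This is a hedged approximation: plain dicts stand in for PydanticAI's `ModelMessage` objects, which is what the real processor receives.

```python
# Sketch of a history processor like user_message_filter in
# 4_history_filtering.py. Plain dicts stand in for ModelMessage objects.

def user_message_filter(history: list[dict]) -> list[dict]:
    """Keep only user-authored messages before they reach the model."""
    return [msg for msg in history if msg["role"] == "user"]

history = [
    {"role": "user", "content": "Tell me a joke."},
    {"role": "model", "content": "Why did the chicken cross the road?"},
    {"role": "user", "content": "Another one, please."},
]

filtered = user_message_filter(history)
# filtered keeps only the two user messages, in their original order
```

A processor that keeps only model messages fails in the real example because the remaining history no longer starts with a user request, which the API rejects.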

history_processor/5a_history_length_fixed.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ def main() -> None:
 
     # Create agent with message count limiter
     log.info("\n=== Agent with Fixed Message Limit (last 3) ===")
-    agent_1 = Agent("openai:gpt-4o", history_processors=[keep_last_messages])
+    agent_1 = Agent("openai:gpt-5.1", history_processors=[keep_last_messages])
     result_1 = agent_1.run_sync("What were we talking about?", message_history=history)
     log.info(f"Answer (with truncated history):\n{result_1.output}")
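The fixed-limit truncation that `keep_last_messages` performs can be sketched as a plain slice. Again a hedged stand-in: strings replace PydanticAI's `ModelMessage` objects, and the `limit` default is illustrative, not taken from the example.

```python
# Sketch of a fixed-size history limiter like keep_last_messages in
# 5a_history_length_fixed.py. Strings stand in for ModelMessage objects.

def keep_last_messages(messages: list[str], limit: int = 3) -> list[str]:
    """Return only the most recent `limit` messages."""
    return messages[-limit:]

history = ["msg 1", "msg 2", "msg 3", "msg 4", "msg 5"]
trimmed = keep_last_messages(history)
# trimmed holds the last three messages
```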

history_processor/5b_history_length_dynamic.py

Lines changed: 3 additions & 3 deletions
@@ -23,7 +23,7 @@
 # `tiktoken` is used for OpenAI models, therefore if you're going to
 # use a different model provider, this bit will need to be changed
 # to a different tokenizer corresponding to the model used
-tokenizer = tiktoken.encoding_for_model("gpt-4o")
+tokenizer = tiktoken.encoding_for_model("gpt-5.1")
 
 
 @dataclass
@@ -58,7 +58,7 @@ def estimate_tokens(messages: list[ModelMessage]) -> int:
 # of this example, the threshold is set low for the logic to trigger. Usually,
 # this value is much bigger and corresponds to the used model's context
 # window size. To fully utilize model processing capabilities it is best to
-# set this value close to the context size. For the `gpt-4o` model this value is
+# set this value close to the context size. For the `gpt-5.1` model this value is
 # equal to 128_000 tokens
 
 
@@ -100,7 +100,7 @@ def main() -> None:
 
     log.info("\n=== Agent with Dynamic Token-Based Context Guard ===")
     agent_2 = Agent(
-        "openai:gpt-4o",
+        "openai:gpt-5.1",
         deps_type=MemoryState,
         history_processors=[context_guard],
         system_prompt="You are a helpful and concise assistant.",
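The token-budget guard this file builds can be sketched without tiktoken. This is a rough approximation: a whitespace tokenizer stands in for `tiktoken.encoding_for_model(...)`, strings stand in for `ModelMessage` objects, and the low `max_tokens` default exists only so the trimming logic triggers, mirroring the example's deliberately low threshold.

```python
# Rough sketch of the context guard from 5b_history_length_dynamic.py.
# A crude whitespace tokenizer replaces tiktoken for self-containment.

def estimate_tokens(messages: list[str]) -> int:
    # Word count as a stand-in for real token counting.
    return sum(len(m.split()) for m in messages)

def context_guard(messages: list[str], max_tokens: int = 6) -> list[str]:
    """Drop the oldest messages until the history fits the token budget."""
    trimmed = list(messages)
    while trimmed and estimate_tokens(trimmed) > max_tokens:
        trimmed.pop(0)  # discard the oldest message first
    return trimmed

history = ["first long message here", "second message", "third short one"]
kept = context_guard(history)
# the 4-word first message is dropped; the remaining 5 words fit the budget
```

In practice `max_tokens` is set close to the model's context window rather than to a tiny demo value.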

history_processor/5c_history_with_tools.py

Lines changed: 1 addition & 1 deletion
@@ -78,7 +78,7 @@ def run_conversation_with_history_processor(history_processor: Callable[..., lis
     log.info(f"\n=== Running with history processor: {processor_name} ===")
 
     # Create agent with history processor
-    agent = Agent("openai:gpt-4o", system_prompt="You are a helpful and playful assistant", history_processors=[history_processor])
+    agent = Agent("openai:gpt-5.1", system_prompt="You are a helpful and playful assistant", history_processors=[history_processor])
 
     # Add basic tool
     @agent.tool

history_processor/6_persistent_history.py

Lines changed: 2 additions & 2 deletions
@@ -35,7 +35,7 @@ class ConversationRecord(Base):
         id: Unique identifier for the record
         question: User prompt/question
         answer: Agent response
-        model_used: Model identifier (e.g., "gpt-4o")
+        model_used: Model identifier (e.g., "gpt-5.1")
         usage: Token usage metadata (input, output, total tokens)
     """
 
@@ -110,7 +110,7 @@ def main() -> None:
     """Run database persistence example."""
     # Initialize agent
     log.info("=== Initializing Agent ===")
-    agent = Agent("openai:gpt-4o", system_prompt=("You are a helpful assistant. Respond concisely and clearly."))
+    agent = Agent("openai:gpt-5.1", system_prompt=("You are a helpful assistant. Respond concisely and clearly."))
 
     # Run conversation and save to database
     log.info("\n=== Running Conversation ===")
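The persistence idea behind `ConversationRecord` can be sketched with stdlib `sqlite3`. This is not the example's code (which uses SQLAlchemy): the table name, `save_turn` helper, and in-memory database are illustrative, with only the column names mirroring the record's documented fields.

```python
import sqlite3

# Hypothetical stdlib version of the ConversationRecord persistence:
# each turn stores question, answer, and the model identifier.
conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE conversation_record ("
    "id INTEGER PRIMARY KEY, question TEXT, answer TEXT, model_used TEXT)"
)

def save_turn(question: str, answer: str, model_used: str) -> None:
    # Persist one question/answer pair along with the model used.
    conn.execute(
        "INSERT INTO conversation_record (question, answer, model_used) "
        "VALUES (?, ?, ?)",
        (question, answer, model_used),
    )
    conn.commit()

save_turn("What is PydanticAI?", "An agent framework.", "gpt-5.1")
rows = conn.execute(
    "SELECT question, model_used FROM conversation_record"
).fetchall()
```

The real example additionally stores token-usage metadata per turn; an extra `usage` column would cover that.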
