agamm
diff --git a/‎README.md‎
Lines changed: 4 additions & 0 deletions b/‎README.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎batchata/core/batch.py‎
Lines changed: 7 additions & 1 deletion b/‎batchata/core/batch.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎batchata/core/batch_run.py‎
Lines changed: 78 additions & 1 deletion b/‎batchata/core/batch_run.py‎
Lines changed: 78 additions & 1 deletion
diff --git a/‎batchata/providers/anthropic/anthropic.py‎
Lines changed: 27 additions & 9 deletions b/‎batchata/providers/anthropic/anthropic.py‎
Lines changed: 27 additions & 9 deletions
diff --git a/‎batchata/utils/pdf.py‎
Lines changed: 119 additions & 3 deletions b/‎batchata/utils/pdf.py‎
Lines changed: 119 additions & 3 deletions
@@ -16,6 +16,7 @@ AI providers offer batch APIs that process requests asynchronously at 50% reduce
 
 - Native batch processing (50% cost savings via provider APIs)
 - Set `max_cost_usd` limits for batch requests
+- **Dry run mode** for cost estimation and job planning
 - Time limit control with `.add_time_limit(seconds=, minutes=, hours=)`
 - State persistence in case of network interruption
 - Structured output `.json` format with Pydantic models
@@ -50,6 +51,9 @@ for file in files:
 run = batch.run()
 
 results = run.results()  # {"completed": [JobResult], "failed": [JobResult], "cancelled": [JobResult]}
+
+# Or preview costs first with dry run
+run = batch.run(dry_run=True)  # Shows cost estimates without executing
 ```
 
 ## Complete Example
 
@@ -9,6 +9,7 @@
 from pydantic import BaseModel
 
 from .batch_params import BatchParams
+from .batch_run import BatchRun
 from .job import Job
 from ..providers import get_provider
 from ..types import Message
@@ -313,7 +314,7 @@ def add_job(
         self.jobs.append(job)
         return self
 
-    def run(self, on_progress: Optional[Callable[[Dict, float, Dict], None]] = None, progress_interval: float = 1.0, print_status: bool = False) -> 'BatchRun':
+    def run(self, on_progress: Optional[Callable[[Dict, float, Dict], None]] = None, progress_interval: float = 1.0, print_status: bool = False, dry_run: bool = False) -> 'BatchRun':
         """Execute the batch.
         
         Creates a BatchRun instance and executes the jobs synchronously.
@@ -323,6 +324,7 @@ def run(self, on_progress: Optional[Callable[[Dict, float, Dict], None]] = None,
                         (stats_dict, elapsed_time_seconds, batch_data)
             progress_interval: Interval in seconds between progress updates (default: 1.0)
             print_status: Whether to show rich progress display (default: False)
+            dry_run: If True, only show cost estimation without executing (default: False)
             
         Returns:
             BatchRun instance with completed results
@@ -339,6 +341,10 @@ def run(self, on_progress: Optional[Callable[[Dict, float, Dict], None]] = None,
         # Create and start the run
         run = BatchRun(self.config, self.jobs)
 
+        # Handle dry run mode
+        if dry_run:
+            return run.dry_run()
+        
         # Set progress callback - either rich display or custom callback
         if print_status:
             return self._run_with_rich_display(run, progress_interval)
 
@@ -826,4 +826,81 @@ def _create_cancelled_results(self) -> List[JobResult]:
 
     def shutdown(self):
         """Shutdown (no-op for synchronous execution)."""
-        pass
+        pass
+    
+    def dry_run(self) -> 'BatchRun':
+        """Log cost estimates and the execution plan without executing any job.
+
+        When ``config.reuse_state`` is set, saved state is first loaded through
+        the state manager. Jobs whose id is already in ``completed_results``
+        are excluded from the estimate. As a side effect ``self.pending_jobs``
+        is replaced with the remaining jobs.
+
+        Returns:
+            Self for chaining (doesn't actually execute jobs)
+        """
+        logger.info("=== DRY RUN MODE ===")
+        logger.info("This will show cost estimates without executing jobs")
+
+        # Load existing state if reuse_state=True
+        if self.config.reuse_state:
+            self.state_manager.load_state(self)
+
+        # Filter out completed jobs from previous runs
+        self.pending_jobs = [job for job in self.jobs.values() if job.id not in self.completed_results]
+
+        if not self.pending_jobs:
+            logger.info("No pending jobs to analyze (all jobs already completed)")
+            return self
+
+        logger.info(f"Analyzing {len(self.pending_jobs)} pending jobs...")
+
+        # Group jobs by provider and analyze costs
+        items_per_batch = self.config.items_per_batch
+        provider_groups = self._group_jobs_by_provider()
+        total_estimated_cost = 0.0
+        total_batches = 0
+
+        logger.info("\nJob breakdown:")
+        for provider_name, jobs in provider_groups.items():
+            provider = get_provider(jobs[0].model)
+            logger.info(f"\n{provider_name} ({len(jobs)} jobs):")
+
+            job_batches = [jobs[i:i + items_per_batch]
+                           for i in range(0, len(jobs), items_per_batch)]
+            # Count the chunks actually built so the reported plan can never
+            # drift from the chunking used for the estimates above.
+            total_batches += len(job_batches)
+
+            for batch_idx, batch_jobs in enumerate(job_batches, 1):
+                estimated_cost = provider.estimate_cost(batch_jobs)
+                total_estimated_cost += estimated_cost
+
+                logger.info(f"  Batch {batch_idx}: {len(batch_jobs)} jobs, estimated cost: ${estimated_cost:.4f}")
+                for job in batch_jobs:
+                    source = job.file.name if job.file else "direct messages"
+                    logger.info(f"    - {job.id}: {source} (citations: {job.enable_citations})")
+
+        # Show cost summary
+        logger.info("\n=== COST SUMMARY ===")
+        logger.info(f"Total estimated cost: ${total_estimated_cost:.4f}")
+
+        cost_limit = self.config.cost_limit_usd
+        # Compare against None (not truthiness) so an explicit limit of 0 is
+        # still reported as a limit instead of "No cost limit set".
+        if cost_limit is None:
+            logger.info("No cost limit set")
+        else:
+            logger.info(f"Cost limit: ${cost_limit:.2f}")
+            if total_estimated_cost > cost_limit:
+                excess = total_estimated_cost - cost_limit
+                logger.warning(f"⚠️ Estimated cost exceeds limit by ${excess:.4f}")
+            else:
+                remaining = cost_limit - total_estimated_cost
+                logger.info(f"✅ Within cost limit (${remaining:.4f} remaining)")
+
+        # Show execution plan
+        logger.info("\n=== EXECUTION PLAN ===")
+        logger.info(f"Total batches to process: {total_batches}")
+        logger.info(f"Max parallel batches: {self.config.max_parallel_batches}")
+        logger.info(f"Items per batch: {items_per_batch}")
+        logger.info(f"Results directory: {self.config.results_dir}")
+
+        logger.info("\n=== DRY RUN COMPLETE ===")
+        logger.info("To execute for real, call run() without dry_run=True")
+
+        return self
+    
@@ -238,13 +238,30 @@ def estimate_cost(self, jobs: List[Job]) -> float:
                 if system_prompt:
                     full_text += system_prompt + "\n\n"
 
-                for msg in messages:
-                    role = msg.get("role", "")
-                    content = msg.get("content", "")
-                    full_text += f"{role}: {content}\n\n"
-                
-                # Estimate tokens using Claude-specific estimator
-                input_tokens = token_count_simple(full_text)
+                # Handle PDF files specially
+                if job.file and job.file.suffix.lower() == '.pdf':
+                    from ...utils.pdf import estimate_pdf_tokens
+                    input_tokens = estimate_pdf_tokens(job.file, job.prompt)
+                    logger.debug(f"Job {job.id}: Estimated {input_tokens} tokens for PDF")
+                else:
+                    # Normal message handling
+                    for msg in messages:
+                        role = msg.get("role", "")
+                        content = msg.get("content", "")
+                        # Handle content that might be a list (for multimodal messages)
+                        if isinstance(content, list):
+                            for part in content:
+                                if isinstance(part, dict) and part.get("type") == "text":
+                                    full_text += f"{role}: {part.get('text', '')}\n\n"
+                        else:
+                            full_text += f"{role}: {content}\n\n"
+                    
+                    # Add prompt if it's a file-based job
+                    if job.prompt:
+                        full_text += f"\nUser prompt: {job.prompt}\n"
+                    
+                    # Estimate tokens using Claude-specific estimator
+                    input_tokens = token_count_simple(full_text)
 
                 # Calculate costs using tokencost with actual Claude model
                 input_cost = float(calculate_cost_by_tokens(
@@ -264,7 +281,7 @@ def estimate_cost(self, jobs: List[Job]) -> float:
                 discount = model_config.batch_discount if model_config else 0.5
                 job_cost = (input_cost + output_cost) * discount
 
-                logger.info(
+                logger.debug(
                     f"Job {job.id}: ~{input_tokens} input tokens, "
                     f"{job.max_tokens} max output tokens, "
                     f"cost: ${job_cost:.6f} (with {int(discount*100)}% batch discount)"
@@ -276,4 +293,5 @@ def estimate_cost(self, jobs: List[Job]) -> float:
                 logger.warning(f"Failed to estimate cost for job {job.id}: {e}")
                 continue
 
-        return total_cost
+        return total_cost
+    
@@ -1,14 +1,17 @@
 """
 PDF Utilities Module
 
-Provides utility functions for creating test PDFs.
+Provides utility functions for creating test PDFs, extracting text, and estimating token counts.
 """
 
 import re
 from pathlib import Path
-from typing import List
+from typing import List, Tuple, Optional
 
 import pypdf
+from ..utils import get_logger
+
+logger = get_logger(__name__)
 
 
 def create_pdf(pages: List[str]) -> bytes:
@@ -136,4 +139,117 @@ def is_textual_pdf(
 
     except Exception:
         # If PDF can't be read, assume it's not textual
-        return 0.0
+        return 0.0
+
+
+def extract_text_from_pdf(path: str | Path) -> str:
+    """
+    Return the text of every page of a PDF joined by blank lines.
+
+    Pages that yield only whitespace are skipped. A page whose extraction
+    raises is logged at debug level and skipped. If the file cannot be read
+    at all, a warning is logged and an empty string is returned.
+
+    Args:
+        path: Path to the PDF file
+
+    Returns:
+        str: Extracted text from all pages ("" on failure)
+    """
+    try:
+        pdf = pypdf.PdfReader(str(path))
+        chunks = []
+
+        for index, pdf_page in enumerate(pdf.pages):
+            try:
+                page_text = pdf_page.extract_text()
+                has_content = bool(page_text.strip())
+            except Exception as e:
+                # Best effort: one bad page must not discard the others.
+                logger.debug(f"Failed to extract text from page {index}: {e}")
+            else:
+                if has_content:
+                    chunks.append(page_text)
+
+        return "\n\n".join(chunks)
+
+    except Exception as e:
+        logger.warning(f"Failed to extract text from PDF {path}: {e}")
+        return ""
+
+
+def get_pdf_info(path: str | Path) -> Tuple[int, bool, Optional[str]]:
+    """
+    Collect the PDF facts needed for cost estimation.
+
+    Args:
+        path: Path to the PDF file
+
+    Returns:
+        Tuple of (page_count, is_textual, extracted_text)
+        - page_count: Number of pages in the PDF
+        - is_textual: True when the ``is_textual_pdf`` score exceeds 0.5 and
+          text extraction produced non-blank text
+        - extracted_text: Text content if textual, None otherwise
+
+        On any error, the error is logged and ``(0, False, None)`` is returned.
+    """
+    try:
+        page_count = len(pypdf.PdfReader(str(path)).pages)
+
+        extracted_text = None
+        # A score above 0.5 is treated as "textual".
+        is_textual = is_textual_pdf(path) > 0.5
+
+        if is_textual:
+            candidate = extract_text_from_pdf(path)
+            if candidate.strip():
+                extracted_text = candidate
+            else:
+                # The score said textual but nothing usable came out.
+                is_textual = False
+
+        text_length = len(extracted_text) if extracted_text else 0
+        logger.debug(f"PDF info for {path}: {page_count} pages, textual={is_textual}, "
+                     f"text_length={text_length}")
+
+        return page_count, is_textual, extracted_text
+
+    except Exception as e:
+        logger.error(f"Failed to get PDF info for {path}: {e}")
+        return 0, False, None
+
+
+def estimate_pdf_tokens(path: str | Path, prompt: Optional[str] = None,
+                       pdf_token_multiplier: float = 1.5,
+                       tokens_per_page: int = 2000) -> int:
+    """
+    Estimate token count for a PDF file.
+
+    This is a generic utility that can be used by any provider to estimate
+    tokens for PDF processing.
+
+    For PDFs with extractable text the estimate is the token count of that
+    text scaled by ``pdf_token_multiplier``. Otherwise it is
+    ``page_count * tokens_per_page``. In both cases the prompt's tokens are
+    added unscaled. If the PDF cannot be read, ``get_pdf_info`` reports zero
+    pages, so the result is just the prompt's token count.
+
+    Args:
+        path: Path to the PDF file
+        prompt: Optional prompt to include in token count
+        pdf_token_multiplier: Coefficient to apply to extracted text tokens
+                            to account for PDF processing overhead (default: 1.5)
+        tokens_per_page: Estimated tokens per page for image-based PDFs (default: 2000)
+
+    Returns:
+        Estimated token count
+    """
+    from .llm import token_count_simple
+
+    page_count, is_textual, extracted_text = get_pdf_info(path)
+
+    # The prompt is plain text in both branches, so count it once, unscaled.
+    prompt_tokens = token_count_simple(prompt) if prompt else 0
+
+    if is_textual and extracted_text:
+        # Count tokens from extracted text
+        base_tokens = token_count_simple(extracted_text)
+
+        # The multiplier models PDF processing overhead, so it scales only
+        # the extracted text, as documented for pdf_token_multiplier.
+        scaled_tokens = int(base_tokens * pdf_token_multiplier)
+        logger.debug(f"Textual PDF {path}: {page_count} pages, "
+                     f"base tokens: {base_tokens}, with {pdf_token_multiplier}x multiplier: {scaled_tokens}")
+        input_tokens = scaled_tokens + prompt_tokens
+    else:
+        # Estimate based on page count
+        input_tokens = page_count * tokens_per_page + prompt_tokens
+        logger.debug(f"Image-based PDF {path}: {page_count} pages, "
+                     f"estimated tokens: {input_tokens} ({tokens_per_page} per page)")
+
+    return input_tokens