CodeCutTech
diff --git a/llm/smart_data_extraction_llamaindex/README.md b/llm/smart_data_extraction_llamaindex/README.md
Lines changed: 121 additions & 0 deletions
diff --git a/llm/smart_data_extraction_llamaindex/example_invoice.py b/llm/smart_data_extraction_llamaindex/example_invoice.py
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,121 @@
+# Generic Document Extraction Pipeline
+
+A flexible, schema-driven pipeline for extracting structured data from any type of document or image using LlamaParse and OpenAI.
+
+## Features
+
+- **Dynamic Schema Support**: Use any Pydantic model to define your extraction schema
+- **Optional Preprocessing**: Scale and optimize images before extraction
+- **Flexible Transformations**: Apply custom transformation functions to extracted data
+- **Extensible**: Easy to adapt for receipts, invoices, forms, IDs, or any document type
+
+## Quick Start
+
+### 1. Define Your Schema
+
+Create a Pydantic model for your document type:
+
+```python
+from pydantic import BaseModel, Field
+
+class Invoice(BaseModel):
+    invoice_number: str = Field(description="Invoice number")
+    vendor_name: str = Field(description="Vendor name")
+    total_amount: float = Field(description="Total amount")
+```
+
+### 2. Run Extraction
+
+```python
+from extract_receipts_pipeline import main
+
+result_df = main(
+    image_paths=["invoice1.pdf", "invoice2.pdf"],
+    output_cls=Invoice,
+    prompt="Extract invoice data from: {context_str}",
+    id_column="invoice_id",
+)
+```
+
+## Usage Examples
+
+### Basic Extraction (No Ground Truth)
+
+```python
+from schemas.receipt_schema import Receipt
+from extract_receipts_pipeline import main
+
+result = main(
+    image_paths=["receipt1.jpg"],
+    output_cls=Receipt,
+    prompt="Extract receipt data: {context_str}",
+)
+```
+
+### With Preprocessing
+
+```python
+from pathlib import Path
+
+result = main(
+    image_paths=["low_res.jpg"],
+    output_cls=Receipt,
+    prompt="Extract data: {context_str}",
+    preprocess=True,
+    output_dir=Path("processed_images"),
+    scale_factor=3,
+)
+```
+
+### With Custom Transformations
+
+```python
+import pandas as pd
+
+def transform_data(df: pd.DataFrame) -> pd.DataFrame:
+    df["vendor_name"] = df["vendor_name"].str.upper()
+    df["total_amount"] = pd.to_numeric(df["total_amount"], errors="coerce")
+    return df
+
+result = main(
+    image_paths=["invoice.pdf"],
+    output_cls=Invoice,
+    prompt="Extract: {context_str}",
+    transform_fn=transform_data,
+)
+```
+
+## Parameters
+
+### Required
+- `image_paths`: List of document/image paths
+- `output_cls`: Pydantic model class for extraction
+- `prompt`: Extraction prompt template (must include `{context_str}`)
+
+### Optional
+- `id_column`: Document ID column name (default: "document_id")
+- `fields`: Fields to extract (default: all model fields)
+- `preprocess`: Enable image preprocessing (default: False)
+- `output_dir`: Directory for preprocessed images
+- `scale_factor`: Image scaling factor (default: 3)
+- `transform_fn`: Custom transformation function applied to the extracted data; it takes a `pd.DataFrame` and returns a `pd.DataFrame`
+
+## File Structure
+
+```
+llm/smart_data_extraction_llamaindex/
+├── extract_receipts_pipeline.py   # Main pipeline
+├── schemas/
+│   ├── __init__.py
+│   └── receipt_schema.py          # Receipt schema example
+├── example_invoice.py             # Invoice extraction example
+└── README.md                      # This file
+```
+
+## Custom Schema Examples
+
+See:
+- `schemas/receipt_schema.py` - Receipt extraction
+- `example_invoice.py` - Invoice extraction example
+
+Create your own schemas in the `schemas/` directory!
@@ -0,0 +1,65 @@
+"""Example: Using the generic extraction pipeline with a custom invoice schema."""
+
+from typing import Optional
+
+import pandas as pd
+
+# Import the generic extraction pipeline
+from extract_receipts_pipeline import main
+from pydantic import BaseModel, Field
+
+
+# Define custom schema for invoices
+class Invoice(BaseModel):
+	"""Pydantic schema describing the fields to extract from an invoice.
+
+	Passed as ``output_cls`` to the extraction pipeline's ``main``.
+	``invoice_number``, ``vendor_name`` and ``total_amount`` are declared
+	without defaults; ``invoice_date`` and ``tax_amount`` default to ``None``.
+	Each field's ``description`` is the human-readable hint attached to it.
+	"""
+
+	invoice_number: str = Field(description="Invoice number or ID")
+	vendor_name: str = Field(description="Vendor or supplier name")
+	# Kept as a plain string; this schema does not parse or validate dates.
+	invoice_date: Optional[str] = Field(default=None, description="Invoice date")
+	total_amount: float = Field(description="Total invoice amount")
+	tax_amount: Optional[float] = Field(default=None, description="Tax amount if present")
+
+
+# Optional: Define transformation function
+def transform_invoice_data(df: pd.DataFrame) -> pd.DataFrame:
+	"""Normalize extracted invoice columns and return the result as a new DataFrame.
+
+	Upper-cases ``vendor_name`` and converts ``total_amount`` and
+	``tax_amount`` to numeric values; entries that cannot be parsed as
+	numbers become NaN.
+
+	Args:
+		df: DataFrame with ``vendor_name``, ``total_amount`` and
+			``tax_amount`` columns.
+
+	Returns:
+		A transformed copy of ``df``; the input DataFrame is not modified.
+
+	Raises:
+		KeyError: If one of the three columns is missing from ``df``.
+	"""
+	# Work on a copy so the caller's DataFrame is left untouched.
+	df = df.copy()
+	df["vendor_name"] = df["vendor_name"].str.upper()
+	# errors="coerce" turns unparseable values into NaN instead of raising.
+	df["total_amount"] = pd.to_numeric(df["total_amount"], errors="coerce")
+	df["tax_amount"] = pd.to_numeric(df["tax_amount"], errors="coerce")
+	return df
+
+
+# Extraction prompt template passed to the pipeline as `prompt`.
+# It must keep the `{context_str}` placeholder.
+# NOTE(review): presumably the pipeline substitutes the parsed document text
+# for `{context_str}` - confirm against extract_receipts_pipeline.
+INVOICE_PROMPT = """
+You are extracting structured data from an invoice document.
+Use the provided text to populate the Invoice model accurately.
+If a field is not present in the document, return null.
+
+{context_str}
+"""
+
+
+if __name__ == "__main__":
+	# Example usage - replace with your actual invoice paths
+	invoice_paths = [
+		"path/to/invoice1.pdf",
+		"path/to/invoice2.pdf",
+	]
+
+	# Run extraction with the Invoice schema, prompt and transform defined above.
+	# NOTE(review): `main` is imported from extract_receipts_pipeline and is
+	# assumed to return a pandas DataFrame, since the result is written with
+	# .to_csv() below - confirm against the pipeline.
+	result_df = main(
+		image_paths=invoice_paths,
+		output_cls=Invoice,
+		prompt=INVOICE_PROMPT,
+		id_column="invoice_id",
+		# Every Invoice field is listed explicitly.
+		fields=["invoice_number", "vendor_name", "invoice_date", "total_amount", "tax_amount"],
+		transform_fn=transform_invoice_data,
+	)
+
+	print("\nExtracted Invoices:")
+	print(result_df)
+
+	# Save results to the current working directory; index=False leaves the
+	# DataFrame index out of the CSV.
+	result_df.to_csv("extracted_invoices.csv", index=False)
+	print("\nResults saved to extracted_invoices.csv")