AdemBoukhris457
diff --git a/docs/api/parsers.md b/docs/api/parsers.md
Lines changed: 11 additions & 4 deletions
diff --git a/docs/user-guide/core-concepts.md b/docs/user-guide/core-concepts.md
Lines changed: 59 additions & 6 deletions
diff --git a/docs/user-guide/engines/ocr-engine.md b/docs/user-guide/engines/ocr-engine.md
Lines changed: 112 additions & 8 deletions
diff --git a/doctra/engines/ocr/__init__.py b/doctra/engines/ocr/__init__.py
Lines changed: 3 additions & 2 deletions
diff --git a/doctra/engines/ocr/api.py b/doctra/engines/ocr/api.py
Lines changed: 32 additions & 0 deletions
@@ -194,10 +194,17 @@ parser.parse(
 
 | Parameter | Type | Default | Description |
 |-----------|------|---------|-------------|
-| `ocr_lang` | str | "eng" | Tesseract language code |
-| `ocr_psm` | int | 4 | Page segmentation mode |
-| `ocr_oem` | int | 3 | OCR engine mode |
-| `ocr_extra_config` | str | "" | Additional Tesseract configuration |
+| `ocr_engine` | str | "pytesseract" | OCR engine to use: "pytesseract" or "paddleocr" |
+| `ocr_lang` | str | "eng" | Tesseract language code (PyTesseract only) |
+| `ocr_psm` | int | 4 | Page segmentation mode (PyTesseract only) |
+| `ocr_oem` | int | 3 | OCR engine mode (PyTesseract only) |
+| `ocr_extra_config` | str | "" | Additional Tesseract configuration (PyTesseract only) |
+| `paddleocr_use_doc_orientation_classify` | bool | False | Enable document orientation classification (PaddleOCR only) |
+| `paddleocr_use_doc_unwarping` | bool | False | Enable text image rectification (PaddleOCR only) |
+| `paddleocr_use_textline_orientation` | bool | False | Enable text line orientation classification (PaddleOCR only) |
+| `paddleocr_device` | str | "gpu" | Device for PaddleOCR: "cpu" or "gpu" (PaddleOCR only) |
+
+**Note**: With `ocr_engine="paddleocr"`, the PP-OCRv5_server model (the PaddleOCR 3.0 default) is used. Model files are downloaded automatically on first use.
 
 ### VLM Parameters
 
 
@@ -171,18 +171,43 @@ This shows bounding boxes with colors:
 
 ## OCR Processing
 
-OCR (Optical Character Recognition) extracts text from images.
+OCR (Optical Character Recognition) extracts text from images. Doctra supports two OCR engines:
+
+### OCR Engines
+
+**PyTesseract** (default)
+:   Traditional Tesseract OCR with extensive language support and fine-grained control.
+
+**PaddleOCR**
+:   Advanced PP-OCRv5_server model (PaddleOCR 3.0) with superior accuracy and GPU acceleration.
 
 ### Configuration
 
+**Using PyTesseract (default):**
+
 ```python
 parser = StructuredPDFParser(
+    ocr_engine="pytesseract",  # Optional, this is the default
     ocr_lang="eng",  # Language
     ocr_psm=6,  # Page segmentation mode
     ocr_oem=3  # OCR Engine mode
 )
 ```
 
+**Using PaddleOCR:**
+
+```python
+parser = StructuredPDFParser(
+    ocr_engine="paddleocr",
+    paddleocr_device="gpu",  # Use "cpu" if no GPU available
+    paddleocr_use_doc_orientation_classify=False,
+    paddleocr_use_doc_unwarping=False,
+    paddleocr_use_textline_orientation=False
+)
+```
+
+### PyTesseract Parameters
+
 **ocr_lang**
 :   Tesseract language code. Examples: `eng`, `fra`, `spa`, `deu`
 
@@ -201,22 +226,50 @@ parser = StructuredPDFParser(
     - `1`: Neural nets LSTM
     - `3`: Default (both)
 
+### PaddleOCR Parameters
+
+**paddleocr_device**
+:   Processing device: `"gpu"` (default, recommended) or `"cpu"`
+
+**paddleocr_use_doc_orientation_classify**
+:   Enable automatic document orientation detection (default: `False`)
+
+**paddleocr_use_doc_unwarping**
+:   Enable perspective correction for scanned documents (default: `False`)
+
+**paddleocr_use_textline_orientation**
+:   Enable text line orientation classification (default: `False`)
+
 ### Improving OCR Accuracy
 
-1. **Increase DPI**: Higher resolution = better text recognition
+1. **Choose PaddleOCR for complex documents**: Better accuracy on degraded or complex documents
+   ```python
+   parser = StructuredPDFParser(
+       ocr_engine="paddleocr",
+       paddleocr_device="gpu"
+   )
+   ```
+
+2. **Increase DPI**: Higher resolution = better text recognition
    ```python
    parser = StructuredPDFParser(dpi=300)
    ```
 
-2. **Use Image Restoration**: Enhance document quality first
+3. **Use Image Restoration**: Enhance document quality first
    ```python
    from doctra import EnhancedPDFParser
-   parser = EnhancedPDFParser(use_image_restoration=True)
+   parser = EnhancedPDFParser(
+       use_image_restoration=True,
+       ocr_engine="paddleocr"  # Combine for best results
+   )
    ```
 
-3. **Correct Language**: Specify document language
+4. **Correct Language** (PyTesseract): Specify document language
    ```python
-   parser = StructuredPDFParser(ocr_lang="fra")  # French
+   parser = StructuredPDFParser(
+       ocr_engine="pytesseract",
+       ocr_lang="fra"  # French
+   )
    ```
 
 ## Image Restoration
 
@@ -4,21 +4,62 @@ Guide to text extraction using OCR in Doctra.
 
 ## Overview
 
-Doctra uses Tesseract OCR to extract text from document images. The OCR engine is highly configurable for different document types and languages.
+Doctra supports two OCR engines for text extraction:
 
-## Configuration
+1. **PyTesseract** (default) - Traditional Tesseract OCR engine with extensive language support
+2. **PaddleOCR** - Advanced PP-OCRv5_server model released in PaddleOCR 3.0, offering superior accuracy and performance
+
+You can choose between these engines based on your needs. PyTesseract is the default and works well for most use cases, while PaddleOCR provides enhanced accuracy for complex documents.
+
+## Choosing an OCR Engine
+
+### PyTesseract (Default)
+
+PyTesseract is the default OCR engine and works well for most documents. It offers extensive language support and fine-grained control.
 
 ```python
 from doctra import StructuredPDFParser
 
+# PyTesseract is the default - no need to specify
 parser = StructuredPDFParser(
     ocr_lang="eng",
     ocr_psm=6,
     ocr_oem=3
 )
+
+# Or explicitly specify it
+parser = StructuredPDFParser(
+    ocr_engine="pytesseract",
+    ocr_lang="eng",
+    ocr_psm=6,
+    ocr_oem=3
+)
+```
+
+### PaddleOCR with PP-OCRv5_server
+
+PaddleOCR provides the advanced **PP-OCRv5_server** model (default in PaddleOCR 3.0), which offers:
+
+- **Higher accuracy** for complex documents
+- **Better performance** on GPU
+- **Advanced text detection** and recognition
+- **Automatic model management** (models downloaded automatically)
+
+```python
+from doctra import StructuredPDFParser
+
+parser = StructuredPDFParser(
+    ocr_engine="paddleocr",
+    paddleocr_device="gpu",  # Use "cpu" if no GPU available
+    paddleocr_use_doc_orientation_classify=False,
+    paddleocr_use_doc_unwarping=False,
+    paddleocr_use_textline_orientation=False
+)
 ```
 
-## Parameters
+## PyTesseract Parameters
+
+These parameters are only used when `ocr_engine="pytesseract"` (or when using the default):
 
 **ocr_lang**
 :   Tesseract language code
@@ -41,40 +82,103 @@ parser = StructuredPDFParser(
     - `1`: Neural nets LSTM
     - `3`: Default (both)
 
+**ocr_extra_config**
+:   Additional Tesseract configuration string
+
+## PaddleOCR Parameters
+
+These parameters are only used when `ocr_engine="paddleocr"`:
+
+**paddleocr_device**
+:   Device to use for OCR processing
+    - `"gpu"`: Use GPU acceleration (default, recommended if available)
+    - `"cpu"`: Use CPU processing
+
+**paddleocr_use_doc_orientation_classify**
+:   Enable document orientation classification model (default: `False`)
+    - Automatically detects and corrects document orientation
+
+**paddleocr_use_doc_unwarping**
+:   Enable text image rectification model (default: `False`)
+    - Corrects perspective distortion in scanned documents
+
+**paddleocr_use_textline_orientation**
+:   Enable text line orientation classification model (default: `False`)
+    - Handles rotated text lines
+
+**Note**: PaddleOCR 3.0 uses the PP-OCRv5_server model by default. Model files are downloaded automatically on first use and cached for later runs.
+
 ## Improving Accuracy
 
-### 1. Increase DPI
+### 1. Choose the Right OCR Engine
+
+For complex documents or when accuracy is critical, consider using PaddleOCR:
+
+```python
+parser = StructuredPDFParser(
+    ocr_engine="paddleocr",
+    paddleocr_device="gpu"  # Use GPU for better performance
+)
+```
+
+### 2. Increase DPI
+
+Higher resolution improves text recognition for both engines:
 
 ```python
 parser = StructuredPDFParser(dpi=300)
 ```
 
-### 2. Use Image Restoration
+### 3. Use Image Restoration
+
+Enhance document quality before OCR:
 
 ```python
 from doctra import EnhancedPDFParser
 
 parser = EnhancedPDFParser(
-    use_image_restoration=True
+    use_image_restoration=True,
+    ocr_engine="paddleocr"  # Combine with PaddleOCR for best results
 )
 ```
 
-### 3. Correct Language
+### 4. Correct Language (PyTesseract)
+
+For PyTesseract, specify the document language:
 
 ```python
 parser = StructuredPDFParser(
+    ocr_engine="pytesseract",
     ocr_lang="fra"  # For French documents
 )
 ```
 
-## Multi-language Documents
+## Multi-language Documents (PyTesseract)
+
+PyTesseract supports multiple languages:
 
 ```python
 parser = StructuredPDFParser(
+    ocr_engine="pytesseract",
     ocr_lang="eng+fra+deu"  # Multiple languages
 )
 ```
 
+## When to Use Each Engine
+
+### Use PyTesseract when:
+- You are working with standard documents
+- You need multi-language support
+- You want fine-grained control over OCR parameters
+- You are running in a CPU-only environment
+
+### Use PaddleOCR when:
+- You are dealing with complex or degraded documents
+- You need maximum accuracy
+- You have a GPU available for faster processing
+- You are working with Asian languages (better support)
+- You are processing large batches of documents
+
 ## See Also
 
 - [Enhanced Parser](../parsers/enhanced-parser.md) - Improve OCR with restoration
 
@@ -1,4 +1,5 @@
 from .pytesseract_engine import PytesseractOCREngine
-from .api import ocr_image
+from .paddleocr_engine import PaddleOCREngine
+from .api import ocr_image, ocr_image_paddleocr
 
-__all__ = ["PytesseractOCREngine", "ocr_image"]
+__all__ = ["PytesseractOCREngine", "PaddleOCREngine", "ocr_image", "ocr_image_paddleocr"]
@@ -4,6 +4,7 @@
 from PIL import Image
 
 from .pytesseract_engine import PytesseractOCREngine
+from .paddleocr_engine import PaddleOCREngine
 
 
 def ocr_image(
@@ -34,3 +35,34 @@ def ocr_image(
         tesseract_cmd=tesseract_cmd, lang=lang, psm=psm, oem=oem, extra_config=extra_config
     )
     return engine.recognize(cropped_pil)
+
+
+def ocr_image_paddleocr(
+    cropped_pil: Image.Image,
+    *,
+    use_doc_orientation_classify: bool = False,
+    use_doc_unwarping: bool = False,
+    use_textline_orientation: bool = False,
+    device: str = "gpu",
+) -> str:
+    """
+    One-shot OCR: run PaddleOCR on a cropped PIL image and return text.
+
+    Convenience function that builds a new PaddleOCREngine on every call and
+    immediately runs recognize() on the provided image. For repeated OCR,
+    create one PaddleOCREngine yourself and reuse it instead.
+
+    :param cropped_pil: PIL Image object to perform OCR on
+    :param use_doc_orientation_classify: Enable document orientation classification (default: False)
+    :param use_doc_unwarping: Enable text image rectification (default: False)
+    :param use_textline_orientation: Enable text line orientation classification (default: False)
+    :param device: Device passed to PaddleOCREngine ("cpu" or "gpu", default: "gpu")
+    :return: Extracted text string from the image
+    """
+    engine = PaddleOCREngine(
+        use_doc_orientation_classify=use_doc_orientation_classify,
+        use_doc_unwarping=use_doc_unwarping,
+        use_textline_orientation=use_textline_orientation,
+        device=device
+    )
+    return engine.recognize(cropped_pil)