refactor: simplify the workflow by removing the request DTO

Alijanloo · Alijanloo · commit e0965c5abc78 · 2025-10-17T17:21:52.000+03:30
diff --git a/README.md b/README.md
@@ -38,15 +38,16 @@ pip install -e .
 ### Usage
 ```python
 from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
-from pdf2table.usecases.dtos import TableExtractionRequest
 
 # Initialize the factory
 factory = TableExtractionFactory()
 adapter = factory.create_table_extraction_adapter()
 
-# Extract tables from PDF
-request = TableExtractionRequest(pdf_path="document.pdf", page_number=0)
-response = adapter.extract_tables(request)
+# Extract tables from a specific page
+response = adapter.extract_tables(pdf_path="document.pdf", page_number=0)
+
+# Or extract tables from all pages
+response = adapter.extract_tables(pdf_path="document.pdf")
 
 # Access extracted tables
 for table in response.tables:
@@ -72,18 +73,14 @@ service = TableExtractionService(device="cpu")
 page_result = service.extract_tables_from_page("document.pdf", page_number=0)
 print(f"Found {len(page_result['tables'])} tables on page 0")
 
-# Extract tables from entire PDF
+# Extract tables from the entire PDF (all pages)
 all_results = service.extract_tables_from_pdf("document.pdf")
-for page_idx, page_result in enumerate(all_results):
-    if page_result.get('success', True):
-        tables = page_result.get('tables', [])
-        print(f"Page {page_idx}: Found {len(tables)} tables")
-        
-        # Process each table
-        for table_idx, table in enumerate(tables):
-            print(f"  Table {table_idx + 1}: {table['rows']} rows x {table['columns']} columns")
-    else:
-        print(f"Page {page_idx}: Error - {page_result.get('error', 'Unknown error')}")
+tables = all_results.get('tables', [])
+print(f"Found {len(tables)} total tables across all pages")
+
+# Process each table
+for table_idx, table in enumerate(tables):
+    print(f"  Table {table_idx + 1}: {table['metadata']}")
 ```
 
 ## 📋 Logging
diff --git a/docs/architecture_guide.md b/docs/architecture_guide.md
@@ -34,19 +34,21 @@ pdf2table/
 ### 2. Use Cases Layer (`pdf2table/usecases/`)
 - **table_extraction_use_case.py**: Application business logic
   - `TableExtractionUseCase`: Orchestrates table extraction workflow
+    - `extract_tables(pdf_path, page_number=None)`: Main extraction method
   - `TableGridBuilder`: Builds structured grids from detected cells
   - Contains the core algorithms for grouping rows/columns and building grids
 - **services/table_services.py**: Supporting services for use cases
   - `TableValidationService`: Validates detected table structures and cells
   - `CoordinateClusteringService`: Clusters coordinates for row/column grouping
 - **dtos.py**: Data transfer objects for use cases
-  - `TableExtractionRequest`: Request DTO for table extraction
   - `TableExtractionResponse`: Response DTO for table extraction
 
 ### 3. Interface Adapters Layer (`pdf2table/adaptors/`)
-- **table_extraction_ports.py**: Abstract interfaces and DTOs
-  - Port interfaces: `PDFImageExtractorPort`, `TableDetectorPort`, etc.
+- **table_extraction_adaptor.py**: Adapter for table extraction
   - `TableExtractionAdapter`: Coordinates between use cases and external interfaces
+    - `extract_tables(pdf_path, page_number=None)`: Main adapter method
+      - Accepts `pdf_path` and optional `page_number`
+      - Returns `TableExtractionResponse`
 
 ### 4. Frameworks & Drivers Layer (`pdf2table/frameworks/`)
 - **pdf_image_extractor.py**: PyMuPDF implementation
@@ -61,14 +63,18 @@ pdf2table/
 from pdf2table.frameworks.table_extraction_factory import TableExtractionService
 
 service = TableExtractionService(device="cpu")
-result = service.extract_tables_from_page(pdf_path, page_number)
+
+# Extract from a specific page
+result = service.extract_tables_from_page(pdf_path, page_number=0)
 tables = result["tables"]
+
+# Or extract from all pages
+all_results = service.extract_tables_from_pdf(pdf_path)
 ```
 
 ### Usage (Advanced)
 ```python
 from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
-from pdf2table.usecases.dtos import TableExtractionRequest
 
 # Create with custom configuration
 adapter = TableExtractionFactory.create_table_extraction_adapter(
@@ -77,7 +83,9 @@ adapter = TableExtractionFactory.create_table_extraction_adapter(
     structure_threshold=0.7
 )
 
-# Use the adapter
-request = TableExtractionRequest(pdf_path, page_number)
-response = adapter.extract_tables(request)
+# Extract from a specific page
+response = adapter.extract_tables(pdf_path, page_number=0)
+
+# Or extract from all pages
+response = adapter.extract_tables(pdf_path)
 ```
diff --git a/pdf2table/adaptors/table_extraction_adaptor.py b/pdf2table/adaptors/table_extraction_adaptor.py
@@ -1,29 +1,33 @@
-from pdf2table.usecases.dtos import TableExtractionRequest, TableExtractionResponse
+from typing import Optional
+from pdf2table.usecases.dtos import TableExtractionResponse
 from pdf2table.usecases.table_extraction_use_case import TableExtractionUseCase
 
 
 class TableExtractionAdapter:
     """Adapter that orchestrates table extraction using the use case."""
-    
+
     def __init__(self, table_extraction_use_case: TableExtractionUseCase):
         self._use_case = table_extraction_use_case
-    
-    def extract_tables(self, request: TableExtractionRequest) -> TableExtractionResponse:
+
+    def extract_tables(
+        self, pdf_path: str, page_number: Optional[int] = None
+    ) -> TableExtractionResponse:
+        """
+        Extract tables from a PDF document.
+
+        Args:
+            pdf_path: Path to the PDF file
+            page_number: Optional page number to extract. If None, extracts from all pages.
+
+        Returns:
+            TableExtractionResponse containing extracted tables
+        """
         try:
-            tables = self._use_case.extract_tables_from_page(
-                request.pdf_path, 
-                request.page_number
-            )
-            
-            return TableExtractionResponse(
-                tables=tables,
-                page_number=request.page_number,
-                source_file=request.pdf_path
-            )
-            
+            tables = self._use_case.extract_tables(pdf_path, page_number)
+
+            return TableExtractionResponse(tables=tables, source_file=pdf_path)
+
         except Exception as e:
             return TableExtractionResponse.error(
-                error_message=str(e),
-                page_number=request.page_number,
-                source_file=request.pdf_path
+                error_message=str(e), source_file=pdf_path
             )
diff --git a/pdf2table/frameworks/table_extraction_factory.py b/pdf2table/frameworks/table_extraction_factory.py
@@ -1,5 +1,4 @@
 from pdf2table.usecases.table_extraction_use_case import TableExtractionUseCase
-from pdf2table.usecases.dtos import TableExtractionRequest
 from pdf2table.adaptors.table_extraction_adaptor import TableExtractionAdapter
 from pdf2table.frameworks.pdf_image_extractor import PyMuPDFImageExtractor
 from pdf2table.frameworks.table_transformer_detector import TableTransformerDetector
@@ -27,26 +26,28 @@ def create_table_extraction_adapter(
         visualization_save_dir: str = "data/table_visualizations",  # Optional save dir
     ) -> TableExtractionAdapter:
         """Create a fully configured table extraction adapter."""
-        
-        logger.info(f"Creating table extraction adapter - Device: {device}, "
-                   f"Detection threshold: {detection_threshold}, "
-                   f"Structure threshold: {structure_threshold}, "
-                   f"PDF DPI: {pdf_dpi}, OCR: {load_ocr}, Visualize: {visualize}")
+
+        logger.info(
+            f"Creating table extraction adapter - Device: {device}, "
+            f"Detection threshold: {detection_threshold}, "
+            f"Structure threshold: {structure_threshold}, "
+            f"PDF DPI: {pdf_dpi}, OCR: {load_ocr}, Visualize: {visualize}"
+        )
 
         # Create framework implementations (outermost layer)
         logger.debug("Initializing PDF image extractor")
         pdf_extractor = PyMuPDFImageExtractor(dpi=pdf_dpi)
-        
+
         logger.debug("Initializing table transformer detector")
         table_detector = TableTransformerDetector(
             device=device, confidence_threshold=detection_threshold
         )
-        
+
         logger.debug("Initializing table structure recognizer")
         structure_recognizer = TableTransformerStructureRecognizer(
             device=device, confidence_threshold=structure_threshold
         )
-        
+
         if load_ocr:
             logger.debug("Initializing OCR service")
             ocr_service = TrOCRService(device=device)
@@ -66,7 +67,7 @@ def create_table_extraction_adapter(
 
         logger.debug("Creating table extraction adapter")
         adapter = TableExtractionAdapter(table_extraction_use_case)
-        
+
         logger.info("Table extraction adapter created successfully")
         return adapter
 
@@ -87,12 +88,13 @@ def __init__(self, device: str = "cpu"):
     def extract_tables_from_page(self, pdf_path: str, page_number: int) -> dict:
         """Extract tables from a single PDF page."""
         logger.info(f"Extracting tables from {pdf_path}, page {page_number}")
-        request = TableExtractionRequest(pdf_path, page_number)
         try:
-            response = self._adapter.extract_tables(request)
+            response = self._adapter.extract_tables(pdf_path, page_number)
             result = response.to_dict()
-            tables_count = len(result.get('tables', []))
-            logger.info(f"Successfully extracted {tables_count} tables from page {page_number}")
+            tables_count = len(result.get("tables", []))
+            logger.info(
+                f"Successfully extracted {tables_count} tables from page {page_number}"
+            )
             return result
         except Exception as e:
             logger.error(f"Failed to extract tables from page {page_number}: {e}")
@@ -101,32 +103,23 @@ def extract_tables_from_page(self, pdf_path: str, page_number: int) -> dict:
     def extract_tables_from_pdf(self, pdf_path: str) -> list[dict]:
         """Extract tables from all pages of a PDF."""
         logger.info(f"Starting table extraction from entire PDF: {pdf_path}")
-        
-        from pdf2table.frameworks.pdf_image_extractor import PyMuPDFImageExtractor
-
-        # Get page count
-        pdf_extractor = PyMuPDFImageExtractor()
-        page_count = pdf_extractor.get_page_count(pdf_path)
-        logger.info(f"PDF has {page_count} pages")
-
-        results = []
-        successful_pages = 0
-        
-        for page_number in range(page_count):
-            try:
-                result = self.extract_tables_from_page(pdf_path, page_number)
-                results.append(result)
-                successful_pages += 1
-            except Exception as e:
-                logger.error(f"Failed to process page {page_number}: {e}")
-                results.append(
-                    {
-                        "success": False,
-                        "error": str(e),
-                        "page_number": page_number,
-                        "source_file": pdf_path,
-                    }
+
+        try:
+            response = self._adapter.extract_tables(pdf_path)
+            result = response.to_dict()
+
+            if result.get("success"):
+                tables_count = len(result.get("tables", []))
+                logger.info(
+                    f"Successfully extracted {tables_count} tables from entire PDF"
                 )
-        
-        logger.info(f"Completed PDF processing - {successful_pages}/{page_count} pages successful")
-        return results
+            else:
+                logger.error(
+                    f"Failed to extract tables from PDF: {result.get('error')}"
+                )
+
+            return result
+
+        except Exception as e:
+            logger.error(f"Failed to extract tables from PDF: {e}")
+            raise
diff --git a/pdf2table/usecases/dtos.py b/pdf2table/usecases/dtos.py
@@ -3,24 +3,17 @@
 from pdf2table.entities.table_entities import  DetectedTable
 
 
-class TableExtractionRequest:    
-    def __init__(self, pdf_path: str, page_number: int):
-        self.pdf_path = pdf_path
-        self.page_number = page_number
-
-
 class TableExtractionResponse:    
-    def __init__(self, tables: List[DetectedTable], page_number: int, source_file: str):
+    def __init__(self, tables: List[DetectedTable], source_file: str):
         self.tables = tables
-        self.page_number = page_number
         self.source_file = source_file
         self.success = True
         self.error_message = None
     
     @classmethod
-    def error(cls, error_message: str, page_number: int, source_file: str):
+    def error(cls, error_message: str, source_file: str):
         """Create error response."""
-        response = cls([], page_number, source_file)
+        response = cls([], source_file)
         response.success = False
         response.error_message = error_message
         return response
@@ -31,21 +24,16 @@ def to_dict(self):
             return {
                 "success": False,
                 "error": self.error_message,
-                "page_number": self.page_number,
                 "source_file": self.source_file
             }
         
         return {
             "success": True,
-            "page_number": self.page_number,
             "source_file": self.source_file,
             "tables": [
                 {
                     "metadata": table.metadata,
                     "data": table.grid.to_row_format() if table.grid else [],
-                    "box": table.detection_box.to_list(),
-                    "n_rows": table.grid.n_rows if table.grid else 0,
-                    "n_cols": table.grid.n_cols if table.grid else 0,
                 }
                 for table in self.tables
             ]
diff --git a/pdf2table/usecases/table_extraction_use_case.py b/pdf2table/usecases/table_extraction_use_case.py
@@ -41,6 +41,36 @@ def __init__(
         self._visualize = visualize
         self._visualization_save_dir = visualization_save_dir
 
+    def extract_tables(
+        self, pdf_path: str, page_number: Optional[int] = None
+    ) -> List[DetectedTable]:
+        """
+        Extract all tables from a PDF document.
+
+        Args:
+            pdf_path: Path to the PDF file
+            page_number: Optional page number to extract. If None, extracts from all pages.
+
+        Returns:
+            List of DetectedTable objects from the specified page(s)
+        """
+        if page_number is not None:
+            return self.extract_tables_from_page(pdf_path, page_number)
+
+        # Extract from all pages
+        page_count = self.pdf_extractor.get_page_count(pdf_path)
+        all_tables = []
+
+        for page_num in range(page_count):
+            try:
+                tables = self.extract_tables_from_page(pdf_path, page_num)
+                all_tables.extend(tables)
+            except Exception as e:
+                print(f"Error processing page {page_num}: {e}")
+                continue
+
+        return all_tables
+
     def extract_tables_from_page(
         self, pdf_path: str, page_number: int
     ) -> List[DetectedTable]:
diff --git a/tests/integration/test_table_extraction.py b/tests/integration/test_table_extraction.py
@@ -6,7 +6,6 @@
 import os
 
 from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
-from pdf2table.usecases.dtos import TableExtractionRequest
 
 
 def test_actual_table_extraction():
@@ -15,19 +14,18 @@ def test_actual_table_extraction():
     adapter = TableExtractionFactory.create_table_extraction_adapter(visualize=True)
 
     # Sample PDF path
-    pdf_path = "tests/samples/A_Comprehensive_Review_of_Low_Rank_Adaptation_in_Large_Language_Models_for_Efficient_Parameter_Tuning-1.pdf"
+    pdf_path = "data/oxford-textbook-of-medicine-693.pdf"
 
     if not os.path.exists(pdf_path):
         print(f"❌ PDF file not found: {pdf_path}")
         return
 
     try:
         print(f"\n🔍 Testing with sample PDF: {os.path.basename(pdf_path)}")
-        request = TableExtractionRequest(pdf_path, 4)
-        result = adapter.extract_tables(request).to_dict()
+        result = adapter.extract_tables(pdf_path, page_number=0).to_dict()
 
         if result["success"]:
-            print("✅ Successfully processed page 4")
+            print("✅ Successfully processed page 0")
             print(f"📊 Found {len(result['tables'])} table(s)")
         else:
             print(f"⚠️ Processing failed: {result.get('error', 'Unknown error')}")
diff --git a/tests/unit/test_entities_and_use_cases.py b/tests/unit/test_entities_and_use_cases.py