refactor: return TableExtractionResponse from extract_tables and add JSON export

Alijanloo · Alijanloo · commit f01c68600d86 · 2025-10-17T18:31:57.000+03:30
diff --git a/README.md b/README.md
@@ -17,17 +17,6 @@ This project provides a robust solution for extracting tabular data from PDF doc
 - Table structure recognition using Table Transformer
 - Clean architecture with separation of concerns
 
-## Project Structure
-
-- `pdf2table/`: Main package
-  - `adaptors/`: Interface with external systems(PDF reader, Table Transformer)
-  - `entities/`: Domain models
-  - `usecases/`: Application logic
-  - `frameworks/`: UI and infrastructure
-- `tests/`: Unit tests
-  - `adaptors/`: Tests for adaptors
-  - `samples/`: Sample PDFs for testing
-
 ## Installation
 
 ```bash
@@ -50,19 +39,28 @@ pipeline = create_pipeline(
 )
 
 # Extract tables from a specific page
-tables = pipeline.extract_tables(pdf_path="document.pdf", page_number=0)
+response = pipeline.extract_tables(pdf_path="document.pdf", page_number=0)
 
 # Or extract tables from all pages
-all_tables = pipeline.extract_tables(pdf_path="document.pdf")
+response = pipeline.extract_tables(pdf_path="document.pdf")
 
-# Access extracted tables
-for table in tables:
-    print(f"Table with {len(table.grid.cells)} cells")
-    print(f"Grid size: {table.grid.n_rows} x {table.grid.n_cols}")
+# Check if extraction was successful
+if response.success:
+    print(f"Successfully extracted {len(response.tables)} tables")
+    
+    # Access extracted tables
+    for table in response.tables:
+        print(f"Table with {len(table.grid.cells)} cells")
+        print(f"Grid size: {table.grid.n_rows} x {table.grid.n_cols}")
+    
+    # Convert to dictionary format
+    result_dict = response.to_dict()
+    print(result_dict)
     
-    # Convert to structured format
-    table_data = table.to_dict()
-    print(table_data)
+    # Save results to JSON file (the "output" directory must already exist)
+    response.save_to_json("output/extracted_tables.json")
+else:
+    print(f"Extraction failed: {response.error_message}")
 ```
 
 ### Configuration Options
diff --git a/docs/project_tree.md b/docs/project_tree.md
@@ -6,6 +6,7 @@ pdf2table/
 ├── entities/
 │   └── table_entities.py
 ├── usecases/
+│   ├── dtos.py
 │   ├── services/
 │   │   └── table_services.py
 │   ├── interfaces/
@@ -33,9 +34,11 @@ pdf2table/
 - **table_extraction_use_case.py**: Application business logic
   - `TableExtractionUseCase`: Orchestrates table extraction workflow
     - `extract_tables(pdf_path, page_number=None)`: Main extraction method
-    - Returns list of `DetectedTable` objects
+    - Returns a `TableExtractionResponse` object with the extracted tables and metadata
   - `TableGridBuilder`: Builds structured grids from detected cells
   - Contains the core algorithms for grouping rows/columns and building grids
+- **dtos.py**: Data Transfer Objects
+  - `TableExtractionResponse`: Response object for table extraction
 - **services/table_services.py**: Supporting services for use cases
   - `TableValidationService`: Validates detected table structures and cells
   - `CoordinateClusteringService`: Clusters coordinates for row/column grouping
@@ -64,13 +67,20 @@ use_case = create_pipeline(
 )
 
 # Extract from a specific page
-tables = use_case.extract_tables(pdf_path, page_number=0)
+response = use_case.extract_tables(pdf_path, page_number=0)
 
 # Or extract from all pages
-all_tables = use_case.extract_tables(pdf_path)
+response = use_case.extract_tables(pdf_path)
 
-# Process the results
-for table in tables:
-    print(f"Found table with {table.grid.n_rows} rows and {table.grid.n_cols} columns")
-    table_dict = table.to_dict()
+# Check if extraction was successful
+if response.success:
+    # Process the results
+    for table in response.tables:
+        print(f"Found table with {table.grid.n_rows} rows and {table.grid.n_cols} columns")
+        table_dict = table.to_dict()
+    
+    # Save to JSON file (the "output" directory must already exist)
+    response.save_to_json("output/tables.json")
+else:
+    print(f"Error: {response.error_message}")
 ```
diff --git a/pdf2table/usecases/dtos.py b/pdf2table/usecases/dtos.py
@@ -1,3 +1,4 @@
+import json
 from typing import List
 
 from pdf2table.entities.table_entities import  DetectedTable
@@ -38,4 +39,14 @@ def to_dict(self):
                 for table in self.tables
             ]
         }
+    
+    def save_to_json(self, output_path: str):
+        """Write ``self.to_dict()`` to a JSON file (UTF-8, 2-space indent).
+
+        Args:
+            output_path: Destination path, opened in 'w' mode: an existing file
+                is overwritten; open() does not create missing parent directories.
+        """
+        with open(output_path, 'w', encoding='utf-8') as f:
+            json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
 
diff --git a/pdf2table/usecases/interfaces/framework_interfaces.py b/pdf2table/usecases/interfaces/framework_interfaces.py
@@ -15,6 +15,10 @@ def extract_page_image(self, pdf_path: str, page_number: int) -> PageImage:
         """Extract image from PDF page."""
         pass
 
+    @abstractmethod
+    def get_page_count(self, pdf_path: str) -> int:
+        """Return the total number of pages in the PDF at ``pdf_path``."""
+        pass
 
 class TableDetectorInterface(ABC):
     """Abstract interface for table detection."""
diff --git a/pdf2table/usecases/table_extraction_use_case.py b/pdf2table/usecases/table_extraction_use_case.py
@@ -4,6 +4,7 @@
     PageImage,
     DetectedTable,
 )
+from pdf2table.usecases.dtos import TableExtractionResponse
 from pdf2table.usecases.services.table_services import (
     TableValidationService,
 )
@@ -43,7 +44,7 @@ def __init__(
 
     def extract_tables(
         self, pdf_path: str, page_number: Optional[int] = None
-    ) -> List[DetectedTable]:
+    ) -> TableExtractionResponse:
         """
         Extract all tables from a PDF document.
 
@@ -52,26 +53,29 @@ def extract_tables(
             page_number: Optional page number to extract. If None, extracts from all pages.
 
         Returns:
-            List of DetectedTable objects from the specified page(s)
+            TableExtractionResponse with the extracted tables, or an error response if extraction fails
         """
-        if page_number is not None:
-            return self.extract_tables_from_page(pdf_path, page_number)
-
-        # Extract from all pages
-        page_count = self.pdf_extractor.get_page_count(pdf_path)
-        all_tables = []
-
-        for page_num in range(page_count):
-            try:
-                tables = self.extract_tables_from_page(pdf_path, page_num)
-                all_tables.extend(tables)
-            except Exception as e:
-                print(f"Error processing page {page_num}: {e}")
-                continue
-
-        return all_tables
-
-    def extract_tables_from_page(
+        try:
+            if page_number is not None:
+                tables = self._extract_tables_from_page(pdf_path, page_number)
+                return TableExtractionResponse(tables, pdf_path)
+
+            page_count = self.pdf_extractor.get_page_count(pdf_path)
+            all_tables = []
+
+            for page_num in range(page_count):
+                try:
+                    tables = self._extract_tables_from_page(pdf_path, page_num)
+                    all_tables.extend(tables)
+                except Exception as e:
+                    print(f"Error processing page {page_num}: {e}")
+                    continue
+
+            return TableExtractionResponse(all_tables, pdf_path)
+        except Exception as e:
+            return TableExtractionResponse.error(str(e), pdf_path)
+
+    def _extract_tables_from_page(
         self, pdf_path: str, page_number: int
     ) -> List[DetectedTable]:
         """Extract all tables from a PDF page."""
diff --git a/tests/integration/test_table_extraction.py b/tests/integration/test_table_extraction.py
@@ -5,13 +5,13 @@
 
 import os
 
-from pdf2table.frameworks.pipeline import TableExtractionFactory
+from pdf2table.frameworks.pipeline import create_pipeline
 
 
 def test_actual_table_extraction():
     """Test table extraction on the sample PDF"""
     print("✅ Creating table extraction pipeline...")
-    use_case = TableExtractionFactory.create_pipeline(visualize=True)
+    use_case = create_pipeline(visualize=True)
 
     pdf_path = "data/oxford-textbook-of-medicine-693.pdf"
 
diff --git a/tests/unit/usecases/test_extraction_integration.py b/tests/unit/usecases/test_extraction_integration.py
@@ -84,7 +84,7 @@ def test_end_to_end_table_extraction(self):
         )
 
         # Act
-        results = use_case.extract_tables_from_page("sample.pdf", 0)
+        results = use_case._extract_tables_from_page("sample.pdf", 0)
 
         # Assert
         assert len(results) == 1
diff --git a/tests/unit/usecases/test_table_extraction.py b/tests/unit/usecases/test_table_extraction.py
@@ -79,7 +79,7 @@ def test_extract_tables_from_page_success(self):
         self.mock_ocr_service.extract_text.return_value = "test text"
         
         # Act
-        result = self.use_case.extract_tables_from_page(pdf_path, page_number)
+        result = self.use_case._extract_tables_from_page(pdf_path, page_number)
         
         # Assert
         assert len(result) >= 1
@@ -123,7 +123,7 @@ def test_extract_tables_from_page_no_valid_structure(self):
         self.mock_structure_recognizer.recognize_structure.return_value = [cell1]
         
         # Act
-        result = self.use_case.extract_tables_from_page(pdf_path, page_number)
+        result = self.use_case._extract_tables_from_page(pdf_path, page_number)
         
         # Assert
         assert len(result) == 0  # No valid tables should be returned
@@ -190,7 +190,7 @@ def mock_recognize_structure(page_image, table_box):
         self.mock_ocr_service.extract_text.return_value = "test"
         
         # Act
-        result = self.use_case.extract_tables_from_page(pdf_path, page_number)
+        result = self.use_case._extract_tables_from_page(pdf_path, page_number)
         
         # Assert
         assert len(result) >= 1  # Only the second table should succeed

Original file line number	Diff line number	Diff line change
`@@ -84,7 +84,7 @@ def test_end_to_end_table_extraction(self):`
`84`	`84`	`)`
`85`	`85`
`86`	`86`	`# Act`
`87`		`- results = use_case.extract_tables_from_page("sample.pdf", 0)`
	`87`	`+ results = use_case._extract_tables_from_page("sample.pdf", 0)`
`88`	`88`
`89`	`89`	`# Assert`
`90`	`90`	`assert len(results) == 1`