Skip to content

Commit f01c686

Browse files
committed
refactor: enhance response handling
1 parent 7a33115 commit f01c686

File tree

8 files changed

+80
-53
lines changed

8 files changed

+80
-53
lines changed

README.md

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,6 @@ This project provides a robust solution for extracting tabular data from PDF doc
1717
- Table structure recognition using Table Transformer
1818
- Clean architecture with separation of concerns
1919

20-
## Project Structure
21-
22-
- `pdf2table/`: Main package
23-
- `adaptors/`: Interface with external systems (PDF reader, Table Transformer)
24-
- `entities/`: Domain models
25-
- `usecases/`: Application logic
26-
- `frameworks/`: UI and infrastructure
27-
- `tests/`: Unit tests
28-
- `adaptors/`: Tests for adaptors
29-
- `samples/`: Sample PDFs for testing
30-
3120
## Installation
3221

3322
```bash
@@ -50,19 +39,28 @@ pipeline = create_pipeline(
5039
)
5140

5241
# Extract tables from a specific page
53-
tables = pipeline.extract_tables(pdf_path="document.pdf", page_number=0)
42+
response = pipeline.extract_tables(pdf_path="document.pdf", page_number=0)
5443

5544
# Or extract tables from all pages
56-
all_tables = pipeline.extract_tables(pdf_path="document.pdf")
45+
response = pipeline.extract_tables(pdf_path="document.pdf")
5746

58-
# Access extracted tables
59-
for table in tables:
60-
print(f"Table with {len(table.grid.cells)} cells")
61-
print(f"Grid size: {table.grid.n_rows} x {table.grid.n_cols}")
47+
# Check if extraction was successful
48+
if response.success:
49+
print(f"Successfully extracted {len(response.tables)} tables")
50+
51+
# Access extracted tables
52+
for table in response.tables:
53+
print(f"Table with {len(table.grid.cells)} cells")
54+
print(f"Grid size: {table.grid.n_rows} x {table.grid.n_cols}")
55+
56+
# Convert to dictionary format
57+
result_dict = response.to_dict()
58+
print(result_dict)
6259

63-
# Convert to structured format
64-
table_data = table.to_dict()
65-
print(table_data)
60+
# Save results to a JSON file (the output/ directory must already exist)
61+
response.save_to_json("output/extracted_tables.json")
62+
else:
63+
print(f"Extraction failed: {response.error_message}")
6664
```
6765

6866
### Configuration Options

docs/architecture_guide.md renamed to docs/project_tree.md

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ pdf2table/
66
├── entities/
77
│ └── table_entities.py
88
├── usecases/
9+
│ ├── dtos.py
910
│ ├── services/
1011
│ │ └── table_services.py
1112
│ ├── interfaces/
@@ -33,9 +34,11 @@ pdf2table/
3334
- **table_extraction_use_case.py**: Application business logic
3435
- `TableExtractionUseCase`: Orchestrates table extraction workflow
3536
- `extract_tables(pdf_path, page_number=None)`: Main extraction method
36-
- Returns list of `DetectedTable` objects
37+
- Returns `TableExtractionResponse` object with extracted tables and metadata
3738
- `TableGridBuilder`: Builds structured grids from detected cells
3839
- Contains the core algorithms for grouping rows/columns and building grids
40+
- **dtos.py**: Data Transfer Objects
41+
- `TableExtractionResponse`: Response object for table extraction
3942
- **services/table_services.py**: Supporting services for use cases
4043
- `TableValidationService`: Validates detected table structures and cells
4144
- `CoordinateClusteringService`: Clusters coordinates for row/column grouping
@@ -64,13 +67,20 @@ use_case = create_pipeline(
6467
)
6568

6669
# Extract from a specific page
67-
tables = use_case.extract_tables(pdf_path, page_number=0)
70+
response = use_case.extract_tables(pdf_path, page_number=0)
6871

6972
# Or extract from all pages
70-
all_tables = use_case.extract_tables(pdf_path)
73+
response = use_case.extract_tables(pdf_path)
7174

72-
# Process the results
73-
for table in tables:
74-
print(f"Found table with {table.grid.n_rows} rows and {table.grid.n_cols} columns")
75-
table_dict = table.to_dict()
75+
# Check if extraction was successful
76+
if response.success:
77+
# Process the results
78+
for table in response.tables:
79+
print(f"Found table with {table.grid.n_rows} rows and {table.grid.n_cols} columns")
80+
table_dict = table.to_dict()
81+
82+
# Save to a JSON file (the output/ directory must already exist)
83+
response.save_to_json("output/tables.json")
84+
else:
85+
print(f"Error: {response.error_message}")
7686
```

pdf2table/usecases/dtos.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
from typing import List
23

34
from pdf2table.entities.table_entities import DetectedTable
@@ -38,4 +39,14 @@ def to_dict(self):
3839
for table in self.tables
3940
]
4041
}
42+
43+
def save_to_json(self, output_path: str):
44+
"""
45+
Save the extraction response to a JSON file.
46+
47+
Args:
48+
output_path: Path where the JSON file will be saved
49+
"""
50+
with open(output_path, 'w', encoding='utf-8') as f:
51+
json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
4152

pdf2table/usecases/interfaces/framework_interfaces.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ def extract_page_image(self, pdf_path: str, page_number: int) -> PageImage:
1515
"""Extract image from PDF page."""
1616
pass
1717

18+
@abstractmethod
19+
def get_page_count(self, pdf_path: str) -> int:
20+
"""Get total number of pages in PDF."""
21+
pass
1822

1923
class TableDetectorInterface(ABC):
2024
"""Abstract interface for table detection."""

pdf2table/usecases/table_extraction_use_case.py

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
PageImage,
55
DetectedTable,
66
)
7+
from pdf2table.usecases.dtos import TableExtractionResponse
78
from pdf2table.usecases.services.table_services import (
89
TableValidationService,
910
)
@@ -43,7 +44,7 @@ def __init__(
4344

4445
def extract_tables(
4546
self, pdf_path: str, page_number: Optional[int] = None
46-
) -> List[DetectedTable]:
47+
) -> TableExtractionResponse:
4748
"""
4849
Extract all tables from a PDF document.
4950
@@ -52,26 +53,29 @@ def extract_tables(
5253
page_number: Optional page number to extract. If None, extracts from all pages.
5354
5455
Returns:
55-
List of DetectedTable objects from the specified page(s)
56+
TableExtractionResponse object containing extracted tables
5657
"""
57-
if page_number is not None:
58-
return self.extract_tables_from_page(pdf_path, page_number)
59-
60-
# Extract from all pages
61-
page_count = self.pdf_extractor.get_page_count(pdf_path)
62-
all_tables = []
63-
64-
for page_num in range(page_count):
65-
try:
66-
tables = self.extract_tables_from_page(pdf_path, page_num)
67-
all_tables.extend(tables)
68-
except Exception as e:
69-
print(f"Error processing page {page_num}: {e}")
70-
continue
71-
72-
return all_tables
73-
74-
def extract_tables_from_page(
58+
try:
59+
if page_number is not None:
60+
tables = self._extract_tables_from_page(pdf_path, page_number)
61+
return TableExtractionResponse(tables, pdf_path)
62+
63+
page_count = self.pdf_extractor.get_page_count(pdf_path)
64+
all_tables = []
65+
66+
for page_num in range(page_count):
67+
try:
68+
tables = self._extract_tables_from_page(pdf_path, page_num)
69+
all_tables.extend(tables)
70+
except Exception as e:
71+
print(f"Error processing page {page_num}: {e}")
72+
continue
73+
74+
return TableExtractionResponse(all_tables, pdf_path)
75+
except Exception as e:
76+
return TableExtractionResponse.error(str(e), pdf_path)
77+
78+
def _extract_tables_from_page(
7579
self, pdf_path: str, page_number: int
7680
) -> List[DetectedTable]:
7781
"""Extract all tables from a PDF page."""

tests/integration/test_table_extraction.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,13 @@
55

66
import os
77

8-
from pdf2table.frameworks.pipeline import TableExtractionFactory
8+
from pdf2table.frameworks.pipeline import create_pipeline
99

1010

1111
def test_actual_table_extraction():
1212
"""Test table extraction on the sample PDF"""
1313
print("✅ Creating table extraction pipeline...")
14-
use_case = TableExtractionFactory.create_pipeline(visualize=True)
14+
use_case = create_pipeline(visualize=True)
1515

1616
pdf_path = "data/oxford-textbook-of-medicine-693.pdf"
1717

tests/unit/usecases/test_extraction_integration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ def test_end_to_end_table_extraction(self):
8484
)
8585

8686
# Act
87-
results = use_case.extract_tables_from_page("sample.pdf", 0)
87+
results = use_case._extract_tables_from_page("sample.pdf", 0)
8888

8989
# Assert
9090
assert len(results) == 1

tests/unit/usecases/test_table_extraction.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ def test_extract_tables_from_page_success(self):
7979
self.mock_ocr_service.extract_text.return_value = "test text"
8080

8181
# Act
82-
result = self.use_case.extract_tables_from_page(pdf_path, page_number)
82+
result = self.use_case._extract_tables_from_page(pdf_path, page_number)
8383

8484
# Assert
8585
assert len(result) >= 1
@@ -123,7 +123,7 @@ def test_extract_tables_from_page_no_valid_structure(self):
123123
self.mock_structure_recognizer.recognize_structure.return_value = [cell1]
124124

125125
# Act
126-
result = self.use_case.extract_tables_from_page(pdf_path, page_number)
126+
result = self.use_case._extract_tables_from_page(pdf_path, page_number)
127127

128128
# Assert
129129
assert len(result) == 0 # No valid tables should be returned
@@ -190,7 +190,7 @@ def mock_recognize_structure(page_image, table_box):
190190
self.mock_ocr_service.extract_text.return_value = "test"
191191

192192
# Act
193-
result = self.use_case.extract_tables_from_page(pdf_path, page_number)
193+
result = self.use_case._extract_tables_from_page(pdf_path, page_number)
194194

195195
# Assert
196196
assert len(result) >= 1 # Only the second table should succeed

0 commit comments

Comments
 (0)