Skip to content

Commit e0965c5

Browse files
committed
refactor: simplify the workflow by removing the request dto
1 parent dfafcf8 commit e0965c5

File tree

8 files changed

+127
-114
lines changed

8 files changed

+127
-114
lines changed

README.md

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,16 @@ pip install -e .
3838
### Usage
3939
```python
4040
from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
41-
from pdf2table.usecases.dtos import TableExtractionRequest
4241

4342
# Initialize the factory
4443
factory = TableExtractionFactory()
4544
adapter = factory.create_table_extraction_adapter()
4645

47-
# Extract tables from PDF
48-
request = TableExtractionRequest(pdf_path="document.pdf", page_number=0)
49-
response = adapter.extract_tables(request)
46+
# Extract tables from a specific page
47+
response = adapter.extract_tables(pdf_path="document.pdf", page_number=0)
48+
49+
# Or extract tables from all pages
50+
response = adapter.extract_tables(pdf_path="document.pdf")
5051

5152
# Access extracted tables
5253
for table in response.tables:
@@ -72,18 +73,14 @@ service = TableExtractionService(device="cpu")
7273
page_result = service.extract_tables_from_page("document.pdf", page_number=0)
7374
print(f"Found {len(page_result['tables'])} tables on page 0")
7475

75-
# Extract tables from entire PDF
76+
# Extract tables from entire PDF (all pages)
7677
all_results = service.extract_tables_from_pdf("document.pdf")
77-
for page_idx, page_result in enumerate(all_results):
78-
if page_result.get('success', True):
79-
tables = page_result.get('tables', [])
80-
print(f"Page {page_idx}: Found {len(tables)} tables")
81-
82-
# Process each table
83-
for table_idx, table in enumerate(tables):
84-
print(f" Table {table_idx + 1}: {table['rows']} rows x {table['columns']} columns")
85-
else:
86-
print(f"Page {page_idx}: Error - {page_result.get('error', 'Unknown error')}")
78+
tables = all_results.get('tables', [])
79+
print(f"Found {len(tables)} total tables across all pages")
80+
81+
# Process each table
82+
for table_idx, table in enumerate(tables):
83+
print(f" Table {table_idx + 1}: {table['metadata']}")
8784
```
8885

8986
## 📋 Logging

docs/architecture_guide.md

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,21 @@ pdf2table/
3434
### 2. Use Cases Layer (`pdf2table/usecases/`)
3535
- **table_extraction_use_case.py**: Application business logic
3636
- `TableExtractionUseCase`: Orchestrates table extraction workflow
37+
- `extract_tables(pdf_path, page_number=None)`: Main extraction method
3738
- `TableGridBuilder`: Builds structured grids from detected cells
3839
- Contains the core algorithms for grouping rows/columns and building grids
3940
- **services/table_services.py**: Supporting services for use cases
4041
- `TableValidationService`: Validates detected table structures and cells
4142
- `CoordinateClusteringService`: Clusters coordinates for row/column grouping
4243
- **dtos.py**: Data transfer objects for use cases
43-
- `TableExtractionRequest`: Request DTO for table extraction
4444
- `TableExtractionResponse`: Response DTO for table extraction
4545

4646
### 3. Interface Adapters Layer (`pdf2table/adaptors/`)
47-
- **table_extraction_ports.py**: Abstract interfaces and DTOs
48-
- Port interfaces: `PDFImageExtractorPort`, `TableDetectorPort`, etc.
47+
- **table_extraction_adaptor.py**: Adapter for table extraction
4948
- `TableExtractionAdapter`: Coordinates between use cases and external interfaces
49+
- `extract_tables(pdf_path, page_number=None)`: Main adapter method
50+
- Accepts `pdf_path` and optional `page_number`
51+
- Returns `TableExtractionResponse`
5052

5153
### 4. Frameworks & Drivers Layer (`pdf2table/frameworks/`)
5254
- **pdf_image_extractor.py**: PyMuPDF implementation
@@ -61,14 +63,18 @@ pdf2table/
6163
from pdf2table.frameworks.table_extraction_factory import TableExtractionService
6264

6365
service = TableExtractionService(device="cpu")
64-
result = service.extract_tables_from_page(pdf_path, page_number)
66+
67+
# Extract from a specific page
68+
result = service.extract_tables_from_page(pdf_path, page_number=0)
6569
tables = result["tables"]
70+
71+
# Or extract from all pages
72+
all_results = service.extract_tables_from_pdf(pdf_path)
6673
```
6774

6875
### Usage (Advanced)
6976
```python
7077
from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
71-
from pdf2table.usecases.dtos import TableExtractionRequest
7278

7379
# Create with custom configuration
7480
adapter = TableExtractionFactory.create_table_extraction_adapter(
@@ -77,7 +83,9 @@ adapter = TableExtractionFactory.create_table_extraction_adapter(
7783
structure_threshold=0.7
7884
)
7985

80-
# Use the adapter
81-
request = TableExtractionRequest(pdf_path, page_number)
82-
response = adapter.extract_tables(request)
86+
# Extract from a specific page
87+
response = adapter.extract_tables(pdf_path, page_number=0)
88+
89+
# Or extract from all pages
90+
response = adapter.extract_tables(pdf_path)
8391
```
pdf2table/adaptors/table_extraction_adaptor.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,33 @@
1-
from pdf2table.usecases.dtos import TableExtractionRequest, TableExtractionResponse
1+
from typing import Optional
2+
from pdf2table.usecases.dtos import TableExtractionResponse
23
from pdf2table.usecases.table_extraction_use_case import TableExtractionUseCase
34

45

56
class TableExtractionAdapter:
67
"""Adapter that orchestrates table extraction using the use case."""
7-
8+
89
def __init__(self, table_extraction_use_case: TableExtractionUseCase):
910
self._use_case = table_extraction_use_case
10-
11-
def extract_tables(self, request: TableExtractionRequest) -> TableExtractionResponse:
11+
12+
def extract_tables(
13+
self, pdf_path: str, page_number: Optional[int] = None
14+
) -> TableExtractionResponse:
15+
"""
16+
Extract tables from a PDF document.
17+
18+
Args:
19+
pdf_path: Path to the PDF file
20+
page_number: Optional page number to extract. If None, extracts from all pages.
21+
22+
Returns:
23+
TableExtractionResponse containing extracted tables
24+
"""
1225
try:
13-
tables = self._use_case.extract_tables_from_page(
14-
request.pdf_path,
15-
request.page_number
16-
)
17-
18-
return TableExtractionResponse(
19-
tables=tables,
20-
page_number=request.page_number,
21-
source_file=request.pdf_path
22-
)
23-
26+
tables = self._use_case.extract_tables(pdf_path, page_number)
27+
28+
return TableExtractionResponse(tables=tables, source_file=pdf_path)
29+
2430
except Exception as e:
2531
return TableExtractionResponse.error(
26-
error_message=str(e),
27-
page_number=request.page_number,
28-
source_file=request.pdf_path
32+
error_message=str(e), source_file=pdf_path
2933
)

pdf2table/frameworks/table_extraction_factory.py

Lines changed: 35 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from pdf2table.usecases.table_extraction_use_case import TableExtractionUseCase
2-
from pdf2table.usecases.dtos import TableExtractionRequest
32
from pdf2table.adaptors.table_extraction_adaptor import TableExtractionAdapter
43
from pdf2table.frameworks.pdf_image_extractor import PyMuPDFImageExtractor
54
from pdf2table.frameworks.table_transformer_detector import TableTransformerDetector
@@ -27,26 +26,28 @@ def create_table_extraction_adapter(
2726
visualization_save_dir: str = "data/table_visualizations", # Optional save dir
2827
) -> TableExtractionAdapter:
2928
"""Create a fully configured table extraction adapter."""
30-
31-
logger.info(f"Creating table extraction adapter - Device: {device}, "
32-
f"Detection threshold: {detection_threshold}, "
33-
f"Structure threshold: {structure_threshold}, "
34-
f"PDF DPI: {pdf_dpi}, OCR: {load_ocr}, Visualize: {visualize}")
29+
30+
logger.info(
31+
f"Creating table extraction adapter - Device: {device}, "
32+
f"Detection threshold: {detection_threshold}, "
33+
f"Structure threshold: {structure_threshold}, "
34+
f"PDF DPI: {pdf_dpi}, OCR: {load_ocr}, Visualize: {visualize}"
35+
)
3536

3637
# Create framework implementations (outermost layer)
3738
logger.debug("Initializing PDF image extractor")
3839
pdf_extractor = PyMuPDFImageExtractor(dpi=pdf_dpi)
39-
40+
4041
logger.debug("Initializing table transformer detector")
4142
table_detector = TableTransformerDetector(
4243
device=device, confidence_threshold=detection_threshold
4344
)
44-
45+
4546
logger.debug("Initializing table structure recognizer")
4647
structure_recognizer = TableTransformerStructureRecognizer(
4748
device=device, confidence_threshold=structure_threshold
4849
)
49-
50+
5051
if load_ocr:
5152
logger.debug("Initializing OCR service")
5253
ocr_service = TrOCRService(device=device)
@@ -66,7 +67,7 @@ def create_table_extraction_adapter(
6667

6768
logger.debug("Creating table extraction adapter")
6869
adapter = TableExtractionAdapter(table_extraction_use_case)
69-
70+
7071
logger.info("Table extraction adapter created successfully")
7172
return adapter
7273

@@ -87,12 +88,13 @@ def __init__(self, device: str = "cpu"):
8788
def extract_tables_from_page(self, pdf_path: str, page_number: int) -> dict:
8889
"""Extract tables from a single PDF page."""
8990
logger.info(f"Extracting tables from {pdf_path}, page {page_number}")
90-
request = TableExtractionRequest(pdf_path, page_number)
9191
try:
92-
response = self._adapter.extract_tables(request)
92+
response = self._adapter.extract_tables(pdf_path, page_number)
9393
result = response.to_dict()
94-
tables_count = len(result.get('tables', []))
95-
logger.info(f"Successfully extracted {tables_count} tables from page {page_number}")
94+
tables_count = len(result.get("tables", []))
95+
logger.info(
96+
f"Successfully extracted {tables_count} tables from page {page_number}"
97+
)
9698
return result
9799
except Exception as e:
98100
logger.error(f"Failed to extract tables from page {page_number}: {e}")
@@ -101,32 +103,23 @@ def extract_tables_from_page(self, pdf_path: str, page_number: int) -> dict:
101103
def extract_tables_from_pdf(self, pdf_path: str) -> list[dict]:
102104
"""Extract tables from all pages of a PDF."""
103105
logger.info(f"Starting table extraction from entire PDF: {pdf_path}")
104-
105-
from pdf2table.frameworks.pdf_image_extractor import PyMuPDFImageExtractor
106-
107-
# Get page count
108-
pdf_extractor = PyMuPDFImageExtractor()
109-
page_count = pdf_extractor.get_page_count(pdf_path)
110-
logger.info(f"PDF has {page_count} pages")
111-
112-
results = []
113-
successful_pages = 0
114-
115-
for page_number in range(page_count):
116-
try:
117-
result = self.extract_tables_from_page(pdf_path, page_number)
118-
results.append(result)
119-
successful_pages += 1
120-
except Exception as e:
121-
logger.error(f"Failed to process page {page_number}: {e}")
122-
results.append(
123-
{
124-
"success": False,
125-
"error": str(e),
126-
"page_number": page_number,
127-
"source_file": pdf_path,
128-
}
106+
107+
try:
108+
response = self._adapter.extract_tables(pdf_path)
109+
result = response.to_dict()
110+
111+
if result.get("success"):
112+
tables_count = len(result.get("tables", []))
113+
logger.info(
114+
f"Successfully extracted {tables_count} tables from entire PDF"
129115
)
130-
131-
logger.info(f"Completed PDF processing - {successful_pages}/{page_count} pages successful")
132-
return results
116+
else:
117+
logger.error(
118+
f"Failed to extract tables from PDF: {result.get('error')}"
119+
)
120+
121+
return result
122+
123+
except Exception as e:
124+
logger.error(f"Failed to extract tables from PDF: {e}")
125+
raise

pdf2table/usecases/dtos.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,24 +3,17 @@
33
from pdf2table.entities.table_entities import DetectedTable
44

55

6-
class TableExtractionRequest:
7-
def __init__(self, pdf_path: str, page_number: int):
8-
self.pdf_path = pdf_path
9-
self.page_number = page_number
10-
11-
126
class TableExtractionResponse:
13-
def __init__(self, tables: List[DetectedTable], page_number: int, source_file: str):
7+
def __init__(self, tables: List[DetectedTable], source_file: str):
148
self.tables = tables
15-
self.page_number = page_number
169
self.source_file = source_file
1710
self.success = True
1811
self.error_message = None
1912

2013
@classmethod
21-
def error(cls, error_message: str, page_number: int, source_file: str):
14+
def error(cls, error_message: str, source_file: str):
2215
"""Create error response."""
23-
response = cls([], page_number, source_file)
16+
response = cls([], source_file)
2417
response.success = False
2518
response.error_message = error_message
2619
return response
@@ -31,21 +24,16 @@ def to_dict(self):
3124
return {
3225
"success": False,
3326
"error": self.error_message,
34-
"page_number": self.page_number,
3527
"source_file": self.source_file
3628
}
3729

3830
return {
3931
"success": True,
40-
"page_number": self.page_number,
4132
"source_file": self.source_file,
4233
"tables": [
4334
{
4435
"metadata": table.metadata,
4536
"data": table.grid.to_row_format() if table.grid else [],
46-
"box": table.detection_box.to_list(),
47-
"n_rows": table.grid.n_rows if table.grid else 0,
48-
"n_cols": table.grid.n_cols if table.grid else 0,
4937
}
5038
for table in self.tables
5139
]

pdf2table/usecases/table_extraction_use_case.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,36 @@ def __init__(
4141
self._visualize = visualize
4242
self._visualization_save_dir = visualization_save_dir
4343

44+
def extract_tables(
45+
self, pdf_path: str, page_number: Optional[int] = None
46+
) -> List[DetectedTable]:
47+
"""
48+
Extract tables from a single page, or from every page, of a PDF document.
49+
50+
Args:
51+
pdf_path: Path to the PDF file
52+
page_number: Optional page number to extract. If None, extracts from all pages.
53+
54+
Returns:
55+
List of DetectedTable objects from the specified page(s)
56+
"""
57+
if page_number is not None:
58+
return self.extract_tables_from_page(pdf_path, page_number)
59+
60+
# Extract from all pages
61+
page_count = self.pdf_extractor.get_page_count(pdf_path)
62+
all_tables = []
63+
64+
for page_num in range(page_count):
65+
try:
66+
tables = self.extract_tables_from_page(pdf_path, page_num)
67+
all_tables.extend(tables)
68+
except Exception as e:
69+
print(f"Error processing page {page_num}: {e}")
70+
continue
71+
72+
return all_tables
73+
4474
def extract_tables_from_page(
4575
self, pdf_path: str, page_number: int
4676
) -> List[DetectedTable]:

tests/integration/test_table_extraction.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import os
77

88
from pdf2table.frameworks.table_extraction_factory import TableExtractionFactory
9-
from pdf2table.usecases.dtos import TableExtractionRequest
109

1110

1211
def test_actual_table_extraction():
@@ -15,19 +14,18 @@ def test_actual_table_extraction():
1514
adapter = TableExtractionFactory.create_table_extraction_adapter(visualize=True)
1615

1716
# Sample PDF path
18-
pdf_path = "tests/samples/A_Comprehensive_Review_of_Low_Rank_Adaptation_in_Large_Language_Models_for_Efficient_Parameter_Tuning-1.pdf"
17+
pdf_path = "data/oxford-textbook-of-medicine-693.pdf"
1918

2019
if not os.path.exists(pdf_path):
2120
print(f"❌ PDF file not found: {pdf_path}")
2221
return
2322

2423
try:
2524
print(f"\n🔍 Testing with sample PDF: {os.path.basename(pdf_path)}")
26-
request = TableExtractionRequest(pdf_path, 4)
27-
result = adapter.extract_tables(request).to_dict()
25+
result = adapter.extract_tables(pdf_path, page_number=0).to_dict()
2826

2927
if result["success"]:
30-
print("✅ Successfully processed page 4")
28+
print("✅ Successfully processed page 0")
3129
print(f"📊 Found {len(result['tables'])} table(s)")
3230
else:
3331
print(f"⚠️ Processing failed: {result.get('error', 'Unknown error')}")

0 commit comments

Comments
 (0)