Skip to content

Commit e69565c

Browse files
committed
feat: update the table extraction example and response handling
1 parent ed3557b commit e69565c

File tree

6 files changed

+59
-77
lines changed

6 files changed

+59
-77
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from pdf2table.frameworks.pipeline import create_pipeline
2+
3+
# Create the extraction pipeline with configuration
4+
pipeline = create_pipeline(
5+
device="cpu",
6+
detection_threshold=0.9,
7+
structure_threshold=0.6,
8+
pdf_dpi=300,
9+
load_ocr=False,
10+
visualize=False
11+
)
12+
13+
pdf_path = "tests/samples/oxford-textbook-of-medicine-693.pdf"
14+
15+
# Extract tables from a specific page
16+
response = pipeline.extract_tables(pdf_path=pdf_path, page_number=0)
17+
18+
# Or extract tables from all pages
19+
# response = pipeline.extract_tables(pdf_path=pdf_path)
20+
21+
# Check if extraction was successful
22+
if response.success:
23+
print(f"Successfully extracted {len(response.tables)} tables")
24+
25+
# Access extracted tables
26+
for table in response.tables:
27+
print(f"Table with {len(table.grid.cells)} cells")
28+
print(f"Grid size: {table.grid.n_rows} x {table.grid.n_cols}")
29+
30+
# Convert to dictionary format
31+
result_dict = response.to_dict()
32+
print(result_dict)
33+
34+
# Save results to JSON file
35+
response.save_to_json("data/extracted_tables.json")
36+
else:
37+
print(f"Extraction failed: {response.error_message}")

pdf2table/usecases/dtos.py

Lines changed: 21 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,33 @@
11
import json
22
from typing import List
33

4-
from pdf2table.entities.table_entities import DetectedTable
4+
from pdf2table.entities.table_entities import DetectedTable
55

66

7-
class TableExtractionResponse:
7+
class TableExtractionResponse:
88
def __init__(self, tables: List[DetectedTable], source_file: str):
99
self.tables = tables
1010
self.source_file = source_file
1111
self.success = True
1212
self.error_message = None
13-
13+
1414
@classmethod
1515
def error(cls, error_message: str, source_file: str):
1616
"""Create error response."""
1717
response = cls([], source_file)
1818
response.success = False
1919
response.error_message = error_message
2020
return response
21-
21+
2222
def to_dict(self):
2323
"""Convert response to dictionary format."""
2424
if not self.success:
2525
return {
2626
"success": False,
2727
"error": self.error_message,
28-
"source_file": self.source_file
28+
"source_file": self.source_file,
2929
}
30-
30+
3131
return {
3232
"success": True,
3333
"source_file": self.source_file,
@@ -37,16 +37,24 @@ def to_dict(self):
3737
"data": table.grid.to_row_format() if table.grid else [],
3838
}
3939
for table in self.tables
40-
]
40+
],
4141
}
42-
42+
4343
def save_to_json(self, output_path: str):
4444
"""
45-
Save the extraction response to a JSON file.
46-
45+
Save the extracted tables to a JSON file.
46+
4747
Args:
4848
output_path: Path where the JSON file will be saved
4949
"""
50-
with open(output_path, 'w', encoding='utf-8') as f:
51-
json.dump(self.to_dict(), f, indent=2, ensure_ascii=False)
52-
50+
result_dict = {
51+
"tables": [
52+
{
53+
"metadata": table.metadata,
54+
"data": table.grid.to_row_format() if table.grid else [],
55+
}
56+
for table in self.tables
57+
]
58+
}
59+
with open(output_path, "w", encoding="utf-8") as f:
60+
json.dump(result_dict, f, indent=2, ensure_ascii=False)

tests/README.md

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,4 @@
1-
# Test Suite Organization
2-
3-
This directory contains a well-organized test suite following Clean Architecture principles.
4-
5-
## Structure
6-
7-
```
8-
tests/
9-
├── unit/ # Fast unit tests with mocking
10-
│ ├── test_entities_and_use_cases.py # Core business logic tests
11-
│ └── usecases/ # Use case specific unit tests
12-
├── integration/ # Integration tests with real dependencies
13-
│ └── test_table_extraction.py # End-to-end table extraction tests
14-
├── samples/ # Sample files for testing
15-
├── README.md
16-
└── __init__.py
17-
```
18-
19-
## Running Tests
1+
# Running Tests
202

213
```bash
224
# Run fast unit tests
@@ -28,9 +10,3 @@ python -m unittest discover -s tests/integration
2810
# Run all tests
2911
python -m unittest discover -s tests
3012
```
31-
32-
## Test Principles
33-
34-
1. **Unit tests** should be fast and use mocking for external dependencies
35-
2. **Integration tests** can use real models but should be clearly marked
36-
3. Each layer should be testable in isolation

tests/integration/__init__.py

Whitespace-only changes.

tests/integration/test_table_extraction.py

Lines changed: 0 additions & 39 deletions
This file was deleted.
525 KB
Binary file not shown.

0 commit comments

Comments
 (0)