Skip to content

Commit 46aa004

Browse files
authored
Add one more example skeleton for file content extraction. (#55)
Add one more example skeleton for file content extraction.
1 parent 9216313 commit 46aa004

File tree

8 files changed

+157
-0
lines changed

8 files changed

+157
-0
lines changed

examples/manual_extraction/.env

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Postgres database address for cocoindex
2+
COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
Simple example for cocoindex: extract structured information from a Markdown file.
2+
3+
## Prerequisite
4+
Follow the [Setup Postgres](../../#setup-postgres) section in the root directory to set up the Postgres database.
5+
6+
## Run
7+
8+
Install dependencies:
9+
10+
```bash
11+
pip install -e .
12+
```
13+
14+
Setup:
15+
16+
```bash
17+
python manual_extraction.py cocoindex setup
18+
```
19+
20+
Update index:
21+
22+
```bash
23+
python manual_extraction.py cocoindex update
24+
```
25+
26+
Run:
27+
28+
```bash
29+
python manual_extraction.py
30+
```
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
import tempfile
2+
import dataclasses
3+
4+
from dotenv import load_dotenv
5+
from marker.converters.pdf import PdfConverter
6+
from marker.models import create_model_dict
7+
from marker.output import text_from_rendered
8+
from marker.config.parser import ConfigParser
9+
10+
import cocoindex
11+
12+
class PdfToMarkdown(cocoindex.op.FunctionSpec):
    """Function spec for converting a PDF to Markdown.

    The spec declares no fields of its own; the conversion itself is
    implemented by the executor class whose ``spec`` attribute is annotated
    with this type.
    """
14+
15+
@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
class PdfToMarkdownExecutor:
    """Executor for PdfToMarkdown.

    Keeps one marker ``PdfConverter`` on the instance: ``prepare`` builds it
    and ``__call__`` reuses it for every PDF that is converted.
    """

    spec: PdfToMarkdown
    _converter: PdfConverter

    def prepare(self):
        """Build the marker PDF converter from an empty (default) configuration."""
        config_parser = ConfigParser({})
        self._converter = PdfConverter(create_model_dict(), config=config_parser.generate_config_dict())

    def __call__(self, content: bytes) -> str:
        """Convert the raw bytes of a PDF document into Markdown text.

        Args:
            content: The full binary content of the PDF file.

        Returns:
            The text element of the tuple produced by ``text_from_rendered``
            for the converted document.
        """
        # Local import keeps this block self-contained; the module does not
        # import ``os`` at file level.
        import os

        # The converter is handed a file *path*, so the bytes must live in a
        # real file. The file is written and closed inside a temporary
        # directory before conversion: a NamedTemporaryFile that is still open
        # cannot be reopened by name on Windows.
        with tempfile.TemporaryDirectory() as temp_dir:
            pdf_path = os.path.join(temp_dir, "input.pdf")
            with open(pdf_path, "wb") as pdf_file:
                pdf_file.write(content)
            text, _, _ = text_from_rendered(self._converter(pdf_path))
        return text
32+
33+
# Name and description of one argument; used as the element type of
# MethodInfo.args.
@dataclasses.dataclass
class ArgInfo:
    # Argument name.
    name: str
    # Text describing the argument.
    description: str
37+
38+
# Description of one method: its name, its arguments and a text description.
# Used for both ClassInfo.methods and ManualInfo.methods.
@dataclasses.dataclass
class MethodInfo:
    # Method name.
    name: str
    # One ArgInfo entry per argument.
    args: list[ArgInfo]
    # Text describing the method.
    description: str
43+
44+
# Description of one class: its name, a text description and its methods.
# Used as the element type of ManualInfo.classes.
@dataclasses.dataclass
class ClassInfo:
    # Class name.
    name: str
    # Text describing the class.
    description: str
    # One MethodInfo entry per method of the class.
    methods: list[MethodInfo]
49+
50+
# Top-level record returned by ExtractManualExecutor for one manual.
@dataclasses.dataclass
class ManualInfo:
    # Title of the manual.
    title: str
    # Text describing the manual.
    description: str
    # Classes listed in the manual.
    classes: list[ClassInfo]
    # Methods held directly on the manual, separate from those nested inside
    # each ClassInfo.
    methods: list[MethodInfo]
56+
57+
58+
class ExtractManual(cocoindex.op.FunctionSpec):
    """Function spec for extracting manual information from Markdown.

    The spec declares no fields of its own; the extraction itself is
    implemented by the executor class whose ``spec`` attribute is annotated
    with this type.
    """
60+
61+
@cocoindex.op.executor_class()
class ExtractManualExecutor:
    """Executor for ExtractManual.

    Skeleton implementation: the Markdown input is not inspected, and every
    call returns the same ManualInfo made of placeholder strings.
    """

    spec: ExtractManual

    def __call__(self, _markdown: str) -> ManualInfo:
        """Return a placeholder ManualInfo; ``_markdown`` is currently unused."""

        def placeholder_method() -> MethodInfo:
            # Built fresh on every call so the class-level and manual-level
            # method lists never share the same objects.
            placeholder_arg = ArgInfo(
                name="arg_name_placeholder",
                description="arg_description_placeholder",
            )
            return MethodInfo(
                name="method_name_placeholder",
                args=[placeholder_arg],
                description="method_description_placeholder",
            )

        placeholder_class = ClassInfo(
            name="class_name_placeholder",
            description="class_description_placeholder",
            methods=[placeholder_method()],
        )

        return ManualInfo(
            title="title_placeholder",
            description="description_placeholder",
            classes=[placeholder_class],
            methods=[placeholder_method()],
        )
92+
93+
@cocoindex.flow_def(name="ManualExtraction")
def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Define an example flow that reads files from ``pdf_files``, converts each
    one to Markdown, extracts manual information from that Markdown, and
    exports one ``manual_infos`` row per file, keyed by filename.
    """
    # Source: binary files under the local "pdf_files" path.
    pdf_source = cocoindex.sources.LocalFile(path="pdf_files", binary=True)
    data_scope["documents"] = flow_builder.add_source(pdf_source)

    manual_infos = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Step 1: PDF bytes -> Markdown.
        to_markdown = PdfToMarkdown()
        doc["markdown"] = doc["content"].transform(to_markdown)

        # Step 2: Markdown -> ManualInfo.
        extract_manual = ExtractManual()
        doc["manual_info"] = doc["markdown"].transform(extract_manual)

        manual_infos.collect(filename=doc["filename"], manual_info=doc["manual_info"])

    # Export the collected rows to Postgres.
    postgres_target = cocoindex.storages.Postgres()
    manual_infos.export("manual_infos", postgres_target, primary_key_fields=["filename"])
112+
113+
@cocoindex.main_fn()
def _run():
    """Script entry point; the body is intentionally empty.

    Everything this function does comes from the ``cocoindex.main_fn()``
    decorator (presumably the handling of the ``cocoindex setup`` /
    ``cocoindex update`` command-line forms; confirm against cocoindex docs).
    """
116+
117+
if __name__ == "__main__":
    # Load environment variables from a .env file before starting. The .env
    # shipped with this example defines COCOINDEX_DATABASE_URL.
    # NOTE(review): override=True presumably lets .env values replace
    # variables already set in the environment; confirm against python-dotenv.
    load_dotenv(override=True)
    _run()
371 KB
Binary file not shown.
424 KB
Binary file not shown.
190 KB
Binary file not shown.
320 KB
Binary file not shown.
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
[project]
2+
name = "manual-extraction"
3+
version = "0.1.0"
4+
description = "Simple example for cocoindex: extract manual information from a Markdown."
5+
requires-python = ">=3.10"
6+
dependencies = ["cocoindex>=0.1.4", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2"]

0 commit comments

Comments
 (0)