Skip to content

Commit 2d026e8

Browse files
committed
Update the manual_extraction example to match the simplified type specification in the spec.
1 parent 02e1e9c commit 2d026e8

File tree

2 files changed

+38
-24
lines changed

2 files changed

+38
-24
lines changed

examples/manual_extraction/manual_extraction.py

Lines changed: 37 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -32,43 +32,53 @@ def __call__(self, content: bytes) -> str:
3232

3333
@dataclasses.dataclass
3434
class ArgInfo:
35+
"""Information about an argument of a method."""
3536
name: str
3637
description: str
3738

3839
@dataclasses.dataclass
3940
class MethodInfo:
41+
"""Information about a method."""
4042
name: str
4143
args: cocoindex.typing.List[ArgInfo]
4244
description: str
4345

4446
@dataclasses.dataclass
4547
class ClassInfo:
48+
"""Information about a class."""
4649
name: str
4750
description: str
4851
methods: cocoindex.typing.List[MethodInfo]
4952

5053
@dataclasses.dataclass
5154
class ModuleInfo:
55+
"""Information about a Python module."""
5256
title: str
5357
description: str
5458
classes: cocoindex.typing.Table[ClassInfo]
5559
methods: cocoindex.typing.Table[MethodInfo]
5660

61+
@dataclasses.dataclass
62+
class ModuleSummary:
63+
"""Summary info about a Python module."""
64+
num_classes: int
65+
num_methods: int
5766

58-
class CleanUpManual(cocoindex.op.FunctionSpec):
59-
"""Clean up manual information."""
60-
61-
67+
@dataclasses.dataclass
68+
class SummarizeModule(cocoindex.op.FunctionSpec):
69+
"""Summarize a Python module."""
6270

6371
@cocoindex.op.executor_class()
64-
class CleanUpManualExecutor:
65-
"""Executor for CleanUpManual."""
72+
class SummarizeModuleExecutor:
73+
"""Executor for SummarizeModule."""
6674

67-
spec: CleanUpManual
75+
spec: SummarizeModule
6876

69-
def __call__(self, module_info: ModuleInfo) -> ModuleInfo | None:
70-
# TODO: Clean up
71-
return module_info
77+
def __call__(self, module_info: ModuleInfo) -> ModuleSummary:
78+
return ModuleSummary(
79+
num_classes=len(module_info.classes),
80+
num_methods=len(module_info.methods),
81+
)
7282

7383
@cocoindex.flow_def(name="ManualExtraction")
7484
def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
@@ -77,27 +87,31 @@ def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
7787
"""
7888
data_scope["documents"] = flow_builder.add_source(cocoindex.sources.LocalFile(path="manuals", binary=True))
7989

80-
manual_infos = data_scope.add_collector()
90+
modules_index = data_scope.add_collector()
8191

8292
with data_scope["documents"].row() as doc:
8393
doc["markdown"] = doc["content"].transform(PdfToMarkdown())
84-
doc["raw_module_info"] = doc["markdown"].transform(
94+
doc["module_info"] = doc["markdown"].transform(
8595
cocoindex.functions.ExtractByLlm(
86-
llm_spec=cocoindex.llm.LlmSpec(
87-
api_type=cocoindex.llm.LlmApiType.OLLAMA,
96+
llm_spec=cocoindex.LlmSpec(
97+
api_type=cocoindex.LlmApiType.OLLAMA,
8898
# See the full list of models: https://ollama.com/library
89-
model="llama3.2:latest"
99+
model="llama3.2"
90100
),
91101
# Replace by this spec below, to use OpenAI API model instead of ollama
92-
# llm_spec=cocoindex.llm.LlmSpec(
93-
# api_type=cocoindex.llm.LlmApiType.OPENAI, model="gpt-4o"),
94-
output_type=cocoindex.typing.encode_enriched_type(ModuleInfo),
102+
# llm_spec=cocoindex.LlmSpec(
103+
# api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
104+
output_type=ModuleInfo,
95105
instruction="Please extract Python module information from the manual."))
96-
doc["module_info"] = doc["raw_module_info"].transform(CleanUpManual())
97-
manual_infos.collect(filename=doc["filename"], module_info=doc["module_info"])
98-
99-
manual_infos.export(
100-
"manual_infos",
106+
doc["module_summary"] = doc["module_info"].transform(SummarizeModule())
107+
modules_index.collect(
108+
filename=doc["filename"],
109+
module_info=doc["module_info"],
110+
module_summary=doc["module_summary"],
111+
)
112+
113+
modules_index.export(
114+
"modules",
101115
cocoindex.storages.Postgres(),
102116
primary_key_fields=["filename"],
103117
)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "manual-extraction"
33
version = "0.1.0"
4-
description = "Simple example for cocoindex: extract manual information from a Markdown."
4+
description = "Simple example for cocoindex: extract structured information from a Markdown file."
55
requires-python = ">=3.10"
66
dependencies = ["cocoindex>=0.1.6", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2"]

0 commit comments

Comments
 (0)