@@ -32,43 +32,53 @@ def __call__(self, content: bytes) -> str:
3232
3333@dataclasses .dataclass
3434class ArgInfo :
35+ """Information about an argument of a method."""
3536 name : str
3637 description : str
3738
3839@dataclasses .dataclass
3940class MethodInfo :
41+ """Information about a method."""
4042 name : str
4143 args : cocoindex .typing .List [ArgInfo ]
4244 description : str
4345
4446@dataclasses .dataclass
4547class ClassInfo :
48+ """Information about a class."""
4649 name : str
4750 description : str
4851 methods : cocoindex .typing .List [MethodInfo ]
4952
5053@dataclasses .dataclass
5154class ModuleInfo :
55+ """Information about a Python module."""
5256 title : str
5357 description : str
5458 classes : cocoindex .typing .Table [ClassInfo ]
5559 methods : cocoindex .typing .Table [MethodInfo ]
5660
61+ @dataclasses .dataclass
62+ class ModuleSummary :
63+ """Summary info about a Python module."""
64+ num_classes : int
65+ num_methods : int
5766
58- class CleanUpManual (cocoindex .op .FunctionSpec ):
59- """Clean up manual information."""
60-
61-
67+ @dataclasses .dataclass
68+ class SummarizeModule (cocoindex .op .FunctionSpec ):
69+ """Summarize a Python module."""
6270
6371@cocoindex .op .executor_class ()
64- class CleanUpManualExecutor :
65- """Executor for CleanUpManual ."""
72+ class SummarizeModuleExecutor :
73+ """Executor for SummarizeModule ."""
6674
67- spec : CleanUpManual
75+ spec : SummarizeModule
6876
69- def __call__ (self , module_info : ModuleInfo ) -> ModuleInfo | None :
70- # TODO: Clean up
71- return module_info
77+ def __call__ (self , module_info : ModuleInfo ) -> ModuleSummary :
78+ return ModuleSummary (
79+ num_classes = len (module_info .classes ),
80+ num_methods = len (module_info .methods ),
81+ )
7282
7383@cocoindex .flow_def (name = "ManualExtraction" )
7484def manual_extraction_flow (flow_builder : cocoindex .FlowBuilder , data_scope : cocoindex .DataScope ):
@@ -77,27 +87,31 @@ def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco
7787 """
7888 data_scope ["documents" ] = flow_builder .add_source (cocoindex .sources .LocalFile (path = "manuals" , binary = True ))
7989
80- manual_infos = data_scope .add_collector ()
90+ modules_index = data_scope .add_collector ()
8191
8292 with data_scope ["documents" ].row () as doc :
8393 doc ["markdown" ] = doc ["content" ].transform (PdfToMarkdown ())
84- doc ["raw_module_info " ] = doc ["markdown" ].transform (
94+ doc ["module_info " ] = doc ["markdown" ].transform (
8595 cocoindex .functions .ExtractByLlm (
86- llm_spec = cocoindex .llm . LlmSpec (
87- api_type = cocoindex .llm . LlmApiType .OLLAMA ,
96+ llm_spec = cocoindex .LlmSpec (
97+ api_type = cocoindex .LlmApiType .OLLAMA ,
8898 # See the full list of models: https://ollama.com/library
89- model = "llama3.2:latest "
99+ model = "llama3.2"
90100 ),
91101 # Replace by this spec below, to use OpenAI API model instead of ollama
92- # llm_spec=cocoindex.llm. LlmSpec(
93- # api_type=cocoindex.llm. LlmApiType.OPENAI, model="gpt-4o"),
94- output_type = cocoindex . typing . encode_enriched_type ( ModuleInfo ) ,
102+ # llm_spec=cocoindex.LlmSpec(
103+ # api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
104+ output_type = ModuleInfo ,
95105 instruction = "Please extract Python module information from the manual." ))
96- doc ["module_info" ] = doc ["raw_module_info" ].transform (CleanUpManual ())
97- manual_infos .collect (filename = doc ["filename" ], module_info = doc ["module_info" ])
98-
99- manual_infos .export (
100- "manual_infos" ,
106+ doc ["module_summary" ] = doc ["module_info" ].transform (SummarizeModule ())
107+ modules_index .collect (
108+ filename = doc ["filename" ],
109+ module_info = doc ["module_info" ],
110+ module_summary = doc ["module_summary" ],
111+ )
112+
113+ modules_index .export (
114+ "modules" ,
101115 cocoindex .storages .Postgres (),
102116 primary_key_fields = ["filename" ],
103117 )
0 commit comments