diff --git a/examples/manual_extraction/manual_extraction.py b/examples/manual_extraction/manual_extraction.py index 26ca28ba8..016fcc05d 100644 --- a/examples/manual_extraction/manual_extraction.py +++ b/examples/manual_extraction/manual_extraction.py @@ -32,43 +32,53 @@ def __call__(self, content: bytes) -> str: @dataclasses.dataclass class ArgInfo: + """Information about an argument of a method.""" name: str description: str @dataclasses.dataclass class MethodInfo: + """Information about a method.""" name: str args: cocoindex.typing.List[ArgInfo] description: str @dataclasses.dataclass class ClassInfo: + """Information about a class.""" name: str description: str methods: cocoindex.typing.List[MethodInfo] @dataclasses.dataclass class ModuleInfo: + """Information about a Python module.""" title: str description: str classes: cocoindex.typing.Table[ClassInfo] methods: cocoindex.typing.Table[MethodInfo] +@dataclasses.dataclass +class ModuleSummary: + """Summary info about a Python module.""" + num_classes: int + num_methods: int -class CleanUpManual(cocoindex.op.FunctionSpec): - """Clean up manual information.""" - - +@dataclasses.dataclass +class SummarizeModule(cocoindex.op.FunctionSpec): + """Summarize a Python module.""" @cocoindex.op.executor_class() -class CleanUpManualExecutor: - """Executor for CleanUpManual.""" +class SummarizeModuleExecutor: + """Executor for SummarizeModule.""" - spec: CleanUpManual + spec: SummarizeModule - def __call__(self, module_info: ModuleInfo) -> ModuleInfo | None: - # TODO: Clean up - return module_info + def __call__(self, module_info: ModuleInfo) -> ModuleSummary: + return ModuleSummary( + num_classes=len(module_info.classes), + num_methods=len(module_info.methods), + ) @cocoindex.flow_def(name="ManualExtraction") def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope): @@ -77,27 +87,31 @@ def manual_extraction_flow(flow_builder: cocoindex.FlowBuilder, data_scope: coco """ data_scope["documents"] = flow_builder.add_source(cocoindex.sources.LocalFile(path="manuals", binary=True)) - manual_infos = data_scope.add_collector() + modules_index = data_scope.add_collector() with data_scope["documents"].row() as doc: doc["markdown"] = doc["content"].transform(PdfToMarkdown()) - doc["raw_module_info"] = doc["markdown"].transform( + doc["module_info"] = doc["markdown"].transform( cocoindex.functions.ExtractByLlm( - llm_spec=cocoindex.llm.LlmSpec( - api_type=cocoindex.llm.LlmApiType.OLLAMA, + llm_spec=cocoindex.LlmSpec( + api_type=cocoindex.LlmApiType.OLLAMA, # See the full list of models: https://ollama.com/library - model="llama3.2:latest" + model="llama3.2" ), # Replace by this spec below, to use OpenAI API model instead of ollama - # llm_spec=cocoindex.llm.LlmSpec( - # api_type=cocoindex.llm.LlmApiType.OPENAI, model="gpt-4o"), - output_type=cocoindex.typing.encode_enriched_type(ModuleInfo), + # llm_spec=cocoindex.LlmSpec( + # api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"), + output_type=ModuleInfo, instruction="Please extract Python module information from the manual.")) - doc["module_info"] = doc["raw_module_info"].transform(CleanUpManual()) - manual_infos.collect(filename=doc["filename"], module_info=doc["module_info"]) - - manual_infos.export( - "manual_infos", + doc["module_summary"] = doc["module_info"].transform(SummarizeModule()) + modules_index.collect( + filename=doc["filename"], + module_info=doc["module_info"], + module_summary=doc["module_summary"], + ) + + modules_index.export( + "modules", cocoindex.storages.Postgres(), primary_key_fields=["filename"], ) diff --git a/examples/manual_extraction/pyproject.toml b/examples/manual_extraction/pyproject.toml index 79dff53ac..bf7b32221 100644 --- a/examples/manual_extraction/pyproject.toml +++ b/examples/manual_extraction/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "manual-extraction" version = "0.1.0" -description = "Simple example for cocoindex: extract manual information from a Markdown." +description = "Simple example for cocoindex: extract structured information from a Markdown file." requires-python = ">=3.10" dependencies = ["cocoindex>=0.1.6", "python-dotenv>=1.0.1", "marker-pdf>=1.5.2"] diff --git a/python/cocoindex/__init__.py b/python/cocoindex/__init__.py index 9cd86c8d9..4dcc18a7e 100644 --- a/python/cocoindex/__init__.py +++ b/python/cocoindex/__init__.py @@ -1,8 +1,9 @@ """ Cocoindex is a framework for building and running indexing pipelines. """ -from . import flow, functions, query, sources, storages, cli, llm +from . import flow, functions, query, sources, storages, cli from .flow import FlowBuilder, DataScope, DataSlice, Flow, flow_def +from .llm import LlmSpec, LlmApiType from .vector import VectorSimilarityMetric from .lib import * from ._engine import OpArgSchema diff --git a/python/cocoindex/flow.py b/python/cocoindex/flow.py index 7b917f287..2e3f0f5f2 100644 --- a/python/cocoindex/flow.py +++ b/python/cocoindex/flow.py @@ -6,7 +6,7 @@ import re import inspect -from typing import Any, Callable, Sequence, TypeVar +from typing import Any, Callable, Sequence, TypeVar, get_origin from threading import Lock from enum import Enum @@ -61,17 +61,19 @@ def _create_data_slice( def _spec_kind(spec: Any) -> str: return spec.__class__.__name__ -def _spec_value_dump(spec: Any) -> Any: +def _spec_value_dump(v: Any) -> Any: """Recursively dump a spec object and its nested attributes to a dictionary.""" - if isinstance(spec, Enum): - return spec.value - elif hasattr(spec, '__dict__'): - return {k: _spec_value_dump(v) for k, v in spec.__dict__.items()} - elif isinstance(spec, (list, tuple)): - return [_spec_value_dump(item) for item in spec] - elif isinstance(spec, dict): - return {k: _spec_value_dump(v) for k, v in spec.items()} - return spec + if isinstance(v, type) or get_origin(v) is not None: + return encode_enriched_type(v) + elif isinstance(v, Enum): + return v.value + elif hasattr(v, '__dict__'): + return {k: _spec_value_dump(v) for k, v in v.__dict__.items()} + elif isinstance(v, (list, tuple)): + return [_spec_value_dump(item) for item in v] + elif isinstance(v, dict): + return {k: _spec_value_dump(v) for k, v in v.items()} + return v T = TypeVar('T') diff --git a/python/cocoindex/functions.py b/python/cocoindex/functions.py index d97e7ab14..b0753893a 100644 --- a/python/cocoindex/functions.py +++ b/python/cocoindex/functions.py @@ -1,5 +1,4 @@ """All builtin functions.""" -from dataclasses import dataclass from typing import Annotated, Any import sentence_transformers @@ -16,8 +15,7 @@ class ExtractByLlm(op.FunctionSpec): """Extract information from a text using a LLM.""" llm_spec: llm.LlmSpec - # Expected to be generated by `cocoindex.typing.encode_enriched_type()` - output_type: dict[str, Any] + output_type: type instruction: str | None = None class SentenceTransformerEmbed(op.FunctionSpec):