cocoindex-io · georgeh0 · Oct 13, 2025 · Oct 9, 2025 · Oct 9, 2025 · Oct 11, 2025
diff --git a/examples/bedrock_llm_extraction/.env.example b/examples/bedrock_llm_extraction/.env.example
@@ -0,0 +1,10 @@
+# Postgres database address for cocoindex
+COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex
+
+# Fallback to CPU for operations not supported by MPS on Mac.
+# It's a no-op on other platforms.
+PYTORCH_ENABLE_MPS_FALLBACK=1
+
+# AWS Bedrock credentials
+BEDROCK_API_KEY=your_bedrock_api_key
+BEDROCK_REGION=your_bedrock_region
diff --git a/examples/bedrock_llm_extraction/.gitignore b/examples/bedrock_llm_extraction/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/examples/bedrock_llm_extraction/README.md b/examples/bedrock_llm_extraction/README.md
@@ -0,0 +1,72 @@
+# Structured Data Extraction from PDF with AWS Bedrock and CocoIndex
+
+In this example, we
+
+*   Convert PDFs (generated from a few Python docs) into Markdown.
+*   Extract structured information from the Markdown using an AWS Bedrock LLM.
+*   Use a custom function to further extract information from the structured output.
+
+Please give [CocoIndex on GitHub](https://github.com/cocoindex-io/cocoindex) a star to support us if you like our work. Thank you so much with a warm coconut hug 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
+
+## Prerequisite
+
+Before running the example, you need to:
+
+*   [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
+*   Configure your AWS Bedrock credentials. This example uses AWS Bedrock as the LLM provider; follow [this guide](https://docs.aws.amazon.com/bedrock/latest/userguide/api-keys.html) to create an API key. Alternatively, you can follow the comments in the source code (`main.py`) to switch to another LLM.
+
+First, copy the example environment file:
+
+```bash
+cp .env.example .env
+```
+
+Then, open the `.env` file and fill in your AWS Bedrock credentials. The `.env` file is ignored by git, so your secrets will not be committed.
+
+## Run
+
+
+### Build the index
+
+Install dependencies:
+
+```bash
+pip install -e .
+```
+
+Setup:
+
+```bash
+cocoindex setup main.py
+```
+
+Update index:
+
+```bash
+cocoindex update main.py
+```
+
+### Query the index
+
+After the index is built, you have a table named `modules_info`. You can query it at any time, e.g. by starting a Postgres shell:
+
+```bash
+psql postgres://cocoindex:cocoindex@localhost/cocoindex
+```
+
+And run the SQL query:
+
+```sql
+SELECT filename, module_info->'title' AS title, module_summary FROM modules_info;
+```
+You should see results like:
+
+```
+      filename       |         title          |      module_summary
+---------------------+------------------------+--------------------------
+ manuals/asyncio.pdf | "asyncio — Asynchronous" | {"num_classes": 0, "num_methods": 0}
+ manuals/json.pdf    | "json — JSON encoder"  | {"num_classes": 0, "num_methods": 0}
+(2 rows)
+```
+
+The output may vary depending on the model you are using. The important part is that the `module_info` and `module_summary` columns are populated with the extracted data.
diff --git a/examples/bedrock_llm_extraction/main.py b/examples/bedrock_llm_extraction/main.py
@@ -0,0 +1,138 @@
+import tempfile
+import dataclasses
+
+from marker.converters.pdf import PdfConverter
+from marker.models import create_model_dict
+from marker.output import text_from_rendered
+from marker.config.parser import ConfigParser
+
+import cocoindex
+
+
+class PdfToMarkdown(cocoindex.op.FunctionSpec):
+    """Spec for the function that converts PDF content (bytes) to Markdown text."""
+
+
+@cocoindex.op.executor_class(gpu=True, cache=True, behavior_version=1)
+class PdfToMarkdownExecutor:
+    """Executor for PdfToMarkdown: renders PDF bytes to Markdown text using marker."""
+
+    spec: PdfToMarkdown
+    _converter: PdfConverter  # created in prepare(), used by __call__
+
+    def prepare(self):
+        config_parser = ConfigParser({})  # empty dict: no explicit config options are passed
+        self._converter = PdfConverter(
+            create_model_dict(), config=config_parser.generate_config_dict()
+        )
+
+    def __call__(self, content: bytes) -> str:
+        with tempfile.NamedTemporaryFile(delete=True, suffix=".pdf") as temp_file:  # NOTE(review): file is handed over by name while still open; per the tempfile docs that does not work on Windows
+            temp_file.write(content)
+            temp_file.flush()  # write out buffered bytes before the converter is given the file's path
+            text, _, _ = text_from_rendered(self._converter(temp_file.name))
+            return text
+
+
+@dataclasses.dataclass
+class ArgInfo:
+    """Name and description of one argument of a method (an entry of MethodInfo.args)."""
+
+    name: str
+    description: str
+
+
+@dataclasses.dataclass
+class MethodInfo:
+    """Name, arguments and description of one method (used by ClassInfo and ModuleInfo)."""
+
+    name: str
+    args: list[ArgInfo]
+    description: str
+
+
+@dataclasses.dataclass
+class ClassInfo:
+    """Name, description and methods of one class (an entry of ModuleInfo.classes)."""
+
+    name: str
+    description: str
+    methods: list[MethodInfo]
+
+
+@dataclasses.dataclass
+class ModuleInfo:
+    """Information about a Python module; used as the `output_type` of the ExtractByLlm step."""
+
+    title: str
+    description: str
+    classes: list[ClassInfo]
+    methods: list[MethodInfo]  # NOTE(review): presumably module-level functions, separate from ClassInfo.methods — confirm
+
+
+@dataclasses.dataclass
+class ModuleSummary:
+    """Summary counts for a Python module, as returned by summarize_module."""
+
+    num_classes: int  # len(ModuleInfo.classes)
+    num_methods: int  # len(ModuleInfo.methods); methods nested in ClassInfo are not counted
+
+
+@cocoindex.op.function()
+def summarize_module(module_info: ModuleInfo) -> ModuleSummary:
+    """Return a ModuleSummary with the lengths of `module_info.classes` and `module_info.methods`."""
+    return ModuleSummary(
+        num_classes=len(module_info.classes),
+        num_methods=len(module_info.methods),
+    )
+
+
+@cocoindex.flow_def(name="ManualExtraction")
+def manual_extraction_flow(
+    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
+):
+    """
+    Define an example flow that extracts Python module information from PDF manuals.
+    """
+    data_scope["documents"] = flow_builder.add_source(
+        cocoindex.sources.LocalFile(path="manuals", binary=True)
+    )
+
+    modules_index = data_scope.add_collector()
+
+    with data_scope["documents"].row() as doc:
+        doc["markdown"] = doc["content"].transform(PdfToMarkdown())
+        doc["module_info"] = doc["markdown"].transform(
+            cocoindex.functions.ExtractByLlm(
+                llm_spec=cocoindex.LlmSpec(
+                    api_type=cocoindex.LlmApiType.BEDROCK,
+                    model="anthropic.claude-3-haiku-20240307-v1:0",
+                ),
+                # Replace by this spec below, to use OpenAI API model instead of Bedrock
+                #   llm_spec=cocoindex.LlmSpec(
+                #       api_type=cocoindex.LlmApiType.OPENAI, model="gpt-4o"),
+                # Replace by this spec below, to use Gemini API model
+                #   llm_spec=cocoindex.LlmSpec(
+                #       api_type=cocoindex.LlmApiType.GEMINI, model="gemini-2.0-flash"),
+                # Replace by this spec below, to use Anthropic API model
+                #   llm_spec=cocoindex.LlmSpec(
+                #       api_type=cocoindex.LlmApiType.ANTHROPIC, model="claude-3-5-sonnet-latest"),
+                # Replace by this spec below, to use Ollama API model
+                #   llm_spec=cocoindex.LlmSpec(
+                #       api_type=cocoindex.LlmApiType.OLLAMA, model="llama3.2"),
+                output_type=ModuleInfo,
+                instruction="Please extract Python module information from the manual.",
+            )
+        )
+        doc["module_summary"] = doc["module_info"].transform(summarize_module)
+        modules_index.collect(
+            filename=doc["filename"],
+            module_info=doc["module_info"],
+            module_summary=doc["module_summary"],
+        )
+
+    modules_index.export(
+        "modules",
+        cocoindex.targets.Postgres(table_name="modules_info"),
+        primary_key_fields=["filename"],
+    )
diff --git a/examples/bedrock_llm_extraction/manuals/array.pdf b/examples/bedrock_llm_extraction/manuals/array.pdf
diff --git a/examples/bedrock_llm_extraction/manuals/base64.pdf b/examples/bedrock_llm_extraction/manuals/base64.pdf
diff --git a/examples/bedrock_llm_extraction/manuals/copy.pdf b/examples/bedrock_llm_extraction/manuals/copy.pdf
diff --git a/examples/bedrock_llm_extraction/manuals/glob.pdf b/examples/bedrock_llm_extraction/manuals/glob.pdf
diff --git a/examples/bedrock_llm_extraction/pyproject.toml b/examples/bedrock_llm_extraction/pyproject.toml
@@ -0,0 +1,9 @@
+[project]
+name = "manuals-llm-extraction"
+version = "0.1.0"
+description = "Simple example for cocoindex: extract structured information from a Markdown file using LLM."
+requires-python = ">=3.11"
+dependencies = ["cocoindex>=0.2.8", "marker-pdf>=1.8.5"]
+
+[tool.setuptools]
+packages = []
diff --git a/python/cocoindex/llm.py b/python/cocoindex/llm.py
@@ -14,6 +14,7 @@ class LlmApiType(Enum):
     OPEN_ROUTER = "OpenRouter"
     VOYAGE = "Voyage"
     VLLM = "Vllm"
+    BEDROCK = "Bedrock"
 
 
 @dataclass

diff --git a/python/cocoindex/tests/test_engine_value.py b/python/cocoindex/tests/test_engine_value.py
@@ -1064,6 +1064,25 @@ def test_full_roundtrip_vector_numeric_types() -> None:
         validate_full_roundtrip(value_u64, Vector[np.uint64, Literal[3]])
 
 
+def test_llm_api_type_bedrock() -> None:
+    """Test that LlmApiType.BEDROCK exists and can be used to construct an LlmSpec."""
+    from cocoindex.llm import LlmApiType, LlmSpec
+
+    # The enum member exists and carries the expected string value
+    assert hasattr(LlmApiType, "BEDROCK")
+    assert LlmApiType.BEDROCK.value == "Bedrock"
+
+    # An LlmSpec built with only api_type and model keeps both and leaves address/api_config as None
+    spec = LlmSpec(
+        api_type=LlmApiType.BEDROCK, model="us.anthropic.claude-3-5-haiku-20241022-v1:0"
+    )
+
+    assert spec.api_type == LlmApiType.BEDROCK
+    assert spec.model == "us.anthropic.claude-3-5-haiku-20241022-v1:0"
+    assert spec.address is None
+    assert spec.api_config is None
+
+
 def test_full_roundtrip_vector_of_vector() -> None:
     """Test full roundtrip for vector of vector."""
     value_f32 = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype=np.float32)