From edbe29f6c52a679da5ea454d35de7bc843d2e089 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 17 Nov 2025 15:49:43 -0800 Subject: [PATCH 1/2] example(meeting-notes-kg): add example for meeting notes knowledge graph --- README.md | 1 + examples/README.md | 1 + examples/meeting_notes_graph/.env.example | 14 ++ examples/meeting_notes_graph/.gitignore | 1 + examples/meeting_notes_graph/README.md | 107 ++++++++++ examples/meeting_notes_graph/main.py | 205 ++++++++++++++++++++ examples/meeting_notes_graph/pyproject.toml | 9 + 7 files changed, 338 insertions(+) create mode 100644 examples/meeting_notes_graph/.env.example create mode 100644 examples/meeting_notes_graph/.gitignore create mode 100644 examples/meeting_notes_graph/README.md create mode 100644 examples/meeting_notes_graph/main.py create mode 100644 examples/meeting_notes_graph/pyproject.toml diff --git a/README.md b/README.md index 206c60704..19b15f059 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,7 @@ It defines an index flow like this: | [Amazon S3 Embedding](examples/amazon_s3_embedding) | Index text documents from Amazon S3 | | [Azure Blob Storage Embedding](examples/azure_blob_embedding) | Index text documents from Azure Blob Storage | | [Google Drive Text Embedding](examples/gdrive_text_embedding) | Index text documents from Google Drive | +| [Meeting Notes to Knowledge Graph](examples/meeting_notes_graph) | Extract structured meeting info from Google Drive and build a knowledge graph | | [Docs to Knowledge Graph](examples/docs_to_knowledge_graph) | Extract relationships from Markdown documents and build a knowledge graph | | [Embeddings to Qdrant](examples/text_embedding_qdrant) | Index documents in a Qdrant collection for semantic search | | [Embeddings to LanceDB](examples/text_embedding_lancedb) | Index documents in a LanceDB collection for semantic search | diff --git a/examples/README.md b/examples/README.md index 6163f67e7..0bb05d8d2 100644 --- a/examples/README.md +++ 
b/examples/README.md @@ -32,6 +32,7 @@ Check out our [examples documentation](https://cocoindex.io/docs/examples) for m - 🏥 [**patient_intake_extraction_baml**](./patient_intake_extraction_baml) - Extract structured data from patient intake PDFs using BAML - 📖 [**manuals_llm_extraction**](./manuals_llm_extraction) - Extract structured information from PDF manuals using Ollama - 📄 [**paper_metadata**](./paper_metadata) - Extract metadata (title, authors, abstract) from research papers in PDF +- 📝 [**meeting_notes_graph**](./meeting_notes_graph) - Extract structured meeting info from Google Drive and build a knowledge graph ## Custom Sources & Targets diff --git a/examples/meeting_notes_graph/.env.example b/examples/meeting_notes_graph/.env.example new file mode 100644 index 000000000..5717c14e5 --- /dev/null +++ b/examples/meeting_notes_graph/.env.example @@ -0,0 +1,14 @@ +# Postgres database address for cocoindex +COCOINDEX_DATABASE_URL=postgres://cocoindex:cocoindex@localhost/cocoindex + +# OpenAI API key. +#! PLEASE FILL IN +OPENAI_API_KEY= + +# Google Drive service account credential path. +#! PLEASE FILL IN +GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/path/to/service_account_credential.json + +# Google Drive root folder IDs, comma separated. +#! PLEASE FILL IN +GOOGLE_DRIVE_ROOT_FOLDER_IDS=id1,id2 diff --git a/examples/meeting_notes_graph/.gitignore b/examples/meeting_notes_graph/.gitignore new file mode 100644 index 000000000..4c49bd78f --- /dev/null +++ b/examples/meeting_notes_graph/.gitignore @@ -0,0 +1 @@ +.env diff --git a/examples/meeting_notes_graph/README.md b/examples/meeting_notes_graph/README.md new file mode 100644 index 000000000..07b985eb6 --- /dev/null +++ b/examples/meeting_notes_graph/README.md @@ -0,0 +1,107 @@ +# Build Meeting Notes Knowledge Graph from Google Drive + +We will extract structured information from meeting notes stored in Google Drive and build a knowledge graph in Neo4j. 
The flow ingests Markdown notes, splits them by headings into meetings, uses an LLM to parse participants, organizer, time, and tasks, and then writes nodes and relationships into a graph database. + +Please drop [CocoIndex on Github](https://github.com/cocoindex-io/cocoindex) a star to support us and stay tuned for more updates. Thank you so much 🥥🤗. [![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex) + +## What this builds + +The pipeline defines: + +- Meeting nodes: one per meeting section, keyed by source note file and meeting time +- Person nodes: people who organized or attended meetings +- Task nodes: tasks decided in meetings +- Relationships: + - `ATTENDED` Person → Meeting (organizer included, marked in flow when collected) + - `DECIDED` Meeting → Task + - `ASSIGNED_TO` Person → Task + +The source is Google Drive folders shared with a service account. The flow watches for recent changes and keeps the graph up to date. + +## How it works + +1. Ingest files from Google Drive (service account + root folder IDs) +2. Split each note by Markdown headings into meeting sections +3. Use an LLM to extract a structured `Meeting` object: time, note, organizer, participants, and tasks (with assignees) +4. Collect nodes and relationships in-memory +5. 
Export to Neo4j: + - Nodes: `Meeting` (explicit export), `Person` and `Task` (declared with primary keys) + - Relationships: `ATTENDED`, `DECIDED`, `ASSIGNED_TO` + +## Prerequisite + +- Install [Neo4j](https://cocoindex.io/docs/targets/neo4j) and start it locally + - Default local browser: [http://localhost:7474](http://localhost:7474) + - Default credentials used in this example: username `neo4j`, password `cocoindex` +- [Configure your OpenAI API key](https://cocoindex.io/docs/ai/llm#openai) +- Prepare Google Drive: + - Create a Google Cloud service account and download its JSON credential + - Share the source folders with the service account email + - Collect the root folder IDs you want to ingest + - See [Setup for Google Drive](https://cocoindex.io/docs/sources/googledrive#setup-for-google-drive) for details + +## Environment + +Set the following environment variables: + +```sh +export OPENAI_API_KEY=sk-... +export GOOGLE_SERVICE_ACCOUNT_CREDENTIAL=/absolute/path/to/service_account.json +export GOOGLE_DRIVE_ROOT_FOLDER_IDS=folderId1,folderId2 +``` + +Notes: + +- `GOOGLE_DRIVE_ROOT_FOLDER_IDS` accepts a comma-separated list of folder IDs +- The flow polls recent changes and refreshes periodically + +## Run + +### Build/update the graph + +Install dependencies: + +```bash +pip install -e . +``` + +Update the index (run the flow once to build/update the graph): + +```bash +cocoindex update main +``` + +### Browse the knowledge graph + +Open Neo4j Browser at [http://localhost:7474](http://localhost:7474). + +Sample Cypher queries: + +```cypher +// All relationships +MATCH p=()-->() RETURN p + +// Who attended which meetings (including organizer) +MATCH (p:Person)-[:ATTENDED]->(m:Meeting) +RETURN p, m + +// Tasks decided in meetings +MATCH (m:Meeting)-[:DECIDED]->(t:Task) +RETURN m, t + +// Task assignments +MATCH (p:Person)-[:ASSIGNED_TO]->(t:Task) +RETURN p, t +``` + +## CocoInsight + +I used CocoInsight (Free beta now) to troubleshoot the index generation and understand the data lineage of the pipeline.
It just connects to your local CocoIndex server, with zero pipeline data retention. + +Start CocoInsight: + +```bash +cocoindex server -ci main +``` + +Then open the UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight). diff --git a/examples/meeting_notes_graph/main.py b/examples/meeting_notes_graph/main.py new file mode 100644 index 000000000..c52bd5c63 --- /dev/null +++ b/examples/meeting_notes_graph/main.py @@ -0,0 +1,205 @@ +""" +This example shows how to extract structured meeting information from Markdown meeting notes in Google Drive and build a knowledge graph. +""" + +from dataclasses import dataclass +import datetime +import cocoindex +import os + +conn_spec = cocoindex.add_auth_entry( + "Neo4jConnection", + cocoindex.targets.Neo4jConnection( + uri="bolt://localhost:7687", + user="neo4j", + password="cocoindex", + ), +) + +GraphDbSpec = cocoindex.targets.Neo4j + + +@dataclass +class Person: + name: str + + +@dataclass +class Task: + description: str + assigned_to: list[Person] + + +@dataclass +class Meeting: + time: datetime.date + note: str + organizer: Person + participants: list[Person] + tasks: list[Task] + + +@cocoindex.flow_def(name="MeetingNotesGraph") +def meeting_notes_graph_flow( + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope +) -> None: + """ + Define an example flow that extracts structured meeting information from meeting notes and builds a knowledge graph.
+ """ + credential_path = os.environ["GOOGLE_SERVICE_ACCOUNT_CREDENTIAL"] + root_folder_ids = os.environ["GOOGLE_DRIVE_ROOT_FOLDER_IDS"].split(",") + + data_scope["documents"] = flow_builder.add_source( + cocoindex.sources.GoogleDrive( + service_account_credential_path=credential_path, + root_folder_ids=root_folder_ids, + recent_changes_poll_interval=datetime.timedelta(seconds=10), + ), + refresh_interval=datetime.timedelta(minutes=1), + ) + + meeting_nodes = data_scope.add_collector() + attended_rels = data_scope.add_collector() + decided_tasks_rels = data_scope.add_collector() + assigned_rels = data_scope.add_collector() + + with data_scope["documents"].row() as document: + document["meetings"] = document["content"].transform( + cocoindex.functions.SplitBySeparators( + separators_regex=[r"\n\n##?\ "], keep_separator="RIGHT" + ) + ) + with document["meetings"].row() as meeting: + parsed = meeting["parsed"] = meeting["text"].transform( + cocoindex.functions.ExtractByLlm( + llm_spec=cocoindex.LlmSpec( + api_type=cocoindex.LlmApiType.OPENAI, model="gpt-5" + ), + output_type=Meeting, + ) + ) + meeting_key = {"note_file": document["filename"], "time": parsed["time"]} + meeting_nodes.collect(**meeting_key, note=parsed["note"]) + + attended_rels.collect( + id=cocoindex.GeneratedField.UUID, + **meeting_key, + person=parsed["organizer"]["name"], + is_organizer=True, + ) + with parsed["participants"].row() as participant: + attended_rels.collect( + id=cocoindex.GeneratedField.UUID, + **meeting_key, + person=participant["name"], + ) + + with parsed["tasks"].row() as task: + decided_tasks_rels.collect( + id=cocoindex.GeneratedField.UUID, + **meeting_key, + description=task["description"], + ) + with task["assigned_to"].row() as assigned_to: + assigned_rels.collect( + id=cocoindex.GeneratedField.UUID, + **meeting_key, + task=task["description"], + person=assigned_to["name"], + ) + + meeting_nodes.export( + "meeting_nodes", + GraphDbSpec( + connection=conn_spec, 
mapping=cocoindex.targets.Nodes(label="Meeting") + ), + primary_key_fields=["note_file", "time"], + ) + flow_builder.declare( + cocoindex.targets.Neo4jDeclaration( + connection=conn_spec, + nodes_label="Person", + primary_key_fields=["name"], + ) + ) + flow_builder.declare( + cocoindex.targets.Neo4jDeclaration( + connection=conn_spec, + nodes_label="Task", + primary_key_fields=["description"], + ) + ) + attended_rels.export( + "attended_rels", + GraphDbSpec( + connection=conn_spec, + mapping=cocoindex.targets.Relationships( + rel_type="ATTENDED", + source=cocoindex.targets.NodeFromFields( + label="Person", + fields=[ + cocoindex.targets.TargetFieldMapping( + source="person", target="name" + ) + ], + ), + target=cocoindex.targets.NodeFromFields( + label="Meeting", + fields=[ + cocoindex.targets.TargetFieldMapping("note_file"), + cocoindex.targets.TargetFieldMapping("time"), + ], + ), + ), + ), + primary_key_fields=["id"], + ) + decided_tasks_rels.export( + "decided_tasks_rels", + GraphDbSpec( + connection=conn_spec, + mapping=cocoindex.targets.Relationships( + rel_type="DECIDED", + source=cocoindex.targets.NodeFromFields( + label="Meeting", + fields=[ + cocoindex.targets.TargetFieldMapping("note_file"), + cocoindex.targets.TargetFieldMapping("time"), + ], + ), + target=cocoindex.targets.NodeFromFields( + label="Task", + fields=[ + cocoindex.targets.TargetFieldMapping("description"), + ], + ), + ), + ), + primary_key_fields=["id"], + ) + assigned_rels.export( + "assigned_rels", + GraphDbSpec( + connection=conn_spec, + mapping=cocoindex.targets.Relationships( + rel_type="ASSIGNED_TO", + source=cocoindex.targets.NodeFromFields( + label="Person", + fields=[ + cocoindex.targets.TargetFieldMapping( + source="person", target="name" + ), + ], + ), + target=cocoindex.targets.NodeFromFields( + label="Task", + fields=[ + cocoindex.targets.TargetFieldMapping( + source="task", target="description" + ), + ], + ), + ), + ), + primary_key_fields=["id"], + ) diff --git 
a/examples/meeting_notes_graph/pyproject.toml b/examples/meeting_notes_graph/pyproject.toml new file mode 100644 index 000000000..7b7e8ed70 --- /dev/null +++ b/examples/meeting_notes_graph/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "cocoindex-meeting-notes-graph" +version = "0.1.0" +description = "Simple example for CocoIndex: extract structured information from meeting notes and build knowledge graph." +requires-python = ">=3.11" +dependencies = ["cocoindex>=0.2.8", "jinja2>=3.1.6"] + +[tool.setuptools] +packages = [] From b369a5dda5ac18e3f1c0e454f8e18d046d788ae3 Mon Sep 17 00:00:00 2001 From: Jiangzhou He Date: Mon, 17 Nov 2025 15:52:40 -0800 Subject: [PATCH 2/2] minor cleanups --- examples/meeting_notes_graph/main.py | 10 ++++------ examples/meeting_notes_graph/pyproject.toml | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/examples/meeting_notes_graph/main.py b/examples/meeting_notes_graph/main.py index c52bd5c63..3ab29bdd1 100644 --- a/examples/meeting_notes_graph/main.py +++ b/examples/meeting_notes_graph/main.py @@ -16,8 +16,6 @@ ), ) -GraphDbSpec = cocoindex.targets.Neo4j - @dataclass class Person: @@ -110,7 +108,7 @@ def meeting_notes_graph_flow( meeting_nodes.export( "meeting_nodes", - GraphDbSpec( + cocoindex.targets.Neo4j( connection=conn_spec, mapping=cocoindex.targets.Nodes(label="Meeting") ), primary_key_fields=["note_file", "time"], @@ -131,7 +129,7 @@ def meeting_notes_graph_flow( ) attended_rels.export( "attended_rels", - GraphDbSpec( + cocoindex.targets.Neo4j( connection=conn_spec, mapping=cocoindex.targets.Relationships( rel_type="ATTENDED", @@ -156,7 +154,7 @@ def meeting_notes_graph_flow( ) decided_tasks_rels.export( "decided_tasks_rels", - GraphDbSpec( + cocoindex.targets.Neo4j( connection=conn_spec, mapping=cocoindex.targets.Relationships( rel_type="DECIDED", @@ -179,7 +177,7 @@ def meeting_notes_graph_flow( ) assigned_rels.export( "assigned_rels", - GraphDbSpec( + cocoindex.targets.Neo4j( connection=conn_spec,
mapping=cocoindex.targets.Relationships( rel_type="ASSIGNED_TO", diff --git a/examples/meeting_notes_graph/pyproject.toml b/examples/meeting_notes_graph/pyproject.toml index 7b7e8ed70..0ce0b945e 100644 --- a/examples/meeting_notes_graph/pyproject.toml +++ b/examples/meeting_notes_graph/pyproject.toml @@ -3,7 +3,7 @@ name = "cocoindex-meeting-notes-graph" version = "0.1.0" description = "Simple example for CocoIndex: extract structured information from meeting notes and build knowledge graph." requires-python = ">=3.11" -dependencies = ["cocoindex>=0.2.8", "jinja2>=3.1.6"] +dependencies = ["cocoindex>=0.3.8"] [tool.setuptools] packages = []