|
| 1 | +""" |
| 2 | +This example shows how to extract relationships from Markdown documents and build a knowledge graph. |
| 3 | +""" |
| 4 | + |
| 5 | +from dataclasses import dataclass |
| 6 | +import datetime |
| 7 | +import cocoindex |
| 8 | +import os |
| 9 | + |
| 10 | +conn_spec = cocoindex.add_auth_entry( |
| 11 | + "Neo4jConnection", |
| 12 | + cocoindex.targets.Neo4jConnection( |
| 13 | + uri="bolt://localhost:7687", |
| 14 | + user="neo4j", |
| 15 | + password="cocoindex", |
| 16 | + ), |
| 17 | +) |
| 18 | + |
| 19 | + |
| 20 | +@dataclass |
| 21 | +class Person: |
| 22 | + name: str |
| 23 | + |
| 24 | + |
| 25 | +@dataclass |
| 26 | +class Task: |
| 27 | + description: str |
| 28 | + assigned_to: list[Person] |
| 29 | + |
| 30 | + |
| 31 | +@dataclass |
| 32 | +class Meeting: |
| 33 | + time: datetime.date |
| 34 | + note: str |
| 35 | + organizer: Person |
| 36 | + participants: list[Person] |
| 37 | + tasks: list[Task] |
| 38 | + |
| 39 | + |
| 40 | +@cocoindex.flow_def(name="MeetingNotesGraph") |
| 41 | +def meeting_notes_graph_flow( |
| 42 | + flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope |
| 43 | +) -> None: |
| 44 | + """ |
| 45 | + Define an example flow that extracts triples from files and build knowledge graph. |
| 46 | + """ |
| 47 | + credential_path = os.environ["GOOGLE_SERVICE_ACCOUNT_CREDENTIAL"] |
| 48 | + root_folder_ids = os.environ["GOOGLE_DRIVE_ROOT_FOLDER_IDS"].split(",") |
| 49 | + |
| 50 | + data_scope["documents"] = flow_builder.add_source( |
| 51 | + cocoindex.sources.GoogleDrive( |
| 52 | + service_account_credential_path=credential_path, |
| 53 | + root_folder_ids=root_folder_ids, |
| 54 | + recent_changes_poll_interval=datetime.timedelta(seconds=10), |
| 55 | + ), |
| 56 | + refresh_interval=datetime.timedelta(minutes=1), |
| 57 | + ) |
| 58 | + |
| 59 | + meeting_nodes = data_scope.add_collector() |
| 60 | + attended_rels = data_scope.add_collector() |
| 61 | + decided_tasks_rels = data_scope.add_collector() |
| 62 | + assigned_rels = data_scope.add_collector() |
| 63 | + |
| 64 | + with data_scope["documents"].row() as document: |
| 65 | + document["meetings"] = document["content"].transform( |
| 66 | + cocoindex.functions.SplitBySeparators( |
| 67 | + separators_regex=[r"\n\n##?\ "], keep_separator="RIGHT" |
| 68 | + ) |
| 69 | + ) |
| 70 | + with document["meetings"].row() as meeting: |
| 71 | + parsed = meeting["parsed"] = meeting["text"].transform( |
| 72 | + cocoindex.functions.ExtractByLlm( |
| 73 | + llm_spec=cocoindex.LlmSpec( |
| 74 | + api_type=cocoindex.LlmApiType.OPENAI, model="gpt-5" |
| 75 | + ), |
| 76 | + output_type=Meeting, |
| 77 | + ) |
| 78 | + ) |
| 79 | + meeting_key = {"note_file": document["filename"], "time": parsed["time"]} |
| 80 | + meeting_nodes.collect(**meeting_key, note=parsed["note"]) |
| 81 | + |
| 82 | + attended_rels.collect( |
| 83 | + id=cocoindex.GeneratedField.UUID, |
| 84 | + **meeting_key, |
| 85 | + person=parsed["organizer"]["name"], |
| 86 | + is_organizer=True, |
| 87 | + ) |
| 88 | + with parsed["participants"].row() as participant: |
| 89 | + attended_rels.collect( |
| 90 | + id=cocoindex.GeneratedField.UUID, |
| 91 | + **meeting_key, |
| 92 | + person=participant["name"], |
| 93 | + ) |
| 94 | + |
| 95 | + with parsed["tasks"].row() as task: |
| 96 | + decided_tasks_rels.collect( |
| 97 | + id=cocoindex.GeneratedField.UUID, |
| 98 | + **meeting_key, |
| 99 | + description=task["description"], |
| 100 | + ) |
| 101 | + with task["assigned_to"].row() as assigned_to: |
| 102 | + assigned_rels.collect( |
| 103 | + id=cocoindex.GeneratedField.UUID, |
| 104 | + **meeting_key, |
| 105 | + task=task["description"], |
| 106 | + person=assigned_to["name"], |
| 107 | + ) |
| 108 | + |
| 109 | + meeting_nodes.export( |
| 110 | + "meeting_nodes", |
| 111 | + cocoindex.targets.Neo4j( |
| 112 | + connection=conn_spec, mapping=cocoindex.targets.Nodes(label="Meeting") |
| 113 | + ), |
| 114 | + primary_key_fields=["note_file", "time"], |
| 115 | + ) |
| 116 | + flow_builder.declare( |
| 117 | + cocoindex.targets.Neo4jDeclaration( |
| 118 | + connection=conn_spec, |
| 119 | + nodes_label="Person", |
| 120 | + primary_key_fields=["name"], |
| 121 | + ) |
| 122 | + ) |
| 123 | + flow_builder.declare( |
| 124 | + cocoindex.targets.Neo4jDeclaration( |
| 125 | + connection=conn_spec, |
| 126 | + nodes_label="Task", |
| 127 | + primary_key_fields=["description"], |
| 128 | + ) |
| 129 | + ) |
| 130 | + attended_rels.export( |
| 131 | + "attended_rels", |
| 132 | + cocoindex.targets.Neo4j( |
| 133 | + connection=conn_spec, |
| 134 | + mapping=cocoindex.targets.Relationships( |
| 135 | + rel_type="ATTENDED", |
| 136 | + source=cocoindex.targets.NodeFromFields( |
| 137 | + label="Person", |
| 138 | + fields=[ |
| 139 | + cocoindex.targets.TargetFieldMapping( |
| 140 | + source="person", target="name" |
| 141 | + ) |
| 142 | + ], |
| 143 | + ), |
| 144 | + target=cocoindex.targets.NodeFromFields( |
| 145 | + label="Meeting", |
| 146 | + fields=[ |
| 147 | + cocoindex.targets.TargetFieldMapping("note_file"), |
| 148 | + cocoindex.targets.TargetFieldMapping("time"), |
| 149 | + ], |
| 150 | + ), |
| 151 | + ), |
| 152 | + ), |
| 153 | + primary_key_fields=["id"], |
| 154 | + ) |
| 155 | + decided_tasks_rels.export( |
| 156 | + "decided_tasks_rels", |
| 157 | + cocoindex.targets.Neo4j( |
| 158 | + connection=conn_spec, |
| 159 | + mapping=cocoindex.targets.Relationships( |
| 160 | + rel_type="DECIDED", |
| 161 | + source=cocoindex.targets.NodeFromFields( |
| 162 | + label="Meeting", |
| 163 | + fields=[ |
| 164 | + cocoindex.targets.TargetFieldMapping("note_file"), |
| 165 | + cocoindex.targets.TargetFieldMapping("time"), |
| 166 | + ], |
| 167 | + ), |
| 168 | + target=cocoindex.targets.NodeFromFields( |
| 169 | + label="Task", |
| 170 | + fields=[ |
| 171 | + cocoindex.targets.TargetFieldMapping("description"), |
| 172 | + ], |
| 173 | + ), |
| 174 | + ), |
| 175 | + ), |
| 176 | + primary_key_fields=["id"], |
| 177 | + ) |
| 178 | + assigned_rels.export( |
| 179 | + "assigned_rels", |
| 180 | + cocoindex.targets.Neo4j( |
| 181 | + connection=conn_spec, |
| 182 | + mapping=cocoindex.targets.Relationships( |
| 183 | + rel_type="ASSIGNED_TO", |
| 184 | + source=cocoindex.targets.NodeFromFields( |
| 185 | + label="Person", |
| 186 | + fields=[ |
| 187 | + cocoindex.targets.TargetFieldMapping( |
| 188 | + source="person", target="name" |
| 189 | + ), |
| 190 | + ], |
| 191 | + ), |
| 192 | + target=cocoindex.targets.NodeFromFields( |
| 193 | + label="Task", |
| 194 | + fields=[ |
| 195 | + cocoindex.targets.TargetFieldMapping( |
| 196 | + source="task", target="description" |
| 197 | + ), |
| 198 | + ], |
| 199 | + ), |
| 200 | + ), |
| 201 | + ), |
| 202 | + primary_key_fields=["id"], |
| 203 | + ) |
0 commit comments