diff --git a/content/TinyGraphRAG/example/data.md b/content/TinyGraphRAG/example/data.md
new file mode 100644
index 0000000..a90d9fd
--- /dev/null
+++ b/content/TinyGraphRAG/example/data.md
@@ -0,0 +1,15 @@
+# Introduction
+
+## 1.1 Introduction
+
+Following a drizzle, we take a walk on the wet street. Feeling the gentle breeze and seeing the sunset glow, we bet the weather must be nice tomorrow. Walking to a fruit stand, we pick up a green watermelon with a curly root and a muffled sound; while hoping the watermelon is ripe, we also expect some good academic marks this semester after all the hard work on studies. We wish readers to share the same confidence in their studies, but to begin with, let us have an informal discussion on what machine learning is.
+
+Taking a closer look at the scenario described above, we notice that it involves many experience-based predictions. For example, why would we expect beautiful weather tomorrow after observing the gentle breeze and sunset glow? We expect this beautiful weather because, from our experience, the weather on the following day is often beautiful when we experience such a scene in the present day. Also, why do we pick the watermelon with green color, curly root, and muffled sound? It is because we have eaten and enjoyed many watermelons, and those satisfying the above criteria are usually ripe. Similarly, our learning experience tells us that hard work leads to good academic marks. We are confident in our predictions because we learned from experience and made experience-based decisions.
+
+Mitchell (1997) provides a more formal definition: “A computer program is said to learn from experience $E$ for some class of tasks $T$ and performance measure $P$, if its performance at tasks in $T$, as measured by $P$, improves with experience $E$.”
+
+E.g., Hand et al. ( 2001 ).
+
+While humans learn from experience, can computers do the same? The answer is “yes”, and machine learning is what we need. Machine learning is the technique that improves system performance by learning from experience via computational methods. In computer systems, experience exists in the form of data, and the main task of machine learning is to develop learning algorithms that build models from data. By feeding the learning algorithm with experience data, we obtain a model that can make predictions (e.g., the watermelon is ripe) on new observations (e.g., an uncut watermelon). If we consider computer science as the subject of algorithms, then machine learning is the subject of learning algorithms.
+
+In this book, we use “model” as a general term for the outcome learned from data. In some other literature, the term “model” may refer to the global outcome (e.g., a decision tree), while the term “pattern” refers to the local outcome (e.g., a single rule).
\ No newline at end of file
diff --git a/content/TinyGraphRAG/help.ipynb b/content/TinyGraphRAG/help.ipynb
new file mode 100644
index 0000000..f01e88c
--- /dev/null
+++ b/content/TinyGraphRAG/help.ipynb
@@ -0,0 +1,175 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "/home/calvin-lucas/Documents/DataWhale_Learning_Material/tiny-graphrag\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 注意:重新运行前需要:重启整个内核\n",
+ "import os\n",
+ "import sys\n",
+ "sys.path.append('.') # 添加当前目录到 Python 路径\n",
+ "print(os.getcwd()) # 验证下当前工作路径"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 导入模块\n",
+ "from tinygraph.graph import TinyGraph\n",
+ "from tinygraph.embedding.zhipu import zhipuEmb\n",
+ "from tinygraph.llm.zhipu import zhipuLLM\n",
+ "\n",
+ "from neo4j import GraphDatabase\n",
+ "from dotenv import load_dotenv # 用于加载环境变量"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 配置使用的 LLM 和 Embedding 服务,现在只支持 ZhipuAI\n",
+ "# 加载 .env文件, 从而导入api_key\n",
+ "load_dotenv() # 加载工作目录下的 .env 文件\n",
+ "\n",
+ "emb = zhipuEmb(\n",
+ " model_name=\"embedding-2\", # 嵌入模型\n",
+ " api_key=os.getenv('API_KEY')\n",
+ ")\n",
+ "llm = zhipuLLM(\n",
+ " model_name=\"glm-3-turbo\", # LLM 模型\n",
+ " api_key=os.getenv('API_KEY')\n",
+ ")\n",
+ "graph = TinyGraph(\n",
+ " url=\"neo4j://localhost:7687\",\n",
+ " username=\"neo4j\",\n",
+ " password=\"neo4j-passwordTGR\", # 初次登陆的默认密码为neo4j,此后需修改再使用\n",
+ " llm=llm,\n",
+ " emb=emb,\n",
+ ")\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Document 'example/data.md' has already been loaded, skipping import process.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 使用 TinyGraph 添加文档。目前支持所有文本格式的文件。这一步的时间可能较长;\n",
+ "# 结束后,在当前目录下会生成一个 `workspace` 文件夹,包含 `community`、`chunk` 和 `doc` 信息\n",
+ "graph.add_document(\"example/data.md\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "数据库连接正常,节点数量: 29\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 再次验证数据库连接\n",
+ "with graph.driver.session() as session:\n",
+ " result = session.run(\"MATCH (n) RETURN count(n) as count\")\n",
+ " count = result.single()[\"count\"]\n",
+ " print(f\"数据库连接正常,节点数量: {count}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "本地查询结果:\n",
+ "The term \"dl\" is not explicitly defined in the provided context. However, based on the context's focus on machine learning, \"dl\" might commonly be interpreted as an abbreviation for \"deep learning,\" which is a subset of machine learning that involves neural networks with many layers (hence \"deep\"). Deep learning has become a prominent field, particularly in the realm of artificial intelligence, where it is used to recognize patterns and make predictions from large datasets.\n",
+ "\n",
+ "If \"dl\" refers to something else in the context of the user query, there would be no information to discern its meaning without further clarification or additional context.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# 执行局部查询测试\n",
+ "local_res = graph.local_query(\"what is dl?\")\n",
+ "print(\"\\n本地查询结果:\")\n",
+ "print(local_res)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "全局查询结果:\n",
+ "The term 'dl' is not explicitly mentioned in the provided data tables. Therefore, I don't know what 'dl' refers to in the context of the user's question. If 'dl' stands for 'Deep Learning,' it is a subset of machine learning that uses neural networks with many layers for feature extraction and modeling. However, this context is not provided in the data tables.\n"
+ ]
+ }
+ ],
+ "source": [
+ "\n",
+ "# 执行全局查询测试\n",
+ "global_res = graph.global_query(\"what is dl?\")\n",
+ "print(\"\\n全局查询结果:\")\n",
+ "print(global_res)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "TinyGraphRAG_2025-04-08",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.16"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git "a/content/TinyGraphRAG/images/Learning-Algorithms\350\212\202\347\202\271\347\232\204\350\257\246\347\273\206\344\277\241\346\201\257.png" "b/content/TinyGraphRAG/images/Learning-Algorithms\350\212\202\347\202\271\347\232\204\350\257\246\347\273\206\344\277\241\346\201\257.png"
new file mode 100644
index 0000000..dec5876
Binary files /dev/null and "b/content/TinyGraphRAG/images/Learning-Algorithms\350\212\202\347\202\271\347\232\204\350\257\246\347\273\206\344\277\241\346\201\257.png" differ
diff --git "a/content/TinyGraphRAG/images/Tiny-Graphrag\346\265\201\347\250\213\345\233\276V2.png" "b/content/TinyGraphRAG/images/Tiny-Graphrag\346\265\201\347\250\213\345\233\276V2.png"
new file mode 100644
index 0000000..366db93
Binary files /dev/null and "b/content/TinyGraphRAG/images/Tiny-Graphrag\346\265\201\347\250\213\345\233\276V2.png" differ
diff --git "a/content/TinyGraphRAG/images/\345\233\276\346\225\260\346\215\256\345\272\223\347\244\272\344\276\213.png" "b/content/TinyGraphRAG/images/\345\233\276\346\225\260\346\215\256\345\272\223\347\244\272\344\276\213.png"
new file mode 100644
index 0000000..ea17303
Binary files /dev/null and "b/content/TinyGraphRAG/images/\345\233\276\346\225\260\346\215\256\345\272\223\347\244\272\344\276\213.png" differ
diff --git "a/content/TinyGraphRAG/images/\346\237\245\350\257\242\347\273\223\346\236\234\347\244\272\344\276\213.png" "b/content/TinyGraphRAG/images/\346\237\245\350\257\242\347\273\223\346\236\234\347\244\272\344\276\213.png"
new file mode 100644
index 0000000..b210ad0
Binary files /dev/null and "b/content/TinyGraphRAG/images/\346\237\245\350\257\242\347\273\223\346\236\234\347\244\272\344\276\213.png" differ
diff --git a/content/TinyGraphRAG/readme.md b/content/TinyGraphRAG/readme.md
new file mode 100644
index 0000000..1de7a6e
--- /dev/null
+++ b/content/TinyGraphRAG/readme.md
@@ -0,0 +1,631 @@
+# Tiny-Graphrag使用指南与代码解读
+>此README包括两部分:1.引言;2.正文
+## 引言:
+- Tiny-Graphrag是一个基于Graphrag的简化版本,包含了Graphrag的核心功能: 1.知识图谱构建;2.图检索优化;3.生成增强。创建Tiny-Graphrag项目的目的是帮助大家理解Graphrag的原理,并提供一个可运行的Demo实现。
+- 本项目实现流程如下所示:
+
+
+
diff --git a/content/TinyGraphRAG/tinygraph/embedding/__init__.py b/content/TinyGraphRAG/tinygraph/embedding/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/content/TinyGraphRAG/tinygraph/embedding/base.py b/content/TinyGraphRAG/tinygraph/embedding/base.py
new file mode 100644
index 0000000..cfe15ee
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/embedding/base.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import List, Any, Optional
+
+
class BaseEmb(ABC):
    """Interface for text-embedding backends.

    Args:
        model_name (str): Identifier of the embedding model.
        model_params (Optional[dict[str, Any]], optional): Extra model
            parameters. A falsy value is stored as an empty dict.
            Defaults to None.
        **kwargs (Any): Accepted so subclasses can forward arbitrary keyword
            arguments; this base class does not store them.
    """

    def __init__(
        self,
        model_name: str,
        model_params: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ):
        self.model_name = model_name
        # None (or an empty mapping) becomes a fresh dict owned by the instance.
        self.model_params = model_params if model_params else {}

    @abstractmethod
    def get_emb(self, input: str) -> List[float]:
        """Send a text to the embedding model and return its embedding.

        Args:
            input (str): Text sent to the embedding model.

        Returns:
            List[float]: The embedding vector produced by the model.
        """
diff --git a/content/TinyGraphRAG/tinygraph/embedding/zhipu.py b/content/TinyGraphRAG/tinygraph/embedding/zhipu.py
new file mode 100644
index 0000000..cef5dbb
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/embedding/zhipu.py
@@ -0,0 +1,16 @@
+from zhipuai import ZhipuAI
+from typing import List
+from .base import BaseEmb
+
+
class zhipuEmb(BaseEmb):
    """Embedding backend that calls the ZhipuAI embeddings endpoint."""

    def __init__(self, model_name: str, api_key: str, **kwargs):
        """Store the model name via the base class and create the ZhipuAI client.

        Args:
            model_name (str): Name of the embedding model to request.
            api_key (str): API key handed to the ZhipuAI client.
            **kwargs: Forwarded unchanged to ``BaseEmb.__init__``.
        """
        super().__init__(model_name=model_name, **kwargs)
        self.client = ZhipuAI(api_key=api_key)

    def get_emb(self, text: str) -> List[float]:
        """Request an embedding for ``text`` and return the first result's vector.

        Args:
            text (str): Text to embed.

        Returns:
            List[float]: The ``embedding`` field of the first item in the response.
        """
        response = self.client.embeddings.create(model=self.model_name, input=text)
        first_item = response.data[0]
        return first_item.embedding
diff --git a/content/TinyGraphRAG/tinygraph/graph.py b/content/TinyGraphRAG/tinygraph/graph.py
new file mode 100644
index 0000000..5313a32
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/graph.py
@@ -0,0 +1,714 @@
+from neo4j import GraphDatabase
+import os
+from tqdm import tqdm
+from .utils import (
+ get_text_inside_tag,
+ cosine_similarity,
+ compute_mdhash_id,
+ read_json_file,
+ write_json_file,
+ create_file_if_not_exists,
+)
+from .llm.base import BaseLLM
+from .embedding.base import BaseEmb
+from .prompt import *
+from typing import Dict, List, Optional, Tuple, Union
+import numpy as np
+from collections import defaultdict
+import json
+
+from dataclasses import dataclass
+
+
@dataclass
class Node:
    """Entity read back from the graph together with its similarity to a query."""

    name: str  # value of the graph node's "name" property
    desc: str  # value of the graph node's "description" property
    chunks_id: list  # value of the node's "chunks_id" property (ids of source text chunks)
    entity_id: str  # value of the graph node's "entity_id" property
    similarity: float  # cosine similarity between the query embedding and the node embedding
+
+
+class TinyGraph:
+ """
+ 一个用于处理图数据库和语言模型的类。
+
+ 该类通过连接到Neo4j图数据库,并使用语言模型(LLM)和嵌入模型(Embedding)来处理文档和图数据。
+ 它还管理一个工作目录,用于存储文档、文档块和社区数据。
+ """
+
+ def __init__(
+ self,
+ url: str, # Neo4j数据库的URL
+ username: str, # Neo4j数据库的用户名
+ password: str, # Neo4j数据库的密码
+ llm: BaseLLM, # 语言模型(LLM)实例
+ emb: BaseLLM, # 嵌入模型(Embedding)实例
+ working_dir: str = "workspace", # 工作目录,默认为"workspace"
+ ):
+ """
+ 初始化TinyGraph类。
+
+ 参数:
+ - url: Neo4j数据库的URL
+ - username: Neo4j数据库的用户名
+ - password: Neo4j数据库的密码
+ - llm: 语言模型(LLM)实例
+ - emb: 嵌入模型(Embedding)实例
+ - working_dir: 工作目录,默认为"workspace"
+ """
+ self.driver = driver = GraphDatabase.driver(
+ url, auth=(username, password)
+ ) # 创建Neo4j数据库驱动
+ self.llm = llm # 设置语言模型
+ self.embedding = emb # 设置嵌入模型
+ self.working_dir = working_dir # 设置工作目录
+ os.makedirs(self.working_dir, exist_ok=True) # 创建工作目录(如果不存在)
+
+ # 定义文档、文档块和社区数据的文件路径
+ self.doc_path = os.path.join(working_dir, "doc.txt")
+ self.chunk_path = os.path.join(working_dir, "chunk.json")
+ self.community_path = os.path.join(working_dir, "community.json")
+
+ # 创建文件(如果不存在)
+ create_file_if_not_exists(self.doc_path)
+ create_file_if_not_exists(self.chunk_path)
+ create_file_if_not_exists(self.community_path)
+
+ # 加载已加载的文档
+ self.loaded_documents = self.get_loaded_documents()
+
+ def create_triplet(self, subject: dict, predicate, object: dict) -> None:
+ """
+ 创建一个三元组(Triplet)并将其存储到Neo4j数据库中。
+
+ 参数:
+ - subject: 主题实体的字典,包含名称、描述、块ID和实体ID
+ - predicate: 关系名称
+ - object: 对象实体的字典,包含名称、描述、块ID和实体ID
+
+ 返回:
+ - 查询结果
+ """
+ # 定义Cypher查询语句,用于创建或合并实体节点和关系
+ query = (
+ "MERGE (a:Entity {name: $subject_name, description: $subject_desc, chunks_id: $subject_chunks_id, entity_id: $subject_entity_id}) "
+ "MERGE (b:Entity {name: $object_name, description: $object_desc, chunks_id: $object_chunks_id, entity_id: $object_entity_id}) "
+ "MERGE (a)-[r:Relationship {name: $predicate}]->(b) "
+ "RETURN a, b, r"
+ )
+
+ # 使用数据库会话执行查询
+ with self.driver.session() as session:
+ result = session.run(
+ query,
+ subject_name=subject["name"],
+ subject_desc=subject["description"],
+ subject_chunks_id=subject["chunks id"],
+ subject_entity_id=subject["entity id"],
+ object_name=object["name"],
+ object_desc=object["description"],
+ object_chunks_id=object["chunks id"],
+ object_entity_id=object["entity id"],
+ predicate=predicate,
+ )
+
+ return
+
+ def split_text(self,file_path:str, segment_length=300, overlap_length=50) -> Dict:
+ """
+ 将文本文件分割成多个片段,每个片段的长度为segment_length,相邻片段之间有overlap_length的重叠。
+
+ 参数:
+ - file_path: 文本文件的路径
+ - segment_length: 每个片段的长度,默认为300
+ - overlap_length: 相邻片段之间的重叠长度,默认为50
+
+ 返回:
+ - 包含片段ID和片段内容的字典
+ """
+ chunks = {} # 用于存储片段的字典
+ with open(file_path, "r", encoding="utf-8") as file:
+ content = file.read() # 读取文件内容
+
+ text_segments = [] # 用于存储分割后的文本片段
+ start_index = 0 # 初始化起始索引
+
+ # 循环分割文本,直到剩余文本长度不足以形成新的片段
+ while start_index + segment_length <= len(content):
+ text_segments.append(content[start_index : start_index + segment_length])
+ start_index += segment_length - overlap_length # 更新起始索引,考虑重叠长度
+
+ # 处理剩余的文本,如果剩余文本长度小于segment_length但大于0
+ if start_index < len(content):
+ text_segments.append(content[start_index:])
+
+ # 为每个片段生成唯一的ID,并将其存储在字典中
+ for segement in text_segments:
+ chunks.update({compute_mdhash_id(segement, prefix="chunk-"): segement})
+
+ return chunks
+
+ def get_entity(self, text: str, chunk_id: str) -> List[Dict]:
+ """
+ 从给定的文本中提取实体,并为每个实体生成唯一的ID和描述。
+
+ 参数:
+ - text: 输入的文本
+ - chunk_id: 文本块的ID
+
+ 返回:
+ - 包含提取的实体信息的列表
+ """
+ # 使用语言模型预测实体信息
+ data = self.llm.predict(GET_ENTITY.format(text=text))
+ concepts = [] # 用于存储提取的实体信息
+
+ # 从预测结果中提取实体信息
+ for concept_html in get_text_inside_tag(data, "concept"):
+ concept = {}
+ concept["name"] = get_text_inside_tag(concept_html, "name")[0].strip()
+ concept["description"] = get_text_inside_tag(concept_html, "description")[
+ 0
+ ].strip()
+ concept["chunks id"] = [chunk_id]
+ concept["entity id"] = compute_mdhash_id(
+ concept["description"], prefix="entity-"
+ )
+ concepts.append(concept)
+
+ return concepts
+
+ def get_triplets(self, content, entity: list) -> List[Dict]:
+ """
+ 从给定的内容中提取三元组(Triplet)信息,并返回包含这些三元组信息的列表。
+
+ 参数:
+ - content: 输入的内容
+ - entity: 实体列表
+
+ 返回:
+ - 包含提取的三元组信息的列表
+ """
+ try:
+ # 使用语言模型预测三元组信息
+ data = self.llm.predict(GET_TRIPLETS.format(text=content, entity=entity))
+ data = get_text_inside_tag(data, "triplet")
+ except Exception as e:
+ print(f"Error predicting triplets: {e}")
+ return []
+
+ res = [] # 用于存储提取的三元组信息
+
+ # 从预测结果中提取三元组信息
+ for triplet_data in data:
+ try:
+ subject = get_text_inside_tag(triplet_data, "subject")[0]
+ subject_id = get_text_inside_tag(triplet_data, "subject_id")[0]
+ predicate = get_text_inside_tag(triplet_data, "predicate")[0]
+ object = get_text_inside_tag(triplet_data, "object")[0]
+ object_id = get_text_inside_tag(triplet_data, "object_id")[0]
+ res.append(
+ {
+ "subject": subject,
+ "subject_id": subject_id,
+ "predicate": predicate,
+ "object": object,
+ "object_id": object_id,
+ }
+ )
+ except Exception as e:
+ print(f"Error extracting triplet: {e}")
+ continue
+
+ return res
+
+ def add_document(self, filepath, use_llm_deambiguation=False) -> None:
+ """
+ 将文档添加到系统中,执行以下步骤:
+ 1. 检查文档是否已经加载。
+ 2. 将文档分割成块。
+ 3. 从块中提取实体和三元组。
+ 4. 执行实体消岐,有两种方法可选,默认将同名实体认为即为同一实体。
+ 5. 合并实体和三元组。
+ 6. 将合并的实体和三元组存储到Neo4j数据库中。
+
+ 参数:
+ - filepath: 要添加的文档的路径
+ - use_llm_deambiguation: 是否使用LLM进行实体消岐
+ """
+ # ================ Check if the document has been loaded ================
+ if filepath in self.get_loaded_documents():
+ print(
+ f"Document '{filepath}' has already been loaded, skipping import process."
+ )
+ return
+
+ # ================ Chunking ================
+ chunks = self.split_text(filepath)
+ existing_chunks = read_json_file(self.chunk_path)
+
+ # Filter out chunks that are already in storage
+ new_chunks = {k: v for k, v in chunks.items() if k not in existing_chunks}
+
+ if not new_chunks:
+ print("All chunks are already in the storage.")
+ return
+
+ # Merge new chunks with existing chunks
+ all_chunks = {**existing_chunks, **new_chunks}
+ write_json_file(all_chunks, self.chunk_path)
+ print(f"Document '{filepath}' has been chunked.")
+
+ # ================ Entity Extraction ================
+ all_entities = []
+ all_triplets = []
+
+ for chunk_id, chunk_content in tqdm(
+ new_chunks.items(), desc=f"Processing '{filepath}'"
+ ):
+ try:
+ entities = self.get_entity(chunk_content, chunk_id=chunk_id)
+ all_entities.extend(entities)
+ triplets = self.get_triplets(chunk_content, entities)
+ all_triplets.extend(triplets)
+ except:
+ print(
+ f"An error occurred while processing chunk '{chunk_id}'. SKIPPING..."
+ )
+
+ print(
+ f"{len(all_entities)} entities and {len(all_triplets)} triplets have been extracted."
+ )
+ # ================ Entity Disambiguation ================
+ entity_names = list(set(entity["name"] for entity in all_entities))
+
+ if use_llm_deambiguation:
+ entity_id_mapping = {}
+ for name in entity_names:
+ same_name_entities = [
+ entity for entity in all_entities if entity["name"] == name
+ ]
+ transform_text = self.llm.predict(
+ ENTITY_DISAMBIGUATION.format(same_name_entities)
+ )
+ entity_id_mapping.update(
+ get_text_inside_tag(transform_text, "transform")
+ )
+ else:
+ entity_id_mapping = {}
+ for entity in all_entities:
+ entity_name = entity["name"]
+ if entity_name not in entity_id_mapping:
+ entity_id_mapping[entity_name] = entity["entity id"]
+
+ for entity in all_entities:
+ entity["entity id"] = entity_id_mapping.get(
+ entity["name"], entity["entity id"]
+ )
+
+ triplets_to_remove = [
+ triplet
+ for triplet in all_triplets
+ if entity_id_mapping.get(triplet["subject"], triplet["subject_id"]) is None
+ or entity_id_mapping.get(triplet["object"], triplet["object_id"]) is None
+ ]
+
+ updated_triplets = [
+ {
+ **triplet,
+ "subject_id": entity_id_mapping.get(
+ triplet["subject"], triplet["subject_id"]
+ ),
+ "object_id": entity_id_mapping.get(
+ triplet["object"], triplet["object_id"]
+ ),
+ }
+ for triplet in all_triplets
+ if triplet not in triplets_to_remove
+ ]
+ all_triplets = updated_triplets
+
+ # ================ Merge Entities ================
+ entity_map = {}
+
+ for entity in all_entities:
+ entity_id = entity["entity id"]
+ if entity_id not in entity_map:
+ entity_map[entity_id] = {
+ "name": entity["name"],
+ "description": entity["description"],
+ "chunks id": [],
+ "entity id": entity_id,
+ }
+ else:
+ entity_map[entity_id]["description"] += " " + entity["description"]
+
+ entity_map[entity_id]["chunks id"].extend(entity["chunks id"])
+ # ================ Store Data in Neo4j ================
+ for triplet in all_triplets:
+ subject_id = triplet["subject_id"]
+ object_id = triplet["object_id"]
+
+ subject = entity_map.get(subject_id)
+ object = entity_map.get(object_id)
+ if subject and object:
+ self.create_triplet(subject, triplet["predicate"], object)
+ # ================ communities ================
+ self.gen_community()
+ self.generate_community_report()
+ # ================ embedding ================
+ self.add_embedding_for_graph()
+ self.add_loaded_documents(filepath)
+ print(f"doc '{filepath}' has been loaded.")
+
+ def detect_communities(self) -> None:
+ query = """
+ CALL gds.graph.project(
+ 'graph_help',
+ ['Entity'],
+ {
+ Relationship: {
+ orientation: 'UNDIRECTED'
+ }
+ }
+ )
+ """
+ with self.driver.session() as session:
+ result = session.run(query)
+
+ query = """
+ CALL gds.leiden.write('graph_help', {
+ writeProperty: 'communityIds',
+ includeIntermediateCommunities: True,
+ maxLevels: 10,
+ tolerance: 0.0001,
+ gamma: 1.0,
+ theta: 0.01
+ })
+ YIELD communityCount, modularity, modularities
+ """
+ with self.driver.session() as session:
+ result = session.run(query)
+ for record in result:
+ print(
+ f"社区数量: {record['communityCount']}, 模块度: {record['modularity']}"
+ )
+ session.run("CALL gds.graph.drop('graph_help')")
+
+ def get_entity_by_name(self, name):
+ query = """
+ MATCH (n:Entity {name: $name})
+ RETURN n
+ """
+ with self.driver.session() as session:
+ result = session.run(query, name=name)
+ entities = [record["n"].get("name") for record in result]
+ return entities[0]
+
+ def get_node_edgs(self, node: Node):
+ query = """
+ MATCH (n)-[r]-(m)
+ WHERE n.entity_id = $id
+ RETURN n.name AS n,r.name AS r,m.name AS m
+ """
+ with self.driver.session() as session:
+ result = session.run(query, id=node.entity_id)
+ edges = [(record["n"], record["r"], record["m"]) for record in result]
+ return edges
+
+ def get_node_chunks(self, node):
+ existing_chunks = read_json_file(self.chunk_path)
+ chunks = [existing_chunks[i] for i in node.chunks_id]
+ return chunks
+
+ def add_embedding_for_graph(self):
+ query = """
+ MATCH (n)
+ RETURN n
+ """
+ with self.driver.session() as session:
+ result = session.run(query)
+ for record in result:
+ node = record["n"]
+ description = node["description"]
+ id = node["entity_id"]
+ embedding = self.embedding.get_emb(description)
+ # 更新节点,添加新的 embedding 属性
+ update_query = """
+ MATCH (n {entity_id: $id})
+ SET n.embedding = $embedding
+ """
+ session.run(update_query, id=id, embedding=embedding)
+
+ def get_topk_similar_entities(self, input_emb, k=1) -> List[Node]:
+ res = []
+ query = """
+ MATCH (n)
+ RETURN n
+ """
+ with self.driver.session() as session:
+ result = session.run(query)
+ # 如果遇到报错:ResultConsumedError: The result has been consumed. Fetch all needed records before calling Result.consume().可将result = session.run(query)修改为result = list(session.run(query))
+ for record in result:
+ node = record["n"]
+ if node["embedding"] is not None:
+ similarity = cosine_similarity(input_emb, node["embedding"])
+ node = Node(
+ name=node["name"],
+ desc=node["description"],
+ chunks_id=node["chunks_id"],
+ entity_id=node["entity_id"],
+ similarity=similarity,
+ )
+ res.append(node)
+ return sorted(res, key=lambda x: x.similarity, reverse=True)[:k]
+
+ def get_communities(self, nodes: List[Node]):
+ communities_schema = self.read_community_schema()
+ res = []
+ nodes_ids = [i.entity_id for i in nodes]
+ for community_id, community_info in communities_schema.items():
+ if set(nodes_ids) & set(community_info["nodes"]):
+ res.append(
+ {
+ "community_id": community_id,
+ "community_info": community_info["report"],
+ }
+ )
+ return res
+
+ def get_relations(self, nodes: List, input_emb):
+ res = []
+ for i in nodes:
+ res.append(self.get_node_edgs(i))
+ return res
+
+ def get_chunks(self, nodes, input_emb):
+ chunks = []
+ for i in nodes:
+ chunks.append(self.get_node_chunks(i))
+ return chunks
+
+ def gen_community_schema(self) -> dict[str, dict]:
+ results = defaultdict(
+ lambda: dict(
+ level=None,
+ title=None,
+ edges=set(),
+ nodes=set(),
+ chunk_ids=set(),
+ sub_communities=[],
+ )
+ )
+
+ with self.driver.session() as session:
+ # Fetch community data
+ result = session.run(
+ f"""
+ MATCH (n:Entity)
+ WITH n, n.communityIds AS communityIds, [(n)-[]-(m:Entity) | m.entity_id] AS connected_nodes
+ RETURN n.entity_id AS node_id,
+ communityIds AS cluster_key,
+ connected_nodes
+ """
+ )
+
+ max_num_ids = 0
+ for record in result:
+ for index, c_id in enumerate(record["cluster_key"]):
+ node_id = str(record["node_id"])
+ level = index
+ cluster_key = str(c_id)
+ connected_nodes = record["connected_nodes"]
+
+ results[cluster_key]["level"] = level
+ results[cluster_key]["title"] = f"Cluster {cluster_key}"
+ results[cluster_key]["nodes"].add(node_id)
+ results[cluster_key]["edges"].update(
+ [
+ tuple(sorted([node_id, str(connected)]))
+ for connected in connected_nodes
+ if connected != node_id
+ ]
+ )
+ for k, v in results.items():
+ v["edges"] = [list(e) for e in v["edges"]]
+ v["nodes"] = list(v["nodes"])
+ v["chunk_ids"] = list(v["chunk_ids"])
+ for cluster in results.values():
+ cluster["sub_communities"] = [
+ sub_key
+ for sub_key, sub_cluster in results.items()
+ if sub_cluster["level"] > cluster["level"]
+ and set(sub_cluster["nodes"]).issubset(set(cluster["nodes"]))
+ ]
+
+ return dict(results)
+
+ def gen_community(self):
+ self.detect_communities()
+ community_schema = self.gen_community_schema()
+ with open(self.community_path, "w", encoding="utf-8") as file:
+ json.dump(community_schema, file, indent=4)
+
+ def read_community_schema(self) -> dict:
+ try:
+ with open(self.community_path, "r", encoding="utf-8") as file:
+ community_schema = json.load(file)
+ except:
+ raise FileNotFoundError(
+ "Community schema not found. Please make sure to generate it first."
+ )
+ return community_schema
+
+ def get_loaded_documents(self):
+ try:
+ with open(self.doc_path, "r", encoding="utf-8") as file:
+ lines = file.readlines()
+ return set(line.strip() for line in lines)
+ except:
+ raise FileNotFoundError("Cache file not found.")
+
+ def add_loaded_documents(self, file_path):
+ if file_path in self.loaded_documents:
+ print(
+ f"Document '{file_path}' has already been loaded, skipping addition to cache."
+ )
+ return
+ with open(self.doc_path, "a", encoding="utf-8") as file:
+ file.write(file_path + "\n")
+ self.loaded_documents.add(file_path)
+
+ def get_node_by_id(self, node_id):
+ query = """
+ MATCH (n:Entity {entity_id: $node_id})
+ RETURN n
+ """
+ with self.driver.session() as session:
+ result = session.run(query, node_id=node_id)
+ nodes = [record["n"] for record in result]
+ return nodes[0]
+
+ def get_edges_by_id(self, src, tar):
+ query = """
+ MATCH (n:Entity {entity_id: $src})-[r]-(m:Entity {entity_id: $tar})
+ RETURN {src: n.name, r: r.name, tar: m.name} AS R
+ """
+ with self.driver.session() as session:
+ result = session.run(query, {"src": src, "tar": tar})
+ edges = [record["R"] for record in result]
+ return edges[0]
+
+ def gen_single_community_report(self, community: dict):
+ nodes = community["nodes"]
+ edges = community["edges"]
+ nodes_describe = []
+ edges_describe = []
+ for i in nodes:
+ node = self.get_node_by_id(i)
+ nodes_describe.append({"name": node["name"], "desc": node["description"]})
+ for i in edges:
+ edge = self.get_edges_by_id(i[0], i[1])
+ edges_describe.append(
+ {"source": edge["src"], "target": edge["tar"], "desc": edge["r"]}
+ )
+ nodes_csv = "entity,description\n"
+ for node in nodes_describe:
+ nodes_csv += f"{node['name']},{node['desc']}\n"
+ edges_csv = "source,target,description\n"
+ for edge in edges_describe:
+ edges_csv += f"{edge['source']},{edge['target']},{edge['desc']}\n"
+ data = f"""
+ Text:
+ -----Entities-----
+ ```csv
+ {nodes_csv}
+ ```
+ -----Relationships-----
+ ```csv
+ {edges_csv}
+ ```"""
+ prompt = GEN_COMMUNITY_REPORT.format(input_text=data)
+ report = self.llm.predict(prompt)
+ return report
+
+ def generate_community_report(self):
+ communities_schema = self.read_community_schema()
+ for community_key, community in tqdm(
+ communities_schema.items(), desc="generating community report"
+ ):
+ community["report"] = self.gen_single_community_report(community)
+ with open(self.community_path, "w", encoding="utf-8") as file:
+ json.dump(communities_schema, file, indent=4)
+ print("All community report has been generated.")
+
+ def build_local_query_context(self, query):
+ query_emb = self.embedding.get_emb(query)
+ topk_similar_entities_context = self.get_topk_similar_entities(query_emb)
+ topk_similar_communities_context = self.get_communities(
+ topk_similar_entities_context
+ )
+ topk_similar_relations_context = self.get_relations(
+ topk_similar_entities_context, query
+ )
+ topk_similar_chunks_context = self.get_chunks(
+ topk_similar_entities_context, query
+ )
+ return f"""
+ -----Reports-----
+ ```csv
+ {topk_similar_communities_context}
+ ```
+ -----Entities-----
+ ```csv
+ {topk_similar_entities_context}
+ ```
+ -----Relationships-----
+ ```csv
+ {topk_similar_relations_context}
+ ```
+ -----Sources-----
+ ```csv
+ {topk_similar_chunks_context}
+ ```
+ """
+
+ def map_community_points(self, community_info, query):
+ points_html = self.llm.predict(
+ GLOBAL_MAP_POINTS.format(context_data=community_info, query=query)
+ )
+ points = get_text_inside_tag(points_html, "point")
+ res = []
+ for point in points:
+ try:
+ score = get_text_inside_tag(point, "score")[0]
+ desc = get_text_inside_tag(point, "description")[0]
+ res.append((desc, score))
+ except:
+ continue
+ return res
+
+ def build_global_query_context(self, query, level=1):
+ communities_schema = self.read_community_schema()
+ candidate_community = {}
+ points = []
+ for communityid, community_info in communities_schema.items():
+ if community_info["level"] < level:
+ candidate_community.update({communityid: community_info})
+ for communityid, community_info in candidate_community.items():
+ points.extend(self.map_community_points(community_info["report"], query))
+ points = sorted(points, key=lambda x: x[-1], reverse=True)
+ return points
+
+ def local_query(self, query):
+ context = self.build_local_query_context(query)
+ prompt = LOCAL_QUERY.format(query=query, context=context)
+ response = self.llm.predict(prompt)
+ return response
+
+ def global_query(self, query, level=1):
+ context = self.build_global_query_context(query, level)
+ prompt = GLOBAL_QUERY.format(query=query, context=context)
+ response = self.llm.predict(prompt)
+ return response
diff --git a/content/TinyGraphRAG/tinygraph/llm/__init__.py b/content/TinyGraphRAG/tinygraph/llm/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/content/TinyGraphRAG/tinygraph/llm/base.py b/content/TinyGraphRAG/tinygraph/llm/base.py
new file mode 100644
index 0000000..1832c45
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/llm/base.py
@@ -0,0 +1,32 @@
+from abc import ABC, abstractmethod
+from typing import Any, Optional
+
+
class BaseLLM(ABC):
    """Abstract base class that every language-model backend derives from.

    Args:
        model_name (str): Identifier of the language model.
        model_params (Optional[dict[str, Any]], optional): Additional
            parameters intended for the model when text is sent to it.
            Stored as an empty dict when omitted or falsy.
        **kwargs (Any): Extra keyword arguments accepted at construction;
            this base class does not use them.
    """

    def __init__(
        self,
        model_name: str,
        model_params: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ):
        self.model_params = model_params if model_params else {}
        self.model_name = model_name

    @abstractmethod
    def predict(self, input: str) -> str:
        """Send ``input`` to the LLM and return its reply.

        Args:
            input (str): Text sent to the LLM.

        Returns:
            str: The response from the LLM.
        """
diff --git a/content/TinyGraphRAG/tinygraph/llm/groq.py b/content/TinyGraphRAG/tinygraph/llm/groq.py
new file mode 100644
index 0000000..34da768
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/llm/groq.py
@@ -0,0 +1,32 @@
+from groq import Groq
+from typing import Any, Optional
+from .base import BaseLLM
+
+
class groqLLM(BaseLLM):
    """Implementation of the BaseLLM interface using the Groq client."""

    def __init__(
        self,
        model_name: str,
        api_key: str,
        model_params: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ):
        """Create the Groq client.

        Args:
            model_name (str): Name of the model to request.
            api_key (str): API key handed to the ``Groq`` client.
            model_params (Optional[dict[str, Any]], optional): Extra keyword
                arguments sent with every completion request.
            **kwargs (Any): Forwarded to ``BaseLLM.__init__``.
        """
        super().__init__(model_name, model_params, **kwargs)
        self.client = Groq(api_key=api_key)

    def predict(self, input: str) -> str:
        """Sends a text input to the Groq model and retrieves a response.

        Args:
            input (str): Text sent to the Groq model as a single user message.

        Returns:
            str: The content of the first choice in the model's response.
        """
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": input}],
            # Send the stored per-request parameters; previously model_params
            # was accepted by the constructor but silently ignored here.
            **self.model_params,
        )
        return response.choices[0].message.content
diff --git a/content/TinyGraphRAG/tinygraph/llm/zhipu.py b/content/TinyGraphRAG/tinygraph/llm/zhipu.py
new file mode 100644
index 0000000..1031e34
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/llm/zhipu.py
@@ -0,0 +1,32 @@
+from zhipuai import ZhipuAI
+from typing import Any, Optional
+from .base import BaseLLM
+
+
class zhipuLLM(BaseLLM):
    """Implementation of the BaseLLM interface using zhipuai."""

    def __init__(
        self,
        model_name: str,
        api_key: str,
        model_params: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ):
        """Create the ZhipuAI client.

        Args:
            model_name (str): Name of the model to request.
            api_key (str): API key handed to the ``ZhipuAI`` client.
            model_params (Optional[dict[str, Any]], optional): Extra keyword
                arguments sent with every completion request.
            **kwargs (Any): Forwarded to ``BaseLLM.__init__``.
        """
        super().__init__(model_name, model_params, **kwargs)
        self.client = ZhipuAI(api_key=api_key)

    def predict(self, input: str) -> str:
        """Sends a text input to the zhipuai model and retrieves a response.

        Args:
            input (str): Text sent to the zhipuai model as a single user
                message.

        Returns:
            str: The content of the first choice in the model's response.
        """
        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": input}],
            # Send the stored per-request parameters; previously model_params
            # was accepted by the constructor but silently ignored here.
            **self.model_params,
        )
        return response.choices[0].message.content
diff --git a/content/TinyGraphRAG/tinygraph/prompt.py b/content/TinyGraphRAG/tinygraph/prompt.py
new file mode 100644
index 0000000..61164f7
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/prompt.py
@@ -0,0 +1,383 @@
# Prompt template asking the LLM to extract (Subject, Predicate, Object)
# triplets from an article. The only replacement field is {text}.
# NOTE(review): the guidelines hard-code "Bayesian Optimization" as the
# subject/object domain, and the example/output-format lines contain no
# delimiters between the triplet parts -- confirm this is intentional.
GEN_NODES = """
## Goal
Please identify and extract triplet information from the provided article, focusing only on entities and relationships related to significant knowledge points.
Each triplet should be in the form of (Subject, Predicate, Object).
Follow these guidelines:

1. **Subject:** Concepts in Bayesian Optimization
2. **Predicate:** The action or relationship that links the subject to the object.
3. **Object:** Concepts in Bayesian Optimization that is affected by or related to the action of the subject.

## Example
For the sentence "Gaussian Processes are used to model the objective function in Bayesian Optimization" the triplet would be:

Gaussian Processesare used to model the objective function in

For the sentence "John read a book on the weekend," which is not related to any knowledge points, no triplet should be extracted.

## Instructions
1. Read through the article carefully.
2. Think step by step. Try to find some useful knowledge points from the article. You need to reorganize the content of the sentence into corresponding knowledge points.
3. Identify key sentences that contain relevant triplet information related to significant knowledge points.
4. Extract and format the triplets as per the given example, excluding any information that is not relevant to significant knowledge points.

## Output Format
For each identified triplet, provide:
[Entity]The action or relationship

## Article

{text}

## Your response
"""
+
# Prompt template asking the LLM to list the key machine-learning concepts in
# an article, each with a short description. The only replacement field is
# {text}.
# NOTE(review): the "Format" section refers to HTML tags but names none, and
# the example response shows no tags either -- confirm the tag names that the
# consumer of this prompt's output expects.
GET_ENTITY = """
## Goal

You are an experienced machine learning teacher.
You need to identify the key concepts related to machine learning that the article requires students to master. For each concept, provide a brief description that explains its relevance and importance in the context of the article.

## Example

article:
"In the latest study, we explored the potential of using machine learning algorithms for disease prediction. We used support vector machines (SVM) and random forest algorithms to analyze medical data. The results showed that these models performed well in predicting disease risk through feature selection and cross-validation. In particular, the random forest model showed better performance in dealing with overfitting problems. In addition, we discussed the application of deep learning in medical image analysis."

response:

Support Vector Machine (SVM)
A supervised learning model used for classification and regression tasks, particularly effective in high-dimensional spaces.


Random Forest Algorithm
An ensemble learning method that builds multiple decision trees and merges them together to get a more accurate and stable prediction, often used to reduce overfitting.


Feature Selection
The process of selecting a subset of relevant features for use in model construction, crucial for improving model performance and reducing complexity.


Overfitting
A common issue where a model learns the details and noise in the training data to the extent that it negatively impacts the model's performance on new data.


Deep Learning
A subset of machine learning that uses neural networks with many layers to model complex patterns in large datasets, often applied in image and speech recognition tasks.


## Format

Wrap each concept in the HTML tag , and include the name of the concept in the tag and its description in the tag.

## Article

{text}

## Your response
"""
+
+
# Prompt template asking the LLM whether same-named entities can be merged,
# and if so to return a mapping from entity id to the surviving entity id.
# The placeholder is {entities}.
# NOTE(review): the example JSON uses single braces, whereas GET_TRIPLETS
# doubles its literal braces. If a caller renders this template with
# str.format(), those single braces would be parsed as replacement fields --
# confirm how {entities} is substituted.
ENTITY_DISAMBIGUATION = """
## Goal
Given multiple entities with the same name, determine if they can be merged into a single entity. If merging is possible, provide the transformation from entity id to entity id.

## Guidelines
1. **Entities:** A list of entities with the same name.
2. **Merge:** Determine if the entities can be merged into a single entity.
3. **Transformation:** If merging is possible, provide the transformation from entity id to entity id.

## Example
1. Entities:
[
{"name": "Entity A", "entity id": "entity-1"},
{"name": "Entity A", "entity id": "entity-2"},
{"name": "Entity A", "entity id": "entity-3"}
]

Your response should be:

{"entity-2": "entity-1", "entity-3": "entity-1"}


2. Entities:
[
{"name": "Entity B", "entity id": "entity-4"},
{"name": "Entity C", "entity id": "entity-5"},
{"name": "Entity B", "entity id": "entity-6"}
]

Your response should be:

None

## Output Format
Provide the following information:
- Transformation: A dictionary mapping entity ids to the final entity id after merging.

## Given Entities
{entities}

## Your response
"""
+
# Prompt template asking the LLM to extract relationships between a supplied
# list of entities found in an article. Replacement fields are {entity} and
# {text}; literal braces in the examples are doubled so that str.format()
# leaves them intact.
GET_TRIPLETS = """
## Goal
Identify and extract all the relationships between the given concepts from the provided text.
Identify as many relationships between the concepts as possible.
The relationship in the triple should accurately reflect the interaction or connection between the two concepts.

## Guidelines:
1. **Subject:** The first entity from the given entities.
2. **Predicate:** The action or relationship linking the subject to the object.
3. **Object:** The second entity from the given entities.

## Example:
1. Article :
"Gaussian Processes are used to model the objective function in Bayesian Optimization"
Given entities:
[{{"name": "Gaussian Processes", "entity id": "entity-1"}}, {{"name": "Bayesian Optimization", "entity id": "entity-2"}}]
Output:
Gaussian Processesentity-1are used to model the objective function inentity-2

2. Article :
"Hydrogen is a colorless, odorless, non-toxic gas and is the lightest and most abundant element in the universe. Oxygen is a gas that supports combustion and is widely present in the Earth's atmosphere. Water is a compound made up of hydrogen and oxygen, with the chemical formula H2O."
Given entities:
[{{"name": "Hydrogen", "entity id": "entity-3"}}, {{"name": "Oxygen", "entity id": "entity-4"}}, {{"name": "Water", "entity id": "entity-5"}}]
Output:
Hydrogenentity-3is a component ofentity-5
3. Article :
"John read a book on the weekend"
Given entities:
[]
Output:
None

## Format:
For each identified triplet, provide:
**the entity should just from "Given Entities"**
[Entity][Entity ID][The action or relationship][Entity ID]

## Given Entities:
{entity}

### Article:
{text}

## Additional Instructions:
- Before giving your response, you should analyze and think about it sentence by sentence.
- Both the subject and object must be selected from the given entities and cannot change their content.
- If no relevant triplet involving both entities is found, no triplet should be extracted.
- If there are similar concepts, please rewrite them into a form that suits our requirements.

## Your response:
"""
+
# Prompt template for a multi-round tutoring dialogue. The only replacement
# field is {state}, rendered under the "Foundation of students" heading.
# The second heading previously read "Gole"; it is corrected to "Goal".
TEST_PROMPT = """
## Foundation of students
{state}
## Goal
You will help students solve question through multiple rounds of dialogue.
Please follow the steps below to help students solve the question:
1. Explain the basic knowledge and principles behind the question and make sure the other party understands these basic concepts.
2. Don't give a complete answer directly, but guide the student to think about the key steps of the question.
3. After guiding the student to think, let them try to solve the question by themselves. Give appropriate hints and feedback to help them correct their mistakes and further improve their solutions.
4. Return to TERMINATE after solving the problem
"""
+
# Prompt template asking the LLM to write a JSON community report (title,
# summary, findings) from entity and relationship tables. The only
# replacement field is {input_text}; literal braces are doubled so that
# str.format() leaves them intact.
# NOTE(review): the second output schema (after "Real Data") lists "rating"
# and "rating_explanation" keys that neither the first schema, the section
# list, nor the worked example mention -- confirm which schema the consumer
# of this report expects.
GEN_COMMUNITY_REPORT = """
## Role
You are an AI assistant that helps a human analyst to perform general information discovery.
Information discovery is the process of identifying and assessing relevant information associated with certain entities (e.g., organizations and individuals) within a network.

## Goal
Write a comprehensive report of a community.
Given a list of entities that belong to the community as well as their relationships and optional associated claims. The report will be used to inform decision-makers about information associated with the community and their potential impact.
The content of this report includes an overview of the community's key entities, their legal compliance, technical capabilities, reputation, and noteworthy claims.

## Report Structure

The report should include the following sections:

- TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
- DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.

Return output as a well-formed JSON-formatted string with the following format:
{{
"title": ,
"summary": ,
"findings": [
{{
"summary":,
"explanation":
}},
{{
"summary":,
"explanation":
}}
...
]
}}

## Grounding Rules
Do not include information where the supporting evidence for it is not provided.

## Example Input
-----------
Text:
```
Entities:
```csv
entity,description
VERDANT OASIS PLAZA,Verdant Oasis Plaza is the location of the Unity March
HARMONY ASSEMBLY,Harmony Assembly is an organization that is holding a march at Verdant Oasis Plaza
```
Relationships:
```csv
source,target,description
VERDANT OASIS PLAZA,UNITY MARCH,Verdant Oasis Plaza is the location of the Unity March
VERDANT OASIS PLAZA,HARMONY ASSEMBLY,Harmony Assembly is holding a march at Verdant Oasis Plaza
VERDANT OASIS PLAZA,UNITY MARCH,The Unity March is taking place at Verdant Oasis Plaza
VERDANT OASIS PLAZA,TRIBUNE SPOTLIGHT,Tribune Spotlight is reporting on the Unity march taking place at Verdant Oasis Plaza
VERDANT OASIS PLAZA,BAILEY ASADI,Bailey Asadi is speaking at Verdant Oasis Plaza about the march
HARMONY ASSEMBLY,UNITY MARCH,Harmony Assembly is organizing the Unity March
```
```
Output:
{{
"title": "Verdant Oasis Plaza and Unity March",
"summary": "The community revolves around the Verdant Oasis Plaza, which is the location of the Unity March. The plaza has relationships with the Harmony Assembly, Unity March, and Tribune Spotlight, all of which are associated with the march event.",
"findings": [
{{
"summary": "Verdant Oasis Plaza as the central location",
"explanation": "Verdant Oasis Plaza is the central entity in this community, serving as the location for the Unity March. This plaza is the common link between all other entities, suggesting its significance in the community. The plaza's association with the march could potentially lead to issues such as public disorder or conflict, depending on the nature of the march and the reactions it provokes."
}},
{{
"summary": "Harmony Assembly's role in the community",
"explanation": "Harmony Assembly is another key entity in this community, being the organizer of the march at Verdant Oasis Plaza. The nature of Harmony Assembly and its march could be a potential source of threat, depending on their objectives and the reactions they provoke. The relationship between Harmony Assembly and the plaza is crucial in understanding the dynamics of this community."
}},
{{
"summary": "Unity March as a significant event",
"explanation": "The Unity March is a significant event taking place at Verdant Oasis Plaza. This event is a key factor in the community's dynamics and could be a potential source of threat, depending on the nature of the march and the reactions it provokes. The relationship between the march and the plaza is crucial in understanding the dynamics of this community."
}},
{{
"summary": "Role of Tribune Spotlight",
"explanation": "Tribune Spotlight is reporting on the Unity March taking place in Verdant Oasis Plaza. This suggests that the event has attracted media attention, which could amplify its impact on the community. The role of Tribune Spotlight could be significant in shaping public perception of the event and the entities involved."
}}
]
}}

## Real Data
Use the following text for your answer. Do not make anything up in your answer.

Text:
```
{input_text}
```

The report should include the following sections:

- TITLE: community's name that represents its key entities - title should be short but specific. When possible, include representative named entities in the title.
- SUMMARY: An executive summary of the community's overall structure, how its entities are related to each other, and significant information associated with its entities.
- DETAILED FINDINGS: A list of 5-10 key insights about the community. Each insight should have a short summary followed by multiple paragraphs of explanatory text grounded according to the grounding rules below. Be comprehensive.

Return output as a well-formed JSON-formatted string with the following format:
{{
"title": ,
"summary": ,
"rating": ,
"rating_explanation": ,
"findings": [
{{
"summary":,
"explanation":
}},
{{
"summary":,
"explanation":
}}
...
]
}}

## Grounding Rules
Do not include information where the supporting evidence for it is not provided.

Output:
"""
+
# Prompt template asking the LLM for a list of scored key points that answer
# a query from the supplied data tables. Replacement fields are
# {context_data} and {query}.
# The format examples spell out the <point>, <description> and <score> tags
# explicitly: the consumer of this prompt's output extracts those tags by
# name, and the examples previously showed no tags at all, leaving the LLM
# nothing to imitate.
GLOBAL_MAP_POINTS = """
You are a helpful assistant responding to questions about data in the tables provided.


---Goal---

Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables.

You should use the data provided in the data tables below as the primary context for generating the response.
If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up.

Each key point in the response should have the following element:
- Description: A comprehensive description of the point.
- Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0.

The response should be HTML formatted as follows:

<points>
<point><description>"Description of point 1..."</description><score>score_value</score></point>
<point><description>"Description of point 2..."</description><score>score_value</score></point>
</points>

The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will".
Do not include information where the supporting evidence for it is not provided.


---Data tables---

{context_data}

---User query---

{query}

---Goal---

Generate a response consisting of a list of key points that responds to the user's question, summarizing all relevant information in the input data tables.

You should use the data provided in the data tables below as the primary context for generating the response.
If you don't know the answer or if the input data tables do not contain sufficient information to provide an answer, just say so. Do not make anything up.

Each key point in the response should have the following element:
- Description: A comprehensive description of the point.
- Importance Score: An integer score between 0-100 that indicates how important the point is in answering the user's question. An 'I don't know' type of response should have a score of 0.

The response shall preserve the original meaning and use of modal verbs such as "shall", "may" or "will".
Do not include information where the supporting evidence for it is not provided.

The response should be HTML formatted as follows:
<points>
<point><description>"Description of point 1..."</description><score>score_value</score></point>
<point><description>"Description of point 2..."</description><score>score_value</score></point>
</points>

"""
+
# Prompt template for answering a user query from supplied context.
# Replacement fields are {query} and {context}.
LOCAL_QUERY = """
## User Query
{query}
## Context
{context}
## Task
Based on given context, please provide a response to the user query.
## Your Response
"""
+
# Prompt template for answering a user query from supplied context; its text
# is identical to LOCAL_QUERY. Replacement fields are {query} and {context}.
GLOBAL_QUERY = """
## User Query
{query}
## Context
{context}
## Task
Based on given context, please provide a response to the user query.
## Your Response
"""
diff --git a/content/TinyGraphRAG/tinygraph/utils.py b/content/TinyGraphRAG/tinygraph/utils.py
new file mode 100644
index 0000000..decaf58
--- /dev/null
+++ b/content/TinyGraphRAG/tinygraph/utils.py
@@ -0,0 +1,55 @@
+import re
+import numpy as np
+from typing import List, Tuple
+from hashlib import md5
+import json
+import os
+
+
def get_text_inside_tag(html_string: str, tag: str):
    """Return the contents of every ``<tag>...</tag>`` pair in a string.

    Args:
        html_string: Text to search.
        tag: Tag name to look for; it is matched literally.

    Returns:
        list[str]: The text between each opening and closing tag, in order of
        appearance. Matching is non-greedy and spans newlines. The list is
        empty when nothing matches.
    """
    # Escape the tag so regex metacharacters in it cannot alter the pattern,
    # and use a raw string to avoid the invalid "\/" escape sequence. The old
    # ``except SyntaxError`` branch could never trigger for re.findall and
    # would itself have failed by raising a plain string.
    escaped = re.escape(tag)
    return re.findall(rf"<{escaped}>(.*?)</{escaped}>", html_string, re.DOTALL)
+
+
def read_json_file(file_path):
    """Load a UTF-8 JSON file, returning ``{}`` when it cannot be read.

    Args:
        file_path: Path of the JSON file.

    Returns:
        The decoded JSON value, or an empty dict if the file is missing,
        unreadable, or does not contain valid UTF-8 JSON.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            return json.load(file)
    # OSError covers missing/unreadable files; ValueError covers both
    # json.JSONDecodeError and UnicodeDecodeError. A bare ``except`` would
    # also swallow KeyboardInterrupt and genuine programming errors.
    except (OSError, ValueError):
        return {}
+
+
def write_json_file(data, file_path):
    """Serialize ``data`` as JSON into ``file_path``, replacing its contents.

    The output is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped.

    Args:
        data: JSON-serializable value to store.
        file_path: Destination path; the file is opened in write mode.
    """
    with open(file_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=4)
+
+
def compute_mdhash_id(content, prefix: str = ""):
    """Build an identifier from the MD5 hex digest of ``content``.

    Args:
        content: Text to hash; it is encoded with ``str.encode()`` defaults.
        prefix: String placed in front of the digest.

    Returns:
        str: ``prefix`` followed by the 32-character hexadecimal MD5 digest.
    """
    digest = md5(content.encode()).hexdigest()
    return prefix + digest
+
+
def save_triplets_to_txt(triplets, file_path):
    """Append one triplet to a text file as a comma-separated line.

    Args:
        triplets: Indexable value whose first three items are written; any
            further items are ignored.
        file_path: File to append to (UTF-8); created if it does not exist.
    """
    head, relation, tail = triplets[0], triplets[1], triplets[2]
    line = "{},{},{}\n".format(head, relation, tail)
    with open(file_path, "a", encoding="utf-8") as handle:
        handle.write(line)
+
+
def cosine_similarity(vector1: List[float], vector2: List[float]) -> float:
    """Compute the cosine similarity of two vectors.

    Args:
        vector1: First vector.
        vector2: Second vector.

    Returns:
        The dot product divided by the product of the two Euclidean norms,
        or ``0`` when that product of norms is zero.
    """
    numerator = np.dot(vector1, vector2)
    norm_first = np.linalg.norm(vector1)
    norm_second = np.linalg.norm(vector2)
    denominator = norm_first * norm_second
    if denominator:
        return numerator / denominator
    return 0
+
+
def create_file_if_not_exists(file_path: str):
    """Create an empty file at ``file_path`` unless the path already exists.

    Args:
        file_path: Path to check and, if absent, create.
    """
    if os.path.exists(file_path):
        # Leave any existing file (and its contents) untouched.
        return
    with open(file_path, "w") as handle:
        handle.write("")