microsoft
diff --git a/.semversioner/next-release/patch-20241204203534799756.json b/.semversioner/next-release/patch-20241204203534799756.json
Lines changed: 4 additions & 0 deletions
diff --git a/.semversioner/next-release/patch-20241204211013990211.json b/.semversioner/next-release/patch-20241204211013990211.json
Lines changed: 4 additions & 0 deletions
diff --git a/graphrag/index/flows/create_base_entity_graph.py b/graphrag/index/flows/create_base_entity_graph.py
Lines changed: 3 additions & 2 deletions
diff --git a/graphrag/index/flows/create_base_text_units.py b/graphrag/index/flows/create_base_text_units.py
Lines changed: 2 additions & 2 deletions
diff --git a/graphrag/index/input/csv.py b/graphrag/index/input/csv.py
Lines changed: 2 additions & 2 deletions
diff --git a/graphrag/index/input/text.py b/graphrag/index/input/text.py
Lines changed: 2 additions & 2 deletions
diff --git a/graphrag/index/utils/hashing.py b/graphrag/index/utils/hashing.py
Lines changed: 4 additions & 4 deletions
diff --git a/tests/verbs/data/base_communities.parquet b/tests/verbs/data/base_communities.parquet
-191 Bytes
diff --git a/tests/verbs/data/base_entity_nodes.parquet b/tests/verbs/data/base_entity_nodes.parquet
4.04 KB
diff --git a/tests/verbs/data/base_relationship_edges.parquet b/tests/verbs/data/base_relationship_edges.parquet
3.73 KB
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "replaced md5 hash with sha512"
+}
@@ -0,0 +1,4 @@
+{
+  "type": "patch",
+  "description": "replaced md5 hash with sha512"
+}
@@ -169,5 +169,6 @@ def _prep_communities(communities) -> pd.DataFrame:
 
 def _compute_degree(graph: nx.Graph) -> pd.DataFrame:
     return pd.DataFrame([
-        {"name": node, "degree": int(degree)} for node, degree in graph.degree
-    ])  # type: ignore
+        {"name": node, "degree": int(degree)}
+        for node, degree in graph.degree  # type: ignore
+    ])
@@ -16,7 +16,7 @@
 
 from graphrag.index.operations.chunk_text import chunk_text
 from graphrag.index.operations.snapshot import snapshot
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.storage.pipeline_storage import PipelineStorage
 
 
@@ -67,7 +67,7 @@ async def create_base_text_units(
         },
         inplace=True,
     )
-    chunked["id"] = chunked.apply(lambda row: gen_md5_hash(row, ["chunk"]), axis=1)
+    chunked["id"] = chunked.apply(lambda row: gen_sha512_hash(row, ["chunk"]), axis=1)
     chunked[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
         chunked["chunk"].tolist(), index=chunked.index
     )
 
@@ -11,7 +11,7 @@
 import pandas as pd
 
 from graphrag.index.config.input import PipelineCSVInputConfig, PipelineInputConfig
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.logging.base import ProgressReporter
 from graphrag.storage.pipeline_storage import PipelineStorage
 
@@ -42,7 +42,7 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
                 lambda _row: pd.Series([group[key] for key in additional_keys]), axis=1
             )
         if "id" not in data.columns:
-            data["id"] = data.apply(lambda x: gen_md5_hash(x, x.keys()), axis=1)
+            data["id"] = data.apply(lambda x: gen_sha512_hash(x, x.keys()), axis=1)
         if csv_config.source_column is not None and "source" not in data.columns:
             if csv_config.source_column not in data.columns:
                 log.warning(
 
@@ -11,7 +11,7 @@
 import pandas as pd
 
 from graphrag.index.config.input import PipelineInputConfig
-from graphrag.index.utils.hashing import gen_md5_hash
+from graphrag.index.utils.hashing import gen_sha512_hash
 from graphrag.logging.base import ProgressReporter
 from graphrag.storage.pipeline_storage import PipelineStorage
 
@@ -36,7 +36,7 @@ async def load_file(
             group = {}
         text = await storage.get(path, encoding="utf-8")
         new_item = {**group, "text": text}
-        new_item["id"] = gen_md5_hash(new_item, new_item.keys())
+        new_item["id"] = gen_sha512_hash(new_item, new_item.keys())
         new_item["title"] = str(Path(path).name)
         return new_item
 
 
@@ -4,11 +4,11 @@
 """Hashing utilities."""
 
 from collections.abc import Iterable
-from hashlib import md5
+from hashlib import sha512
 from typing import Any
 
 
-def gen_md5_hash(item: dict[str, Any], hashcode: Iterable[str]):
-    """Generate an md5 hash."""
+def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]):
+    """Generate a SHA512 hash."""
     hashed = "".join([str(item[column]) for column in hashcode])
-    return f"{md5(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
+    return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
-Original file line number
+Diff line change
@@ -0,0 +1,4 @@
 +{
 +  "type": "patch",
 +  "description": "replaced md5 hash with sha512"
 +}
Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@`
`16`	`16`
`17`	`17`	`from graphrag.index.operations.chunk_text import chunk_text`
`18`	`18`	`from graphrag.index.operations.snapshot import snapshot`
`19`		`-from graphrag.index.utils.hashing import gen_md5_hash`
	`19`	`+from graphrag.index.utils.hashing import gen_sha512_hash`
`20`	`20`	`from graphrag.storage.pipeline_storage import PipelineStorage`
`21`	`21`
`22`	`22`
`@@ -67,7 +67,7 @@ async def create_base_text_units(`
`67`	`67`	`},`
`68`	`68`	`inplace=True,`
`69`	`69`	`)`
`70`		`- chunked["id"] = chunked.apply(lambda row: gen_md5_hash(row, ["chunk"]), axis=1)`
	`70`	`+ chunked["id"] = chunked.apply(lambda row: gen_sha512_hash(row, ["chunk"]), axis=1)`
`71`	`71`	`chunked[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(`
`72`	`72`	`chunked["chunk"].tolist(), index=chunked.index`
`73`	`73`	`)`