Skip to content

Commit 10f84c9

Browse files
authored
Replace md5 hash (#1470)
* switched hashing function helper to sha256
* refactored references to hashing util
* semversioner
* switched from sha256 to sha512
* new semversioner
* updated tests/verbs/data folder
* generated fresh parquet files in data folder
* moved ignore flag
1 parent d17dfd0 commit 10f84c9

21 files changed

+21
-12
lines changed
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "replaced md5 hash with sha256"
4+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
{
2+
"type": "patch",
3+
"description": "replaced md5 hash with sha512"
4+
}

graphrag/index/flows/create_base_entity_graph.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,5 +169,6 @@ def _prep_communities(communities) -> pd.DataFrame:
169169

170170
def _compute_degree(graph: nx.Graph) -> pd.DataFrame:
171171
return pd.DataFrame([
172-
{"name": node, "degree": int(degree)} for node, degree in graph.degree
173-
]) # type: ignore
172+
{"name": node, "degree": int(degree)}
173+
for node, degree in graph.degree # type: ignore
174+
])

graphrag/index/flows/create_base_text_units.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
from graphrag.index.operations.chunk_text import chunk_text
1818
from graphrag.index.operations.snapshot import snapshot
19-
from graphrag.index.utils.hashing import gen_md5_hash
19+
from graphrag.index.utils.hashing import gen_sha512_hash
2020
from graphrag.storage.pipeline_storage import PipelineStorage
2121

2222

@@ -67,7 +67,7 @@ async def create_base_text_units(
6767
},
6868
inplace=True,
6969
)
70-
chunked["id"] = chunked.apply(lambda row: gen_md5_hash(row, ["chunk"]), axis=1)
70+
chunked["id"] = chunked.apply(lambda row: gen_sha512_hash(row, ["chunk"]), axis=1)
7171
chunked[["document_ids", "chunk", "n_tokens"]] = pd.DataFrame(
7272
chunked["chunk"].tolist(), index=chunked.index
7373
)

graphrag/index/input/csv.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas as pd
1212

1313
from graphrag.index.config.input import PipelineCSVInputConfig, PipelineInputConfig
14-
from graphrag.index.utils.hashing import gen_md5_hash
14+
from graphrag.index.utils.hashing import gen_sha512_hash
1515
from graphrag.logging.base import ProgressReporter
1616
from graphrag.storage.pipeline_storage import PipelineStorage
1717

@@ -42,7 +42,7 @@ async def load_file(path: str, group: dict | None) -> pd.DataFrame:
4242
lambda _row: pd.Series([group[key] for key in additional_keys]), axis=1
4343
)
4444
if "id" not in data.columns:
45-
data["id"] = data.apply(lambda x: gen_md5_hash(x, x.keys()), axis=1)
45+
data["id"] = data.apply(lambda x: gen_sha512_hash(x, x.keys()), axis=1)
4646
if csv_config.source_column is not None and "source" not in data.columns:
4747
if csv_config.source_column not in data.columns:
4848
log.warning(

graphrag/index/input/text.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import pandas as pd
1212

1313
from graphrag.index.config.input import PipelineInputConfig
14-
from graphrag.index.utils.hashing import gen_md5_hash
14+
from graphrag.index.utils.hashing import gen_sha512_hash
1515
from graphrag.logging.base import ProgressReporter
1616
from graphrag.storage.pipeline_storage import PipelineStorage
1717

@@ -36,7 +36,7 @@ async def load_file(
3636
group = {}
3737
text = await storage.get(path, encoding="utf-8")
3838
new_item = {**group, "text": text}
39-
new_item["id"] = gen_md5_hash(new_item, new_item.keys())
39+
new_item["id"] = gen_sha512_hash(new_item, new_item.keys())
4040
new_item["title"] = str(Path(path).name)
4141
return new_item
4242

graphrag/index/utils/hashing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
"""Hashing utilities."""
55

66
from collections.abc import Iterable
7-
from hashlib import md5
7+
from hashlib import sha512
88
from typing import Any
99

1010

11-
def gen_md5_hash(item: dict[str, Any], hashcode: Iterable[str]):
12-
"""Generate an md5 hash."""
11+
def gen_sha512_hash(item: dict[str, Any], hashcode: Iterable[str]):
12+
"""Generate a SHA512 hash."""
1313
hashed = "".join([str(item[column]) for column in hashcode])
14-
return f"{md5(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
14+
return f"{sha512(hashed.encode('utf-8'), usedforsecurity=False).hexdigest()}"
-191 Bytes
Binary file not shown.
4.04 KB
Binary file not shown.
3.73 KB
Binary file not shown.

0 commit comments

Comments
 (0)