Skip to content

Commit b96eed9

Browse files
rhdedgariamemilio
authored and committed
feat: migrate to FIPS-validated cryptographic algorithms (llamastack#3423)
# What does this PR do? Migrates MD5 and SHA-1 hash algorithms to SHA-256. In particular, replaces: - MD5 in chunk ID generation. - MD5 in file verification. - SHA-1 in model identifier digests. And updates all related test expectations. Original discussion: llamastack#3413 <!-- If resolving an issue, uncomment and update the line below --> Closes llamastack#3424. ## Test Plan Unit tests from scripts/unit-tests.sh were updated to match the new hash output, and ran to verify the tests pass. Signed-off-by: Doug Edgar <[email protected]>
1 parent 14baf7e commit b96eed9

File tree

4 files changed

+16
-21
lines changed

4 files changed

+16
-21
lines changed

llama_stack/cli/verify_download.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,26 +48,23 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
4848
parser.set_defaults(func=partial(run_verify_cmd, parser=parser))
4949

5050

51-
def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
52-
# NOTE: MD5 is used here only for download integrity verification,
53-
# not for security purposes
54-
# TODO: switch to SHA256
55-
md5_hash = hashlib.md5(usedforsecurity=False)
51+
def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
52+
sha256_hash = hashlib.sha256()
5653
with open(filepath, "rb") as f:
5754
for chunk in iter(lambda: f.read(chunk_size), b""):
58-
md5_hash.update(chunk)
59-
return md5_hash.hexdigest()
55+
sha256_hash.update(chunk)
56+
return sha256_hash.hexdigest()
6057

6158

6259
def load_checksums(checklist_path: Path) -> dict[str, str]:
6360
checksums = {}
6461
with open(checklist_path) as f:
6562
for line in f:
6663
if line.strip():
67-
md5sum, filepath = line.strip().split(" ", 1)
64+
sha256sum, filepath = line.strip().split(" ", 1)
6865
# Remove leading './' if present
6966
filepath = filepath.lstrip("./")
70-
checksums[filepath] = md5sum
67+
checksums[filepath] = sha256sum
7168
return checksums
7269

7370

@@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -
8885
matches = False
8986

9087
if exists:
91-
actual_hash = calculate_md5(full_path)
88+
actual_hash = calculate_sha256(full_path)
9289
matches = actual_hash == expected_hash
9390

9491
results.append(

llama_stack/providers/utils/vector_io/vector_utils.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,12 @@
1212
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
1313
"""
1414
Generate a unique chunk ID using a hash of the document ID and chunk text.
15-
16-
Note: MD5 is used only to calculate an identifier, not for security purposes.
17-
Adding usedforsecurity=False for compatibility with FIPS environments.
15+
Then use the first 32 characters of the hash to create a UUID.
1816
"""
1917
hash_input = f"{document_id}:{chunk_text}".encode()
2018
if chunk_window:
2119
hash_input += f":{chunk_window}".encode()
22-
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))
20+
return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))
2321

2422

2523
def proper_case(s: str) -> str:

llama_stack/testing/inference_recorder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def _extract_model_identifiers():
211211
return sorted(set(idents))
212212

213213
identifiers = _extract_model_identifiers()
214-
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
214+
return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
215215

216216

217217
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:

tests/unit/providers/vector_io/test_vector_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,24 @@ def test_generate_chunk_id():
2626

2727
chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
2828
assert chunk_ids == [
29-
"177a1368-f6a8-0c50-6e92-18677f2c3de3",
30-
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
31-
"f68df25d-d9aa-ab4d-5684-64a233add20d",
29+
"31d1f9a3-c8d2-66e7-3c37-af2acd329778",
30+
"d07dade7-29c0-cda7-df29-0249a1dcbc3e",
31+
"d14f75a1-5855-7f72-2c78-d9fc4275a346",
3232
]
3333

3434

3535
def test_generate_chunk_id_with_window():
3636
chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
3737
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
3838
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
39-
assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb"
40-
assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154"
39+
assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
40+
assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
4141

4242

4343
def test_chunk_id():
4444
# Test with existing chunk ID
4545
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
46-
assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350"
46+
assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
4747

4848
# Test with document ID in metadata
4949
chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})

0 commit comments

Comments
 (0)