Skip to content

Commit b96eed9

Browse files
rhdedgariamemilio
authored and committed
feat: migrate to FIPS-validated cryptographic algorithms (llamastack#3423)
# What does this PR do? Migrates MD5 and SHA-1 hash algorithms to SHA-256. In particular, replaces: - MD5 in chunk ID generation. - MD5 in file verification. - SHA-1 in model identifier digests. And updates all related test expectations. Original discussion: llamastack#3413 <!-- If resolving an issue, uncomment and update the line below --> Closes llamastack#3424. ## Test Plan Unit tests from scripts/unit-tests.sh were updated to match the new hash output, and ran to verify the tests pass. Signed-off-by: Doug Edgar <[email protected]>
1 parent 14baf7e commit b96eed9

File tree

4 files changed

+16
-21
lines changed

4 files changed

+16
-21
lines changed

llama_stack/cli/verify_download.py

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,26 +48,23 @@ def setup_verify_download_parser(parser: argparse.ArgumentParser) -> None:
4848
parser.set_defaults(func=partial(run_verify_cmd, parser=parser))
4949

5050

51-
def calculate_md5(filepath: Path, chunk_size: int = 8192) -> str:
52-
# NOTE: MD5 is used here only for download integrity verification,
53-
# not for security purposes
54-
# TODO: switch to SHA256
55-
md5_hash = hashlib.md5(usedforsecurity=False)
51+
def calculate_sha256(filepath: Path, chunk_size: int = 8192) -> str:
52+
sha256_hash = hashlib.sha256()
5653
with open(filepath, "rb") as f:
5754
for chunk in iter(lambda: f.read(chunk_size), b""):
58-
md5_hash.update(chunk)
59-
return md5_hash.hexdigest()
55+
sha256_hash.update(chunk)
56+
return sha256_hash.hexdigest()
6057

6158

6259
def load_checksums(checklist_path: Path) -> dict[str, str]:
6360
checksums = {}
6461
with open(checklist_path) as f:
6562
for line in f:
6663
if line.strip():
67-
md5sum, filepath = line.strip().split(" ", 1)
64+
sha256sum, filepath = line.strip().split(" ", 1)
6865
# Remove leading './' if present
6966
filepath = filepath.lstrip("./")
70-
checksums[filepath] = md5sum
67+
checksums[filepath] = sha256sum
7168
return checksums
7269

7370

@@ -88,7 +85,7 @@ def verify_files(model_dir: Path, checksums: dict[str, str], console: Console) -
8885
matches = False
8986

9087
if exists:
91-
actual_hash = calculate_md5(full_path)
88+
actual_hash = calculate_sha256(full_path)
9289
matches = actual_hash == expected_hash
9390

9491
results.append(

llama_stack/providers/utils/vector_io/vector_utils.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,12 @@
1212
def generate_chunk_id(document_id: str, chunk_text: str, chunk_window: str | None = None) -> str:
1313
"""
1414
Generate a unique chunk ID using a hash of the document ID and chunk text.
15-
16-
Note: MD5 is used only to calculate an identifier, not for security purposes.
17-
Adding usedforsecurity=False for compatibility with FIPS environments.
15+
Then use the first 32 characters of the hash to create a UUID.
1816
"""
1917
hash_input = f"{document_id}:{chunk_text}".encode()
2018
if chunk_window:
2119
hash_input += f":{chunk_window}".encode()
22-
return str(uuid.UUID(hashlib.md5(hash_input, usedforsecurity=False).hexdigest()))
20+
return str(uuid.UUID(hashlib.sha256(hash_input).hexdigest()[:32]))
2321

2422

2523
def proper_case(s: str) -> str:

llama_stack/testing/inference_recorder.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def _extract_model_identifiers():
211211
return sorted(set(idents))
212212

213213
identifiers = _extract_model_identifiers()
214-
return hashlib.sha1(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
214+
return hashlib.sha256(("|".join(identifiers)).encode("utf-8")).hexdigest()[:8]
215215

216216

217217
def _combine_model_list_responses(endpoint: str, records: list[dict[str, Any]]) -> dict[str, Any] | None:

tests/unit/providers/vector_io/test_vector_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,24 @@ def test_generate_chunk_id():
2626

2727
chunk_ids = sorted([chunk.chunk_id for chunk in chunks])
2828
assert chunk_ids == [
29-
"177a1368-f6a8-0c50-6e92-18677f2c3de3",
30-
"bc744db3-1b25-0a9c-cdff-b6ba3df73c36",
31-
"f68df25d-d9aa-ab4d-5684-64a233add20d",
29+
"31d1f9a3-c8d2-66e7-3c37-af2acd329778",
30+
"d07dade7-29c0-cda7-df29-0249a1dcbc3e",
31+
"d14f75a1-5855-7f72-2c78-d9fc4275a346",
3232
]
3333

3434

3535
def test_generate_chunk_id_with_window():
3636
chunk = Chunk(content="test", metadata={"document_id": "doc-1"})
3737
chunk_id1 = generate_chunk_id("doc-1", chunk, chunk_window="0-1")
3838
chunk_id2 = generate_chunk_id("doc-1", chunk, chunk_window="1-2")
39-
assert chunk_id1 == "149018fe-d0eb-0f8d-5f7f-726bdd2aeedb"
40-
assert chunk_id2 == "4562c1ee-9971-1f3b-51a6-7d05e5211154"
39+
assert chunk_id1 == "8630321a-d9cb-2bb6-cd28-ebf68dafd866"
40+
assert chunk_id2 == "13a1c09a-cbda-b61a-2d1a-7baa90888685"
4141

4242

4343
def test_chunk_id():
4444
# Test with existing chunk ID
4545
chunk_with_id = Chunk(content="test", metadata={"document_id": "existing-id"})
46-
assert chunk_with_id.chunk_id == "84ededcc-b80b-a83e-1a20-ca6515a11350"
46+
assert chunk_with_id.chunk_id == "11704f92-42b6-61df-bf85-6473e7708fbd"
4747

4848
# Test with document ID in metadata
4949
chunk_with_doc_id = Chunk(content="test", metadata={"document_id": "doc-1"})

0 commit comments

Comments
 (0)