From dcdd99f911e8b1a05d19cf1ad939b0fefae47864 Mon Sep 17 00:00:00 2001 From: Brock Wade Date: Mon, 16 Dec 2024 12:56:02 -0800 Subject: [PATCH 1/4] fix: security update -> use sha256 instead of md5 for file hashing --- src/sagemaker/workflow/utilities.py | 54 ++++++++++++++--------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/src/sagemaker/workflow/utilities.py b/src/sagemaker/workflow/utilities.py index 4ef5ad5dd2..4fc98eb29a 100644 --- a/src/sagemaker/workflow/utilities.py +++ b/src/sagemaker/workflow/utilities.py @@ -268,29 +268,29 @@ def get_config_hash(step: Entity): def hash_object(obj) -> str: - """Get the MD5 hash of an object. + """Get the SHA256 hash of an object. Args: obj (dict): The object Returns: - str: The MD5 hash of the object + str: The SHA256 hash of the object """ - return hashlib.md5(str(obj).encode()).hexdigest() + return hashlib.sha256(str(obj).encode()).hexdigest() def hash_file(path: str) -> str: - """Get the MD5 hash of a file. + """Get the SHA256 hash of a file. Args: path (str): The local path for the file. Returns: - str: The MD5 hash of the file. + str: The SHA256 hash of the file. """ - return _hash_file(path, hashlib.md5()).hexdigest() + return _hash_file(path, hashlib.sha256()).hexdigest() def hash_files_or_dirs(paths: List[str]) -> str: - """Get the MD5 hash of the contents of a list of files or directories. + """Get the SHA256 hash of the contents of a list of files or directories. Hash is changed if: * input list is changed @@ -301,58 +301,58 @@ def hash_files_or_dirs(paths: List[str]) -> str: Args: paths: List of file or directory paths Returns: - str: The MD5 hash of the list of files or directories. + str: The SHA256 hash of the list of files or directories. """ - md5 = hashlib.md5() + sha256 = hashlib.sha256() for path in sorted(paths): - md5 = _hash_file_or_dir(path, md5) - return md5.hexdigest() + sha256 = _hash_file_or_dir(path, sha256) + return sha256.hexdigest() -def _hash_file_or_dir(path: str, md5: Hash) -> Hash: +def _hash_file_or_dir(path: str, sha256: Hash) -> Hash: """Updates the inputted Hash with the contents of the current path. Args: path: path of file or directory Returns: - str: The MD5 hash of the file or directory + str: The SHA256 hash of the file or directory """ if isinstance(path, str) and path.lower().startswith("file://"): path = unquote(urlparse(path).path) - md5.update(path.encode()) + sha256.update(path.encode()) if Path(path).is_dir(): - md5 = _hash_dir(path, md5) + sha256 = _hash_dir(path, sha256) elif Path(path).is_file(): - md5 = _hash_file(path, md5) - return md5 + sha256 = _hash_file(path, sha256) + return sha256 -def _hash_dir(directory: Union[str, Path], md5: Hash) -> Hash: +def _hash_dir(directory: Union[str, Path], sha256: Hash) -> Hash: """Updates the inputted Hash with the contents of the current path. Args: directory: path of the directory Returns: - str: The MD5 hash of the directory + str: The SHA256 hash of the directory """ if not Path(directory).is_dir(): raise ValueError(str(directory) + " is not a valid directory") for path in sorted(Path(directory).iterdir()): - md5.update(path.name.encode()) + sha256.update(path.name.encode()) if path.is_file(): - md5 = _hash_file(path, md5) + sha256 = _hash_file(path, sha256) elif path.is_dir(): - md5 = _hash_dir(path, md5) - return md5 + sha256 = _hash_dir(path, sha256) + return sha256 -def _hash_file(file: Union[str, Path], md5: Hash) -> Hash: +def _hash_file(file: Union[str, Path], sha256: Hash) -> Hash: """Updates the inputted Hash with the contents of the current path. Args: file: path of the file Returns: - str: The MD5 hash of the file + str: The SHA256 hash of the file """ if isinstance(file, str) and file.lower().startswith("file://"): file = unquote(urlparse(file).path) @@ -363,8 +363,8 @@ def _hash_file(file: Union[str, Path], md5: Hash) -> Hash: data = f.read(BUF_SIZE) if not data: break - md5.update(data) - return md5 + sha256.update(data) + return sha256 def validate_step_args_input( From c39700d6032432ef1d6c3950ab9b745d9557c033 Mon Sep 17 00:00:00 2001 From: Brock Wade Date: Mon, 16 Dec 2024 12:56:02 -0800 Subject: [PATCH 2/4] fix: security update -> use sha256 instead of md5 for file hashing --- tests/unit/sagemaker/workflow/test_steps.py | 2 +- tests/unit/sagemaker/workflow/test_utilities.py | 4 ++-- tests/unit/sagemaker/workflow/test_utils.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/sagemaker/workflow/test_steps.py b/tests/unit/sagemaker/workflow/test_steps.py index b3d667a1c3..248fee6532 100644 --- a/tests/unit/sagemaker/workflow/test_steps.py +++ b/tests/unit/sagemaker/workflow/test_steps.py @@ -671,7 +671,7 @@ def test_processing_step_normalizes_args_with_local_code(mock_normalize_args, sc mock_normalize_args.return_value = [step.inputs, step.outputs] step.to_request() mock_normalize_args.assert_called_with( - job_name="MyProcessingStep-3e89f0c7e101c356cbedf27d9d27e9db", + job_name="MyProcessingStep-a22fc59b38f13da26f6a40b18687ba598cf669f74104b793cefd9c63eddf4ac7", arguments=step.job_arguments, inputs=step.inputs, outputs=step.outputs, diff --git a/tests/unit/sagemaker/workflow/test_utilities.py b/tests/unit/sagemaker/workflow/test_utilities.py index e65d3ea933..b284ced91e 100644 --- a/tests/unit/sagemaker/workflow/test_utilities.py +++ b/tests/unit/sagemaker/workflow/test_utilities.py @@ -31,14 +31,14 @@ def test_hash_file(): with tempfile.NamedTemporaryFile() as tmp: tmp.write("hashme".encode()) hash = hash_file(tmp.name) - assert hash == "d41d8cd98f00b204e9800998ecf8427e" + assert hash == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" def test_hash_file_uri(): with tempfile.NamedTemporaryFile() as tmp: tmp.write("hashme".encode()) hash = hash_file(f"file:///{tmp.name}") - assert hash == "d41d8cd98f00b204e9800998ecf8427e" + assert hash == "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855" def test_hash_files_or_dirs_with_file(): diff --git a/tests/unit/sagemaker/workflow/test_utils.py b/tests/unit/sagemaker/workflow/test_utils.py index 48b1d762c3..6e9f4ecdcb 100644 --- a/tests/unit/sagemaker/workflow/test_utils.py +++ b/tests/unit/sagemaker/workflow/test_utils.py @@ -82,7 +82,7 @@ def test_repack_model_step(estimator): assert hyperparameters["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' assert ( hyperparameters["sagemaker_submit_directory"] - == '"s3://my-bucket/MyRepackModelStep-b5ea77f701b47a8d075605497462ccc2/source/sourcedir.tar.gz"' + == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e/source/sourcedir.tar.gz"' ) del request_dict["Arguments"]["HyperParameters"] From e0c8a007181788789530fbf188c7c7f2fcd70576 Mon Sep 17 00:00:00 2001 From: Brock Wade Date: Tue, 17 Dec 2024 23:39:35 -0800 Subject: [PATCH 3/4] fix flake8 --- tests/unit/sagemaker/workflow/test_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/unit/sagemaker/workflow/test_utils.py b/tests/unit/sagemaker/workflow/test_utils.py index 6e9f4ecdcb..0100045dc8 100644 --- a/tests/unit/sagemaker/workflow/test_utils.py +++ b/tests/unit/sagemaker/workflow/test_utils.py @@ -82,7 +82,8 @@ def test_repack_model_step(estimator): assert hyperparameters["sagemaker_program"] == f'"{REPACK_SCRIPT_LAUNCHER}"' assert ( hyperparameters["sagemaker_submit_directory"] - == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e/source/sourcedir.tar.gz"' + == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e\ + /source/sourcedir.tar.gz"' ) del request_dict["Arguments"]["HyperParameters"] From d0e9adfb73677b1fadd02a2e8dc0ef9ad3967ef7 Mon Sep 17 00:00:00 2001 From: Brock Wade Date: Wed, 18 Dec 2024 00:49:26 -0800 Subject: [PATCH 4/4] fix: test spacing --- tests/unit/sagemaker/workflow/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/sagemaker/workflow/test_utils.py b/tests/unit/sagemaker/workflow/test_utils.py index 0100045dc8..e16293a1c5 100644 --- a/tests/unit/sagemaker/workflow/test_utils.py +++ b/tests/unit/sagemaker/workflow/test_utils.py @@ -83,7 +83,7 @@ def test_repack_model_step(estimator): assert ( hyperparameters["sagemaker_submit_directory"] == '"s3://my-bucket/MyRepackModelStep-717d7bdd388168c27e9ad2938ff0314e35be50b3157cf2498688c7525ea27e1e\ - /source/sourcedir.tar.gz"' +/source/sourcedir.tar.gz"' ) del request_dict["Arguments"]["HyperParameters"]