diff --git a/src/sagemaker/git_utils.py b/src/sagemaker/git_utils.py index 49d151a00b..25e745446a 100644 --- a/src/sagemaker/git_utils.py +++ b/src/sagemaker/git_utils.py @@ -14,14 +14,78 @@ from __future__ import absolute_import import os -from pathlib import Path +import re import subprocess import tempfile import warnings +from pathlib import Path +from urllib.parse import urlparse + import six from six.moves import urllib +def _sanitize_git_url(repo_url): + """Sanitize Git repository URL to prevent URL injection attacks. + + Args: + repo_url (str): The Git repository URL to sanitize + + Returns: + str: The sanitized URL + + Raises: + ValueError: If the URL contains suspicious patterns that could indicate injection + """ + at_count = repo_url.count("@") + + if repo_url.startswith("git@"): + # git@ format requires exactly one @ + if at_count != 1: + raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol") + elif repo_url.startswith("ssh://"): + # ssh:// format can have 0 or 1 @ symbols + if at_count > 1: + raise ValueError("Invalid SSH URL format: multiple @ symbols detected") + elif repo_url.startswith("https://") or repo_url.startswith("http://"): + # HTTPS format allows 0 or 1 @ symbols + if at_count > 1: + raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected") + + # Check for invalid characters in the URL before parsing + # These characters should not appear in legitimate URLs + invalid_chars = ["<", ">", "[", "]", "{", "}", "\\", "^", "`", "|"] + for char in invalid_chars: + if char in repo_url: + raise ValueError("Invalid characters in hostname") + + try: + parsed = urlparse(repo_url) + + # Check for suspicious characters in hostname that could indicate injection + if parsed.hostname: + # Check for URL-encoded characters that might be used for obfuscation + suspicious_patterns = ["%25", "%40", "%2F", "%3A"] # encoded %, @, /, : + for pattern in suspicious_patterns: + if pattern in parsed.hostname.lower(): + raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}") + + # Validate that the hostname looks legitimate + if not re.match(r"^[a-zA-Z0-9.-]+$", parsed.hostname): + raise ValueError("Invalid characters in hostname") + + except Exception as e: + if isinstance(e, ValueError): + raise + raise ValueError(f"Failed to parse URL: {str(e)}") + else: + raise ValueError( + "Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed" + ) + + return repo_url + + def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): """Git clone repo containing the training code and serving code. @@ -87,6 +151,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None): if entry_point is None: raise ValueError("Please provide an entry point.") _validate_git_config(git_config) + + # SECURITY: Sanitize the repository URL to prevent injection attacks + git_config["repo"] = _sanitize_git_url(git_config["repo"]) + dest_dir = tempfile.mkdtemp() _generate_and_run_clone_command(git_config, dest_dir) diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py index 11cc83a463..cfb243b563 100644 --- a/tests/unit/test_estimator.py +++ b/tests/unit/test_estimator.py @@ -2794,7 +2794,7 @@ def test_git_support_bad_repo_url_format(sagemaker_session): ) with pytest.raises(ValueError) as error: fw.fit() - assert "Invalid Git url provided." in str(error) + assert "Unsupported URL scheme" in str(error) @patch( diff --git a/tests/unit/test_git_utils.py b/tests/unit/test_git_utils.py index 03bbc1ebcd..2d10ac7619 100644 --- a/tests/unit/test_git_utils.py +++ b/tests/unit/test_git_utils.py @@ -12,11 +12,12 @@ # language governing permissions and limitations under the License. from __future__ import absolute_import -import pytest import os -from pathlib import Path import subprocess -from mock import patch, ANY +from pathlib import Path + +import pytest +from mock import ANY, patch from sagemaker import git_utils @@ -494,3 +495,212 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte with pytest.raises(subprocess.CalledProcessError) as error: git_utils.git_clone_repo(git_config, entry_point) assert "returned non-zero exit status" in str(error.value) + + +class TestGitUrlSanitization: + """Test cases for Git URL sanitization to prevent injection attacks.""" + + def test_sanitize_git_url_valid_https_urls(self): + """Test that valid HTTPS URLs pass sanitization.""" + valid_urls = [ + "https://github.com/user/repo.git", + "https://gitlab.com/user/repo.git", + "https://token@github.com/user/repo.git", + "https://user:pass@github.com/user/repo.git", + "http://internal-git.company.com/repo.git", + ] + + for url in valid_urls: + # Should not raise any exception + result = git_utils._sanitize_git_url(url) + assert result == url + + def test_sanitize_git_url_valid_ssh_urls(self): + """Test that valid SSH URLs pass sanitization.""" + valid_urls = [ + "git@github.com:user/repo.git", + "git@gitlab.com:user/repo.git", + "ssh://git@github.com/user/repo.git", + "ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/", # 0 @ symbols - valid for ssh:// + "git@internal-git.company.com:repo.git", + ] + + for url in valid_urls: + # Should not raise any exception + result = git_utils._sanitize_git_url(url) + assert result == url + + def test_sanitize_git_url_blocks_multiple_at_https(self): + """Test that HTTPS URLs with multiple @ symbols are blocked.""" + malicious_urls = [ + "https://user@attacker.com@github.com/repo.git", + "https://token@evil.com@gitlab.com/user/repo.git", + "https://a@b@c@github.com/repo.git", + "https://user@malicious-host@github.com/legit/repo.git", + ] + + for url in malicious_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "multiple @ symbols detected" in str(error.value) + + def test_sanitize_git_url_blocks_multiple_at_ssh(self): + """Test that SSH URLs with multiple @ symbols are blocked.""" + malicious_urls = [ + "git@attacker.com@github.com:repo.git", + "git@evil@gitlab.com:user/repo.git", + "ssh://git@malicious@github.com/repo.git", + "git@a@b@c:repo.git", + ] + + for url in malicious_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + # git@ URLs should give "exactly one @ symbol" error + # ssh:// URLs should give "multiple @ symbols detected" error + assert any( + phrase in str(error.value) + for phrase in ["multiple @ symbols detected", "exactly one @ symbol"] + ) + + def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self): + """Test that invalid schemes and git@ format violations are blocked.""" + # Test unsupported schemes + unsupported_scheme_urls = [ + "git-github.com:user/repo.git", # Doesn't start with git@, ssh://, http://, https:// + ] + + for url in unsupported_scheme_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "Unsupported URL scheme" in str(error.value) + + # Test git@ URLs with wrong @ count + invalid_git_at_urls = [ + "git@github.com@evil.com:repo.git", # 2 @ symbols + ] + + for url in invalid_git_at_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "exactly one @ symbol" in str(error.value) + + def test_sanitize_git_url_blocks_url_encoding_obfuscation(self): + """Test that URL-encoded obfuscation attempts are blocked.""" + obfuscated_urls = [ + "https://github.com%25evil.com/repo.git", + "https://user@github.com%40attacker.com/repo.git", + "https://github.com%2Fevil.com/repo.git", + "https://github.com%3Aevil.com/repo.git", + ] + + for url in obfuscated_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + # The error could be either suspicious encoding or invalid characters + assert any( + phrase in str(error.value) + for phrase in ["Suspicious URL encoding detected", "Invalid characters in hostname"] + ) + + def test_sanitize_git_url_blocks_invalid_hostname_chars(self): + """Test that hostnames with invalid characters are blocked.""" + invalid_urls = [ + "https://github", + ] + + for url in unsupported_urls: + with pytest.raises(ValueError) as error: + git_utils._sanitize_git_url(url) + assert "Unsupported URL scheme" in str(error.value) + + def test_git_clone_repo_blocks_malicious_https_url(self): + """Test that git_clone_repo blocks malicious HTTPS URLs.""" + malicious_git_config = { + "repo": "https://user@attacker.com@github.com/legit/repo.git", + "branch": "main", + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "multiple @ symbols detected" in str(error.value) + + def test_git_clone_repo_blocks_malicious_ssh_url(self): + """Test that git_clone_repo blocks malicious SSH URLs.""" + malicious_git_config = { + "repo": "git@OBVIOUS@github.com:sage-maker/temp-sev2.git", + "branch": "main", + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "exactly one @ symbol" in str(error.value) + + def test_git_clone_repo_blocks_url_encoded_attack(self): + """Test that git_clone_repo blocks URL-encoded attacks.""" + malicious_git_config = { + "repo": "https://github.com%40attacker.com/repo.git", + "branch": "main", + } + entry_point = "train.py" + + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(malicious_git_config, entry_point) + assert "Suspicious URL encoding detected" in str(error.value) + + def test_sanitize_git_url_comprehensive_attack_scenarios(self): + attack_scenarios = [ + # Original PoC attack + "https://USER@YOUR_NGROK_OR_LOCALHOST/malicious.git@github.com%25legit%25repo.git", + # Variations of the attack + "https://user@malicious-host@github.com/legit/repo.git", + "git@attacker.com@github.com:user/repo.git", + "ssh://git@evil.com@github.com/repo.git", + # URL encoding variations + "https://github.com%40evil.com/repo.git", + "https://user@github.com%2Fevil.com/repo.git", + ] + + entry_point = "train.py" + + for malicious_url in attack_scenarios: + git_config = {"repo": malicious_url} + with pytest.raises(ValueError) as error: + git_utils.git_clone_repo(git_config, entry_point) + # Should be blocked by sanitization + assert any( + phrase in str(error.value) + for phrase in [ + "multiple @ symbols detected", + "exactly one @ symbol", + "Suspicious URL encoding detected", + "Invalid characters in hostname", + ] + )