Skip to content

Commit 171b8a1

Browse files
committed
fix: sanitize git clone repo input url
1 parent d408594 commit 171b8a1

File tree

2 files changed

+273
-4
lines changed

2 files changed

+273
-4
lines changed

src/sagemaker/git_utils.py

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,69 @@
1414
from __future__ import absolute_import
1515

1616
import os
17-
from pathlib import Path
17+
import re
1818
import subprocess
1919
import tempfile
2020
import warnings
21+
from pathlib import Path
22+
from urllib.parse import urlparse
23+
2124
import six
2225
from six.moves import urllib
2326

2427

28+
def _sanitize_git_url(repo_url):
29+
"""Sanitize Git repository URL to prevent URL injection attacks.
30+
31+
Args:
32+
repo_url (str): The Git repository URL to sanitize
33+
34+
Returns:
35+
str: The sanitized URL
36+
37+
Raises:
38+
ValueError: If the URL contains suspicious patterns that could indicate injection
39+
"""
40+
at_count = repo_url.count('@')
41+
42+
if repo_url.startswith('git@'):
43+
# git@ format requires exactly one @
44+
if at_count != 1:
45+
raise ValueError("Invalid SSH URL format: git@ URLs must have exactly one @ symbol")
46+
elif repo_url.startswith('ssh://'):
47+
# ssh:// format can have 0 or 1 @ symbols
48+
if at_count > 1:
49+
raise ValueError("Invalid SSH URL format: multiple @ symbols detected")
50+
elif repo_url.startswith('https://') or repo_url.startswith('http://'):
51+
# HTTPS format allows 0 or 1 @ symbols
52+
if at_count > 1:
53+
raise ValueError("Invalid HTTPS URL format: multiple @ symbols detected")
54+
55+
try:
56+
parsed = urlparse(repo_url)
57+
58+
# Check for suspicious characters in hostname that could indicate injection
59+
if parsed.hostname:
60+
# Check for URL-encoded characters that might be used for obfuscation
61+
suspicious_patterns = ['%25', '%40', '%2F', '%3A'] # encoded %, @, /, :
62+
for pattern in suspicious_patterns:
63+
if pattern in parsed.hostname.lower():
64+
raise ValueError(f"Suspicious URL encoding detected in hostname: {pattern}")
65+
66+
# Validate that the hostname looks legitimate
67+
if not re.match(r'^[a-zA-Z0-9.-]+$', parsed.hostname):
68+
raise ValueError("Invalid characters in hostname")
69+
70+
except Exception as e:
71+
if isinstance(e, ValueError):
72+
raise
73+
raise ValueError(f"Failed to parse URL: {str(e)}")
74+
else:
75+
raise ValueError("Unsupported URL scheme: only https://, http://, git@, and ssh:// are allowed")
76+
77+
return repo_url
78+
79+
2580
def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
2681
"""Git clone repo containing the training code and serving code.
2782
@@ -87,6 +142,10 @@ def git_clone_repo(git_config, entry_point, source_dir=None, dependencies=None):
87142
if entry_point is None:
88143
raise ValueError("Please provide an entry point.")
89144
_validate_git_config(git_config)
145+
146+
# SECURITY: Sanitize the repository URL to prevent injection attacks
147+
git_config["repo"] = _sanitize_git_url(git_config["repo"])
148+
90149
dest_dir = tempfile.mkdtemp()
91150
_generate_and_run_clone_command(git_config, dest_dir)
92151

tests/unit/test_git_utils.py

Lines changed: 213 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
import pytest
1615
import os
17-
from pathlib import Path
1816
import subprocess
19-
from mock import patch, ANY
17+
from pathlib import Path
18+
19+
import pytest
20+
from mock import ANY, patch
2021

2122
from sagemaker import git_utils
2223

@@ -494,3 +495,212 @@ def test_git_clone_repo_codecommit_https_creds_not_stored_locally(tempdir, mkdte
494495
with pytest.raises(subprocess.CalledProcessError) as error:
495496
git_utils.git_clone_repo(git_config, entry_point)
496497
assert "returned non-zero exit status" in str(error.value)
498+
499+
500+
# ============================================================================
501+
# URL Sanitization Tests - Security vulnerability prevention
502+
# ============================================================================
503+
504+
class TestGitUrlSanitization:
505+
"""Test cases for Git URL sanitization to prevent injection attacks."""
506+
507+
def test_sanitize_git_url_valid_https_urls(self):
508+
"""Test that valid HTTPS URLs pass sanitization."""
509+
valid_urls = [
510+
"https://github.com/user/repo.git",
511+
"https://gitlab.com/user/repo.git",
512+
"https://[email protected]/user/repo.git",
513+
"https://user:[email protected]/user/repo.git",
514+
"http://internal-git.company.com/repo.git",
515+
]
516+
517+
for url in valid_urls:
518+
# Should not raise any exception
519+
result = git_utils._sanitize_git_url(url)
520+
assert result == url
521+
522+
def test_sanitize_git_url_valid_ssh_urls(self):
523+
"""Test that valid SSH URLs pass sanitization."""
524+
valid_urls = [
525+
"[email protected]:user/repo.git",
526+
"[email protected]:user/repo.git",
527+
"ssh://[email protected]/user/repo.git",
528+
"ssh://git-codecommit.us-west-2.amazonaws.com/v1/repos/test-repo/", # 0 @ symbols - valid for ssh://
529+
"[email protected]:repo.git",
530+
]
531+
532+
for url in valid_urls:
533+
# Should not raise any exception
534+
result = git_utils._sanitize_git_url(url)
535+
assert result == url
536+
537+
def test_sanitize_git_url_blocks_multiple_at_https(self):
538+
"""Test that HTTPS URLs with multiple @ symbols are blocked."""
539+
malicious_urls = [
540+
"https://[email protected]@github.com/repo.git",
541+
"https://[email protected]@gitlab.com/user/repo.git",
542+
"https://a@b@[email protected]/repo.git",
543+
"https://user@[email protected]/legit/repo.git",
544+
]
545+
546+
for url in malicious_urls:
547+
with pytest.raises(ValueError) as error:
548+
git_utils._sanitize_git_url(url)
549+
assert "multiple @ symbols detected" in str(error.value)
550+
551+
def test_sanitize_git_url_blocks_multiple_at_ssh(self):
552+
"""Test that SSH URLs with multiple @ symbols are blocked."""
553+
malicious_urls = [
554+
"[email protected]@github.com:repo.git",
555+
"git@[email protected]:user/repo.git",
556+
"ssh://git@[email protected]/repo.git",
557+
"git@a@b@c:repo.git",
558+
]
559+
560+
for url in malicious_urls:
561+
with pytest.raises(ValueError) as error:
562+
git_utils._sanitize_git_url(url)
563+
# git@ URLs should give "exactly one @ symbol" error
564+
# ssh:// URLs should give "multiple @ symbols detected" error
565+
assert any(phrase in str(error.value) for phrase in [
566+
"multiple @ symbols detected",
567+
"exactly one @ symbol"
568+
])
569+
570+
def test_sanitize_git_url_blocks_invalid_schemes_and_git_at_format(self):
571+
"""Test that invalid schemes and git@ format violations are blocked."""
572+
# Test unsupported schemes
573+
unsupported_scheme_urls = [
574+
"git-github.com:user/repo.git", # Doesn't start with git@, ssh://, http://, https://
575+
]
576+
577+
for url in unsupported_scheme_urls:
578+
with pytest.raises(ValueError) as error:
579+
git_utils._sanitize_git_url(url)
580+
assert "Unsupported URL scheme" in str(error.value)
581+
582+
# Test git@ URLs with wrong @ count
583+
invalid_git_at_urls = [
584+
"[email protected]@evil.com:repo.git", # 2 @ symbols
585+
]
586+
587+
for url in invalid_git_at_urls:
588+
with pytest.raises(ValueError) as error:
589+
git_utils._sanitize_git_url(url)
590+
assert "exactly one @ symbol" in str(error.value)
591+
592+
def test_sanitize_git_url_blocks_url_encoding_obfuscation(self):
593+
"""Test that URL-encoded obfuscation attempts are blocked."""
594+
obfuscated_urls = [
595+
"https://github.com%25evil.com/repo.git",
596+
"https://[email protected]%40attacker.com/repo.git",
597+
"https://github.com%2Fevil.com/repo.git",
598+
"https://github.com%3Aevil.com/repo.git",
599+
]
600+
601+
for url in obfuscated_urls:
602+
with pytest.raises(ValueError) as error:
603+
git_utils._sanitize_git_url(url)
604+
# The error could be either suspicious encoding or invalid characters
605+
assert any(phrase in str(error.value) for phrase in [
606+
"Suspicious URL encoding detected",
607+
"Invalid characters in hostname"
608+
])
609+
610+
def test_sanitize_git_url_blocks_invalid_hostname_chars(self):
611+
"""Test that hostnames with invalid characters are blocked."""
612+
invalid_urls = [
613+
"https://github<script>.com/repo.git",
614+
"https://github>.com/repo.git",
615+
"https://github[].com/repo.git",
616+
"https://github{}.com/repo.git",
617+
]
618+
619+
for url in invalid_urls:
620+
with pytest.raises(ValueError) as error:
621+
git_utils._sanitize_git_url(url)
622+
# The error could be various types due to URL parsing edge cases
623+
assert any(phrase in str(error.value) for phrase in [
624+
"Invalid characters in hostname",
625+
"Failed to parse URL",
626+
"does not appear to be an IPv4 or IPv6 address"
627+
])
628+
629+
def test_sanitize_git_url_blocks_unsupported_schemes(self):
630+
"""Test that unsupported URL schemes are blocked."""
631+
unsupported_urls = [
632+
"ftp://github.com/repo.git",
633+
"file:///local/repo.git",
634+
"javascript:alert('xss')",
635+
"data:text/html,<script>alert('xss')</script>",
636+
]
637+
638+
for url in unsupported_urls:
639+
with pytest.raises(ValueError) as error:
640+
git_utils._sanitize_git_url(url)
641+
assert "Unsupported URL scheme" in str(error.value)
642+
643+
def test_git_clone_repo_blocks_malicious_https_url(self):
644+
"""Test that git_clone_repo blocks malicious HTTPS URLs."""
645+
malicious_git_config = {
646+
"repo": "https://[email protected]@github.com/legit/repo.git",
647+
"branch": "main"
648+
}
649+
entry_point = "train.py"
650+
651+
with pytest.raises(ValueError) as error:
652+
git_utils.git_clone_repo(malicious_git_config, entry_point)
653+
assert "multiple @ symbols detected" in str(error.value)
654+
655+
def test_git_clone_repo_blocks_malicious_ssh_url(self):
656+
"""Test that git_clone_repo blocks malicious SSH URLs."""
657+
malicious_git_config = {
658+
"repo": "git@[email protected]:sage-maker/temp-sev2.git",
659+
"branch": "main"
660+
}
661+
entry_point = "train.py"
662+
663+
with pytest.raises(ValueError) as error:
664+
git_utils.git_clone_repo(malicious_git_config, entry_point)
665+
assert "exactly one @ symbol" in str(error.value)
666+
667+
def test_git_clone_repo_blocks_url_encoded_attack(self):
668+
"""Test that git_clone_repo blocks URL-encoded attacks."""
669+
malicious_git_config = {
670+
"repo": "https://github.com%40attacker.com/repo.git",
671+
"branch": "main"
672+
}
673+
entry_point = "train.py"
674+
675+
with pytest.raises(ValueError) as error:
676+
git_utils.git_clone_repo(malicious_git_config, entry_point)
677+
assert "Suspicious URL encoding detected" in str(error.value)
678+
679+
def test_sanitize_git_url_comprehensive_attack_scenarios(self):
680+
"""Test comprehensive attack scenarios from the vulnerability report."""
681+
# These are the exact attack patterns from the security report
682+
attack_scenarios = [
683+
# Original PoC attack
684+
"https://USER@YOUR_NGROK_OR_LOCALHOST/[email protected]%25legit%25repo.git",
685+
# Variations of the attack
686+
"https://user@[email protected]/legit/repo.git",
687+
"[email protected]@github.com:user/repo.git",
688+
"ssh://[email protected]@github.com/repo.git",
689+
# URL encoding variations
690+
"https://github.com%40evil.com/repo.git",
691+
"https://[email protected]%2Fevil.com/repo.git",
692+
]
693+
694+
entry_point = "train.py"
695+
696+
for malicious_url in attack_scenarios:
697+
git_config = {"repo": malicious_url}
698+
with pytest.raises(ValueError) as error:
699+
git_utils.git_clone_repo(git_config, entry_point)
700+
# Should be blocked by sanitization
701+
assert any(phrase in str(error.value) for phrase in [
702+
"multiple @ symbols detected",
703+
"exactly one @ symbol",
704+
"Suspicious URL encoding detected",
705+
"Invalid characters in hostname"
706+
])

0 commit comments

Comments
 (0)