Skip to content

Commit 8c4b8a0

Browse files
committed
Huggingface revision pinning
In much the same way as unpinned container images benefit from digest pinning, fixing a model, dataset or file to a revision digest uniquely and immutably fixes use to a particular model snapshot (commit).
1 parent 090ba0f commit 8c4b8a0

File tree

5 files changed

+302
-0
lines changed

5 files changed

+302
-0
lines changed
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Copyright (c) 2024 PyCQA
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
r"""
5+
================================================
6+
B615: Test for unsafe Hugging Face Hub downloads
7+
================================================
8+
9+
This plugin checks for unsafe downloads from Hugging Face Hub without proper
10+
integrity verification. Downloading models, datasets, or files without
11+
specifying a revision based on an immutable revision (commit) can
12+
lead to supply chain attacks where malicious actors could
13+
replace model files and use an existing tag or branch name
14+
to serve malicious content.
15+
16+
The secure approach is to:
17+
18+
1. Pin to specific revisions/commits when downloading models, files or datasets
19+
20+
Common unsafe patterns:
21+
- ``AutoModel.from_pretrained("model-name")`` without revision
22+
- ``load_dataset("dataset-name")`` without revision
23+
- ``hf_hub_download()`` without revision parameter
24+
- ``snapshot_download()`` without revision parameter
25+
26+
:Example:
27+
28+
.. code-block:: none
29+
30+
>> Issue: Unsafe Hugging Face Hub download without revision pinning
31+
Severity: Medium Confidence: High
32+
CWE: CWE-494 (https://cwe.mitre.org/data/definitions/494.html)
33+
Location: examples/huggingface_unsafe_download.py:8
34+
7 # Unsafe: no revision specified
35+
8 model = AutoModel.from_pretrained("org/model_name")
36+
9
37+
38+
.. seealso::
39+
40+
- https://cwe.mitre.org/data/definitions/494.html
41+
- https://huggingface.co/docs/huggingface_hub/en/guides/download#from-specific-version
42+
- https://huggingface.co/docs/huggingface_hub/guides/download
43+
44+
.. versionadded:: 1.9.0
45+
46+
"""
47+
import bandit
48+
from bandit.core import issue
49+
from bandit.core import test_properties as test
50+
51+
52+
@test.checks("Call")
@test.test_id("B615")
def huggingface_unsafe_download(context):
    """Flag Hugging Face Hub downloads that are not pinned to a commit hash.

    A call is reported when all of the following hold:

    - the file imports ``transformers``, ``datasets`` or
      ``huggingface_hub``;
    - the last component of the call's qualified name is a known download
      entry point and its owning module appears in the qualified name;
    - neither ``revision`` nor ``commit_id`` is a string that looks like a
      commit hash (at least 7 characters, all hexadecimal);
    - the first positional argument is not a string starting with ``./``,
      ``../`` or ``/`` (such values are treated as local paths).

    :param context: bandit context for the ``Call`` node being checked
    :return: a ``bandit.Issue`` (medium severity, high confidence) for an
        unpinned download, otherwise ``None``
    """
    # Only files importing a Hugging Face library are of interest.
    hf_modules = ("transformers", "datasets", "huggingface_hub")
    if not any(context.is_module_imported_like(m) for m in hf_modules):
        return None

    qualname = context.call_function_name_qual
    if not isinstance(qualname, str):
        return None

    # Entry point name -> modules, one of which must be part of the
    # qualified call name for the call to be considered.
    unsafe_patterns = {
        # transformers library patterns
        "from_pretrained": ["transformers"],
        # datasets library patterns
        "load_dataset": ["datasets"],
        # huggingface_hub patterns
        "hf_hub_download": ["huggingface_hub"],
        "snapshot_download": ["huggingface_hub"],
        # NOTE(review): unlike the other keys this does not look like a
        # download function name - confirm it is intended.
        "repository_id": ["huggingface_hub"],
    }

    qualname_parts = qualname.split(".")
    func_name = qualname_parts[-1]

    required_modules = unsafe_patterns.get(func_name)
    if required_modules is None:
        return None
    if not any(module in qualname_parts for module in required_modules):
        return None

    # ``revision`` takes precedence; ``commit_id`` is consulted only when
    # ``revision`` is absent or falsy.
    revision_value = context.get_call_arg_value("revision")
    commit_id_value = context.get_call_arg_value("commit_id")
    revision_to_check = revision_value or commit_id_value

    if isinstance(revision_to_check, str):
        # Drop any quote characters still wrapped around the value.
        revision_str = revision_to_check.strip("\"'")

        # A pinned revision is a commit hash: 40 hex characters for a full
        # SHA, 7 or more for an abbreviated one. Branch and tag names such
        # as "main" or "v1.0.0" fail this test and are still reported.
        hex_chars = "0123456789abcdefABCDEF"
        if len(revision_str) >= 7 and all(
            char in hex_chars for char in revision_str
        ):
            return None

    # Edge case: a first argument that is a filesystem path loads local
    # files rather than downloading from the Hub.
    first_arg = context.get_call_arg_at_position(0)
    if isinstance(first_arg, str) and first_arg.startswith(
        ("./", "/", "../")
    ):
        return None

    # The module documentation classifies this finding as CWE-494 (download
    # of code without integrity check). Use that member when this bandit
    # version defines it; otherwise keep the previously reported CWE.
    cwe = getattr(
        issue.Cwe,
        "DOWNLOAD_OF_CODE_WITHOUT_INTEGRITY_CHECK",
        issue.Cwe.IMPROPER_INPUT_VALIDATION,
    )

    return bandit.Issue(
        severity=bandit.MEDIUM,
        confidence=bandit.HIGH,
        text=(
            f"Unsafe Hugging Face Hub download without revision pinning "
            f"in {func_name}()"
        ),
        cwe=cwe,
        # NOTE(review): the function name is passed where an argument name
        # appears to be expected - confirm this reports the intended line.
        lineno=context.get_lineno_for_call_arg(func_name),
    )
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---------------------------------
2+
B615: huggingface_unsafe_download
3+
---------------------------------
4+
5+
.. automodule:: bandit.plugins.huggingface_unsafe_download
6+
:no-index:
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
from datasets import load_dataset
2+
from huggingface_hub import hf_hub_download, snapshot_download
3+
from transformers import AutoModel, AutoTokenizer
4+
5+
# UNSAFE USAGE
6+
7+
# AutoModel (Model Loading)
8+
9+
# Example #1: No revision (defaults to floating 'main')
10+
unsafe_model_no_revision = AutoModel.from_pretrained("org/model_name")
11+
12+
# Example #2: Floating revision: 'main'
13+
unsafe_model_main = AutoModel.from_pretrained(
14+
"org/model_name",
15+
revision="main"
16+
)
17+
18+
# Example #3: Floating tag revision: 'v1.0.0'
19+
unsafe_model_tag = AutoModel.from_pretrained(
20+
"org/model_name",
21+
revision="v1.0.0"
22+
)
23+
24+
25+
# AutoTokenizer (Tokenizer Loading)
26+
27+
# Example #4: No revision
28+
unsafe_tokenizer_no_revision = AutoTokenizer.from_pretrained("org/model_name")
29+
30+
# Example #5: Floating revision: 'main'
31+
unsafe_tokenizer_main = AutoTokenizer.from_pretrained(
32+
"org/model_name",
33+
revision="main"
34+
)
35+
36+
# Example #6: Floating tag revision: 'v1.0.0'
37+
unsafe_tokenizer_tag = AutoTokenizer.from_pretrained(
38+
"org/model_name",
39+
revision="v1.0.0"
40+
)
41+
42+
43+
# load_dataset (Dataset Loading)
44+
45+
# Example #8: No revision
46+
unsafe_dataset_no_revision = load_dataset("org_dataset")
47+
48+
# Example #9: Floating revision: 'main'
49+
unsafe_dataset_main = load_dataset("org_dataset", revision="main")
50+
51+
# Example #10: Floating tag revision: 'v1.0.0'
52+
unsafe_dataset_tag = load_dataset("org_dataset", revision="v1.0.0")
53+
54+
55+
# hf_hub_download (File Download)
56+
57+
# Example #11: No revision
58+
unsafe_file_no_revision = hf_hub_download(
59+
repo_id="org/model_name",
60+
filename="config.json"
61+
)
62+
63+
# Example #12: Floating revision: 'main'
64+
unsafe_file_main = hf_hub_download(
65+
repo_id="org/model_name",
66+
filename="config.json",
67+
revision="main"
68+
)
69+
70+
# Example #13: Floating tag revision: 'v1.0.0'
71+
unsafe_file_tag = hf_hub_download(
72+
repo_id="org/model_name",
73+
filename="config.json",
74+
revision="v1.0.0"
75+
)
76+
77+
78+
# snapshot_download (Repo Snapshot)
79+
80+
# Example #14: No revision
81+
unsafe_snapshot_no_revision = snapshot_download(repo_id="org/model_name")
82+
83+
# Example #15: Floating revision: 'main'
84+
unsafe_snapshot_main = snapshot_download(
85+
repo_id="org/model_name",
86+
revision="main"
87+
)
88+
89+
# Example #16: Floating tag revision: 'v1.0.0'
90+
unsafe_snapshot_tag = snapshot_download(
91+
repo_id="org/model_name",
92+
revision="v1.0.0"
93+
)
94+
95+
96+
# -------------------------------
97+
# SAFE USAGE
98+
# -------------------------------
99+
100+
# AutoModel
101+
102+
# Example #17: Pinned commit hash
103+
safe_model_commit = AutoModel.from_pretrained(
104+
"org/model_name",
105+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
106+
)
107+
108+
# Example #18: Local path
109+
safe_model_local = AutoModel.from_pretrained("./local_model")
110+
safe_model_local_abs = AutoModel.from_pretrained("/path/to/model")
111+
112+
# AutoTokenizer
113+
114+
# Example #19: Pinned commit hash
115+
safe_tokenizer_commit = AutoTokenizer.from_pretrained(
116+
"org/model_name",
117+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
118+
)
119+
120+
# Example #20: Local path
121+
safe_tokenizer_local = AutoTokenizer.from_pretrained("./local_tokenizer")
122+
123+
124+
# load_dataset
125+
126+
# Example #21: Pinned commit hash
127+
safe_dataset_commit = load_dataset(
128+
"org_dataset",
129+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
130+
)
131+
132+
133+
# hf_hub_download
134+
135+
# Example #22: Pinned commit hash
136+
safe_file_commit = hf_hub_download(
137+
repo_id="org/model_name",
138+
filename="config.json",
139+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
140+
)
141+
142+
143+
# snapshot_download
144+
145+
# Example #23: Pinned commit hash
146+
safe_snapshot_commit = snapshot_download(
147+
repo_id="org/model_name",
148+
revision="5d0f2e8a7f1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d"
149+
)

setup.cfg

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ bandit.plugins =
157157
#bandit/plugins/pytorch_load.py
158158
pytorch_load = bandit.plugins.pytorch_load:pytorch_load
159159

160+
# bandit/plugins/huggingface_unsafe_download.py
161+
huggingface_unsafe_download = bandit.plugins.huggingface_unsafe_download:huggingface_unsafe_download
162+
160163
# bandit/plugins/trojansource.py
161164
trojansource = bandit.plugins.trojansource:trojansource
162165

tests/functional/test_functional.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -926,3 +926,10 @@ def test_markupsafe_markup_xss_allowed_calls(self):
926926
self.check_example(
927927
"markupsafe_markup_xss_allowed_calls.py", expect
928928
)
929+
930+
def test_huggingface_unsafe_download(self):
    """Check the issue counts for the Hugging Face download example.

    Every finding in ``huggingface_unsafe_download.py`` is expected to be
    reported with medium severity and high confidence.
    """
    finding_count = 15
    severity = {
        "UNDEFINED": 0,
        "LOW": 0,
        "MEDIUM": finding_count,
        "HIGH": 0,
    }
    confidence = {
        "UNDEFINED": 0,
        "LOW": 0,
        "MEDIUM": 0,
        "HIGH": finding_count,
    }
    self.check_example(
        "huggingface_unsafe_download.py",
        {"SEVERITY": severity, "CONFIDENCE": confidence},
    )

0 commit comments

Comments
 (0)