Skip to content

Commit 4511358

Browse files
Replace bad UTF-8 bytes when parsing git blame (#98)
* Replace bad UTF-8 bytes when parsing git blame
  This handles bad UTF-8 bytes by replacing them with the � (U+FFFD replacement) character, fixing output parsing for files with invalid byte sequences. This fixes #97.
* Unit test
* Ruff hygiene
* Fix dict usage on 3.8
---------
Co-authored-by: Erik De Bonte <erikd@microsoft.com>
1 parent d06defe commit 4511358

File tree

2 files changed

+79
-21
lines changed

2 files changed

+79
-21
lines changed

sarif/operations/blame_op.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ def _run_git_blame_on_files(
148148

149149
for line_bytes in git_blame_output:
150150
# Convert byte sequence to string and remove trailing LF
151-
line_string = line_bytes.decode("utf-8")[:-1]
151+
line_string = line_bytes.decode("utf-8", errors="replace")[:-1]
152152
# Now parse output from git blame --porcelain
153153
if commit_hash:
154154
if line_string.startswith("\t"):

tests/ops/blame/test_blame.py

Lines changed: 78 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import jsonschema
55
import os
66
import tempfile
7-
from typing import List
7+
from typing import Callable, Dict, List
88

99
from sarif.operations import blame_op
1010
from sarif import sarif_file
@@ -110,7 +110,10 @@ def test_blame_no_blame_info():
110110
assert not os.path.isfile(output_file_path)
111111

112112

113-
def test_blame_success():
113+
def blame_test(
114+
run_git_blame: Callable[[str, str], List[bytes]],
115+
expected_blame_properties: Dict[str, Dict[str, str]],
116+
):
114117
input_sarif_file = sarif_file.SarifFile(
115118
"SARIF_FILE", SARIF_FILE, mtime=datetime.datetime.now()
116119
)
@@ -122,37 +125,92 @@ def test_blame_success():
122125
os.makedirs(repo_path)
123126
output_file_path = os.path.join(tmp, "blamed.json")
124127

125-
def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]:
128+
def run_git_blame_wrapper(
129+
blame_repo_path: str, blame_file_path: str
130+
) -> List[bytes]:
126131
assert blame_repo_path == repo_path
127132
assert blame_file_path == ERROR_FILE_ABSOLUTE_PATH
128-
return [x.encode() for x in GIT_BLAME_OUTPUT]
133+
return run_git_blame(blame_repo_path, blame_file_path)
129134

130135
blame_op.enhance_with_blame(
131136
input_sarif_file_set,
132137
repo_path,
133138
output_file_path,
134139
output_multiple_files=False,
135-
run_git_blame=run_git_blame,
140+
run_git_blame=run_git_blame_wrapper,
136141
)
137142

138143
with open(output_file_path, "rb") as f_out:
139144
output_sarif = json.load(f_out)
140145
jsonschema.validate(output_sarif, schema=get_sarif_schema())
141146

142147
expected_sarif = deepcopy(input_sarif_file.data)
143-
blame_properties = {
144-
"blame": {
145-
"author": "Taylor Developer",
146-
"author-mail": "<taylor@developer.com>",
147-
"author-time": "1699272533",
148-
"author-tz": "+0000",
149-
"committer": "GitHub",
150-
"committer-mail": "<noreply@github.com>",
151-
"committer-time": "1699272533",
152-
"committer-tz": "+0000",
153-
"summary": "Commit message 1",
154-
"filename": ERROR_FILE_RELATIVE_PATH,
155-
}
156-
}
157-
expected_sarif["runs"][0]["results"][0]["properties"] = blame_properties
148+
expected_sarif["runs"][0]["results"][0]["properties"] = (
149+
expected_blame_properties
150+
)
158151
assert output_sarif == expected_sarif
152+
153+
154+
def test_blame_success():
155+
def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]:
156+
return [x.encode() for x in GIT_BLAME_OUTPUT]
157+
158+
expected_blame_properties = {
159+
"blame": {
160+
"author": "Taylor Developer",
161+
"author-mail": "<taylor@developer.com>",
162+
"author-time": "1699272533",
163+
"author-tz": "+0000",
164+
"committer": "GitHub",
165+
"committer-mail": "<noreply@github.com>",
166+
"committer-time": "1699272533",
167+
"committer-tz": "+0000",
168+
"summary": "Commit message 1",
169+
"filename": ERROR_FILE_RELATIVE_PATH,
170+
}
171+
}
172+
173+
blame_test(run_git_blame, expected_blame_properties)
174+
175+
176+
GIT_BLAME_OUTPUT_WITH_INVALID_UTF8 = [
177+
b"f9db03438aba52affc5c3fcdb619afa620ad603a 1 1 7\n",
178+
b"author Taylor Developer\n",
179+
b"author-mail <taylor@developer.com>\n",
180+
b"author-time 1699272533\n",
181+
b"author-tz +0000\n",
182+
b"committer GitHub\n",
183+
b"committer-mail <noreply@github.com>\n",
184+
b"committer-time 1699272533\n",
185+
b"committer-tz +0000\n",
186+
b"summary Commit message \x80\n",
187+
b"filename " + ERROR_FILE_RELATIVE_PATH.encode() + b"\n",
188+
b"\tFile text line 1\n",
189+
b"f9db03438aba52affc5c3fcdb619afa620ad603a 2 2\n",
190+
b"\tFile text line 2\n",
191+
b"f9db03438aba52affc5c3fcdb619afa620ad603a 3 3\n",
192+
b"\tFile text line 3\n",
193+
b"eec0471db074a037d820abdda1f210f8a8c987ca 4 4 1\n",
194+
]
195+
196+
197+
def test_blame_invalid_utf8():
198+
def run_git_blame(blame_repo_path: str, blame_file_path: str) -> List[bytes]:
199+
return GIT_BLAME_OUTPUT_WITH_INVALID_UTF8
200+
201+
expected_blame_properties = {
202+
"blame": {
203+
"author": "Taylor Developer",
204+
"author-mail": "<taylor@developer.com>",
205+
"author-time": "1699272533",
206+
"author-tz": "+0000",
207+
"committer": "GitHub",
208+
"committer-mail": "<noreply@github.com>",
209+
"committer-time": "1699272533",
210+
"committer-tz": "+0000",
211+
"summary": "Commit message �",
212+
"filename": ERROR_FILE_RELATIVE_PATH,
213+
}
214+
}
215+
216+
blame_test(run_git_blame, expected_blame_properties)

0 commit comments

Comments
 (0)