Skip to content

Commit 428ffa8

Browse files
authored
CWE-20 - Code Security (#632)
* Refactor SourceDocument class to handle SAS tokens for blob storage URLs
* Added tests to exercise the sas_placeholder
1 parent d548e41 commit 428ffa8

File tree

2 files changed

+60
-9
lines changed

2 files changed

+60
-9
lines changed

code/backend/batch/utilities/common/SourceDocument.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,8 @@ def from_metadata(
6161
hash_key = f"doc_{hash_key}"
6262
sas_placeholder = (
6363
"_SAS_TOKEN_PLACEHOLDER_"
64-
if "blob.core.windows.net" in parsed_url.netloc
64+
if parsed_url.netloc
65+
and parsed_url.netloc.endswith(".blob.core.windows.net")
6566
else ""
6667
)
6768
return cls(

code/tests/common/test_SourceDocument.py

Lines changed: 58 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -75,18 +75,72 @@ def test_get_markdown_url(azure_blob_service_mock):
7575
assert markdown_url == "[A title](http://example.com/path/to/file.txt_12345)"
7676

7777

78+
def test_from_metadata_returns_empty_sas_placeholder():
79+
# Given
80+
content = "Some content"
81+
metadata = {}
82+
# blob.core.windows.net needs to be the domain name - not a faked one as per CWE-20
83+
document_url = "http://blob.core.windows.net.example.com/path/to/file.txt"
84+
expectedFileName = "/path/to/file.txt"
85+
idx = 0
86+
87+
# When
88+
source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)
89+
90+
# Then
91+
parsed_url = urlparse(document_url)
92+
file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
93+
hash_key = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
94+
hash_key = f"doc_{hash_key}"
95+
96+
assert source_document.id == hash_key
97+
assert source_document.content == content
98+
assert source_document.source == document_url
99+
assert source_document.title == expectedFileName
100+
assert source_document.chunk == idx
101+
assert source_document.offset is None
102+
assert source_document.page_number is None
103+
104+
105+
def test_from_metadata_returns_sas_placeholder():
106+
# Given
107+
content = "Some content"
108+
metadata = {}
109+
document_url = "http://example.blob.core.windows.net/path/to/file.txt"
110+
expectedFileName = "/path/to/file.txt"
111+
expected_sas_placeholder = "_SAS_TOKEN_PLACEHOLDER_"
112+
idx = 0
113+
114+
# When
115+
source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)
116+
117+
# Then
118+
parsed_url = urlparse(document_url)
119+
file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
120+
hash_key = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
121+
hash_key = f"doc_{hash_key}"
122+
123+
assert source_document.id == hash_key
124+
assert source_document.content == content
125+
assert source_document.source == f"{file_url}{expected_sas_placeholder}"
126+
assert source_document.title == expectedFileName
127+
assert source_document.chunk == idx
128+
assert source_document.offset is None
129+
assert source_document.page_number is None
130+
131+
78132
def test_from_metadata():
79133
# Given
80134
content = "Some content"
81135
metadata = {
82136
"id": "1",
83-
"source": "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_",
137+
"source": "http://example.com/path/to/file.txt",
84138
"title": "A title",
85139
"chunk": "A chunk",
86140
"offset": "An offset",
87141
"page_number": "1",
88142
}
89-
document_url = "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_"
143+
document_url = "http://example.com/path/to/file.txt"
90144
idx = 0
91145

92146
# When
@@ -98,15 +152,11 @@ def test_from_metadata():
98152
filename = parsed_url.path
99153
hash_key = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
100154
hash_key = f"doc_{hash_key}"
101-
sas_placeholder = (
102-
"_SAS_TOKEN_PLACEHOLDER_"
103-
if "blob.core.windows.net" in parsed_url.netloc
104-
else ""
105-
)
155+
106156
expected_source_document = SourceDocument(
107157
id=metadata.get("id", hash_key),
108158
content=content,
109-
source=metadata.get("source", f"{file_url}{sas_placeholder}"),
159+
source=metadata.get("source", document_url),
110160
title=metadata.get("title", filename),
111161
chunk=metadata.get("chunk", idx),
112162
offset=metadata.get("offset"),

0 commit comments

Comments
 (0)