@@ -75,18 +75,72 @@ def test_get_markdown_url(azure_blob_service_mock):
75
75
assert markdown_url == "[A title](http://example.com/path/to/file.txt_12345)"
76
76
77
77
78
def test_from_metadata_returns_empty_sas_placeholder():
    """Check that a look-alike blob host gets no SAS placeholder appended.

    The host name only embeds ``blob.core.windows.net`` as leading labels of
    ``example.com``; the test expects ``source`` to come back equal to the
    unmodified ``document_url``.
    """
    # Given
    content = "Some content"
    metadata = {}
    # "blob.core.windows.net" has to be the actual domain of the URL, not a
    # spoofed label inside another host name (CWE-20, improper input validation).
    document_url = "http://blob.core.windows.net.example.com/path/to/file.txt"
    expected_file_name = "/path/to/file.txt"
    idx = 0

    # When
    source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)

    # Then
    parsed_url = urlparse(document_url)
    file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
    # With empty metadata the id is expected to be "doc_" + sha1("<file_url>_<idx>").
    url_digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
    expected_id = f"doc_{url_digest}"

    assert source_document.id == expected_id
    assert source_document.content == content
    assert source_document.source == document_url
    assert source_document.title == expected_file_name
    assert source_document.chunk == idx
    assert source_document.offset is None
    assert source_document.page_number is None
103
+
104
+
105
def test_from_metadata_returns_sas_placeholder():
    """Check that a genuine blob-storage URL gets the SAS placeholder appended.

    The host ends in ``blob.core.windows.net``; the test expects ``source`` to
    be the scheme/host/path URL followed by ``_SAS_TOKEN_PLACEHOLDER_``.
    """
    # Given
    content = "Some content"
    metadata = {}
    document_url = "http://example.blob.core.windows.net/path/to/file.txt"
    expected_file_name = "/path/to/file.txt"
    expected_sas_placeholder = "_SAS_TOKEN_PLACEHOLDER_"
    idx = 0

    # When
    source_document = SourceDocument.from_metadata(content, metadata, document_url, idx)

    # Then
    parsed_url = urlparse(document_url)
    file_url = parsed_url.scheme + "://" + parsed_url.netloc + parsed_url.path
    # With empty metadata the id is expected to be "doc_" + sha1("<file_url>_<idx>").
    url_digest = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
    expected_id = f"doc_{url_digest}"

    assert source_document.id == expected_id
    assert source_document.content == content
    assert source_document.source == f"{file_url}{expected_sas_placeholder}"
    assert source_document.title == expected_file_name
    assert source_document.chunk == idx
    assert source_document.offset is None
    assert source_document.page_number is None
130
+
131
+
78
132
def test_from_metadata ():
79
133
# Given
80
134
content = "Some content"
81
135
metadata = {
82
136
"id" : "1" ,
83
- "source" : "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_ " ,
137
+ "source" : "http://example.com/path/to/file.txt " ,
84
138
"title" : "A title" ,
85
139
"chunk" : "A chunk" ,
86
140
"offset" : "An offset" ,
87
141
"page_number" : "1" ,
88
142
}
89
- document_url = "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_ "
143
+ document_url = "http://example.com/path/to/file.txt "
90
144
idx = 0
91
145
92
146
# When
@@ -98,15 +152,11 @@ def test_from_metadata():
98
152
filename = parsed_url .path
99
153
hash_key = hashlib .sha1 (f"{ file_url } _{ idx } " .encode ("utf-8" )).hexdigest ()
100
154
hash_key = f"doc_{ hash_key } "
101
- sas_placeholder = (
102
- "_SAS_TOKEN_PLACEHOLDER_"
103
- if "blob.core.windows.net" in parsed_url .netloc
104
- else ""
105
- )
155
+
106
156
expected_source_document = SourceDocument (
107
157
id = metadata .get ("id" , hash_key ),
108
158
content = content ,
109
- source = metadata .get ("source" , f" { file_url } { sas_placeholder } " ),
159
+ source = metadata .get ("source" , document_url ),
110
160
title = metadata .get ("title" , filename ),
111
161
chunk = metadata .get ("chunk" , idx ),
112
162
offset = metadata .get ("offset" ),
0 commit comments