Skip to content

Commit 0c4c55c

Browse files
authored
Escape single quote marks for search filters (#1599)
* Escape filters * Fix for mypy
1 parent e1ff6a4 commit 0c4c55c

File tree

5 files changed

+24
-16
lines changed

5 files changed

+24
-16
lines changed

app/backend/core/authentication.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,10 @@ async def check_path_auth(self, path: str, auth_claims: dict[str, Any], search_c
253253

254254
# Filter down to only chunks that are from the specific source file
255255
# Sourcepage is used for GPT-4V
256-
filter = f"{security_filter} and ((sourcefile eq '{path}') or (sourcepage eq '{path}'))"
256+
# Replace ' with '' to escape the single quote for the filter
257+
# https://learn.microsoft.com/azure/search/query-odata-filter-orderby-syntax#escaping-special-characters-in-string-constants
258+
path_for_filter = path.replace("'", "''")
259+
filter = f"{security_filter} and ((sourcefile eq '{path_for_filter}') or (sourcepage eq '{path_for_filter}'))"
257260

258261
# If the filter returns any results, the user is allowed to access the document
259262
# Otherwise, access is denied

app/backend/prepdocslib/searchmanager.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,12 @@ async def remove_content(self, path: Optional[str] = None, only_oid: Optional[st
250250
)
251251
async with self.search_info.create_search_client() as search_client:
252252
while True:
253-
filter = None if path is None else f"sourcefile eq '{os.path.basename(path)}'"
253+
filter = None
254+
if path is not None:
255+
# Replace ' with '' to escape the single quote for the filter
256+
# https://learn.microsoft.com/azure/search/query-odata-filter-orderby-syntax#escaping-special-characters-in-string-constants
257+
path_for_filter = os.path.basename(path).replace("'", "''")
258+
filter = f"sourcefile eq '{path_for_filter}'"
254259
max_results = 1000
255260
result = await search_client.search(
256261
search_text="", filter=filter, top=max_results, include_total_count=True

tests/test_authenticationhelper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -206,15 +206,15 @@ async def mock_search(self, *args, **kwargs):
206206

207207
assert (
208208
await auth_helper_require_access_control.check_path_auth(
209-
path="Benefit_Options-2.pdf",
209+
path="Benefit_Options-2's complement.pdf",
210210
auth_claims={"oid": "OID_X", "groups": ["GROUP_Y", "GROUP_Z"]},
211211
search_client=create_search_client(),
212212
)
213213
is True
214214
)
215215
assert (
216216
filter
217-
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options-2.pdf') or (sourcepage eq 'Benefit_Options-2.pdf'))"
217+
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options-2''s complement.pdf') or (sourcepage eq 'Benefit_Options-2''s complement.pdf'))"
218218
)
219219

220220

tests/test_searchmanager.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -334,8 +334,8 @@ async def test_remove_content(monkeypatch, search_info):
334334
"id": "file-foo_pdf-666F6F2E706466-page-0",
335335
"content": "test content",
336336
"category": "test",
337-
"sourcepage": "foo.pdf#page=1",
338-
"sourcefile": "foo.pdf",
337+
"sourcepage": "foo's bar.pdf#page=1",
338+
"sourcefile": "foo's bar.pdf",
339339
}
340340
]
341341
)
@@ -359,10 +359,10 @@ async def mock_delete_documents(self, documents):
359359

360360
manager = SearchManager(search_info)
361361

362-
await manager.remove_content("foo.pdf")
362+
await manager.remove_content("foo's bar.pdf")
363363

364364
assert len(searched_filters) == 2, "It should have searched twice (with no results on second try)"
365-
assert searched_filters[0] == "sourcefile eq 'foo.pdf'"
365+
assert searched_filters[0] == "sourcefile eq 'foo''s bar.pdf'"
366366
assert len(deleted_documents) == 1, "It should have deleted one document"
367367
assert deleted_documents[0]["id"] == "file-foo_pdf-666F6F2E706466-page-0"
368368

tests/test_upload.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -159,8 +159,8 @@ class AsyncSearchResultsIterator:
159159
def __init__(self):
160160
self.results = [
161161
{
162-
"sourcepage": "a.txt",
163-
"sourcefile": "a.txt",
162+
"sourcepage": "a's doc.txt",
163+
"sourcefile": "a's doc.txt",
164164
"content": "This is a test document.",
165165
"embedding": [],
166166
"category": None,
@@ -170,8 +170,8 @@ def __init__(self):
170170
"@search.reranker_score": 3.4577205181121826,
171171
},
172172
{
173-
"sourcepage": "a.txt",
174-
"sourcefile": "a.txt",
173+
"sourcepage": "a's doc.txt",
174+
"sourcefile": "a's doc.txt",
175175
"content": "This is a test document.",
176176
"embedding": [],
177177
"category": None,
@@ -181,8 +181,8 @@ def __init__(self):
181181
"@search.reranker_score": 3.4577205181121826,
182182
},
183183
{
184-
"sourcepage": "a.txt",
185-
"sourcefile": "a.txt",
184+
"sourcepage": "a's doc.txt",
185+
"sourcefile": "a's doc.txt",
186186
"content": "This is a test document.",
187187
"embedding": [],
188188
"category": None,
@@ -224,10 +224,10 @@ async def mock_delete_documents(self, documents):
224224
monkeypatch.setattr(SearchClient, "delete_documents", mock_delete_documents)
225225

226226
response = await auth_client.post(
227-
"/delete_uploaded", headers={"Authorization": "Bearer test"}, json={"filename": "a.txt"}
227+
"/delete_uploaded", headers={"Authorization": "Bearer test"}, json={"filename": "a's doc.txt"}
228228
)
229229
assert response.status_code == 200
230230
assert len(searched_filters) == 2, "It should have searched twice (with no results on second try)"
231-
assert searched_filters[0] == "sourcefile eq 'a.txt'"
231+
assert searched_filters[0] == "sourcefile eq 'a''s doc.txt'"
232232
assert len(deleted_documents) == 1, "It should have only deleted the document solely owned by OID_X"
233233
assert deleted_documents[0]["id"] == "file-a_txt-7465737420646F63756D656E742E706466"

0 commit comments

Comments
 (0)