Skip to content

Commit 88056e1

Browse files
Fix ADLS Gen2 scripts and authentication checks (#1272)
* feedback from github issue * fix authentication * remove print, add test --------- Co-authored-by: Matt Gotteiner <[email protected]>
1 parent 29355ec commit 88056e1

File tree

5 files changed

+102
-10
lines changed

5 files changed

+102
-10
lines changed

LoginAndAclSetup.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,8 @@ The following environment variables are used to setup the optional login and doc
200200
* `AZURE_CLIENT_APP_ID`: Application ID of the Azure AD app for the client UI.
201201
* `AZURE_AUTH_TENANT_ID`: [Tenant ID](https://learn.microsoft.com/azure/active-directory/fundamentals/how-to-find-tenant) associated with the Azure AD used for login and document level access control. Defaults to `AZURE_TENANT_ID` if not defined.
202202
* `AZURE_ADLS_GEN2_STORAGE_ACCOUNT`: (Optional) Name of existing [Data Lake Storage Gen2 storage account](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [setup](#azure-data-lake-storage-gen2-setup) and [prep docs](#azure-data-lake-storage-gen2-prep-docs) scripts.
203-
* `AZURE_ADLS_GEN2_STORAGE_FILESYSTEM`: (Optional) Name of existing [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [setup](#azure-data-lake-storage-gen2-setup) and [prep docs](#azure-data-lake-storage-gen2-prep-docs) scripts.
204-
* `AZURE_ADLS_GEN2_STORAGE_FILESYSTEM_PATH`: (Optional) Name of existing path in a [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [prep docs](#azure-data-lake-storage-gen2-prep-docs) script.
203+
* `AZURE_ADLS_GEN2_FILESYSTEM`: (Optional) Name of existing [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [setup](#azure-data-lake-storage-gen2-setup) and [prep docs](#azure-data-lake-storage-gen2-prep-docs) scripts.
204+
* `AZURE_ADLS_GEN2_FILESYSTEM_PATH`: (Optional) Name of existing path in a [Data Lake Storage Gen2 filesystem](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-introduction) for storing sample data with [access control lists](https://learn.microsoft.com/azure/storage/blobs/data-lake-storage-access-control). Only used with the optional Data Lake Storage Gen2 [prep docs](#azure-data-lake-storage-gen2-prep-docs) script.
205205

206206
### Authentication behavior by environment
207207

app/backend/core/authentication.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -241,12 +241,18 @@ async def get_auth_claims_if_enabled(self, headers: dict) -> dict[str, Any]:
241241
async def check_path_auth(self, path: str, auth_claims: dict[str, Any], search_client: SearchClient) -> bool:
242242
# Start with the standard security filter for all queries
243243
security_filter = self.build_security_filters(overrides={}, auth_claims=auth_claims)
244-
# If there was no security filter, then the path is allowed
245-
if not security_filter:
244+
# If there was no security filter or no path, then the path is allowed
245+
if not security_filter or len(path) == 0:
246246
return True
247247

248+
# Remove any fragment string from the path before checking
249+
fragment_index = path.find("#")
250+
if fragment_index != -1:
251+
path = path[:fragment_index]
252+
248253
# Filter down to only chunks that are from the specific source file
249-
filter = f"{security_filter} and (sourcepage eq '{path}')"
254+
# Sourcepage is used for GPT-4V
255+
filter = f"{security_filter} and ((sourcefile eq '{path}') or (sourcepage eq '{path}'))"
250256

251257
# If the filter returns any results, the user is allowed to access the document
252258
# Otherwise, access is denied

scripts/adlsgen2setup.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,9 @@ async def run(self):
9999
await directory_client.update_access_control_recursive(
100100
acl=f"group:{groups[group_name]}:r-x"
101101
)
102+
if "oids" in access_control:
103+
for oid in access_control["oids"]:
104+
await directory_client.update_access_control_recursive(acl=f"user:{oid}:r-x")
102105
finally:
103106
for directory_client in directories.values():
104107
await directory_client.close()

scripts/prepdocs.ps1

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ if ($env:AZURE_ADLS_GEN2_STORAGE_ACCOUNT) {
1313
$adlsGen2StorageAccountArg = "--datalakestorageaccount $env:AZURE_ADLS_GEN2_STORAGE_ACCOUNT"
1414
$adlsGen2FilesystemPathArg = ""
1515
if ($env:AZURE_ADLS_GEN2_FILESYSTEM_PATH) {
16-
$adlsGen2FilesystemPathArg = "--datalakefilesystempath $env:ADLS_GEN2_FILESYSTEM_PATH"
16+
$adlsGen2FilesystemPathArg = "--datalakefilesystempath $env:AZURE_ADLS_GEN2_FILESYSTEM_PATH"
1717
}
1818
$adlsGen2FilesystemArg = ""
1919
if ($env:AZURE_ADLS_GEN2_FILESYSTEM) {
20-
$adlsGen2FilesystemArg = "--datalakefilesystem $env:ADLS_GEN2_FILESYSTEM"
20+
$adlsGen2FilesystemArg = "--datalakefilesystem $env:AZURE_ADLS_GEN2_FILESYSTEM"
2121
}
2222
}
2323
if ($env:AZURE_USE_AUTHENTICATION) {

tests/test_authenticationhelper.py

Lines changed: 86 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -220,12 +220,14 @@ async def mock_search(self, *args, **kwargs):
220220
)
221221
assert (
222222
filter
223-
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and (sourcepage eq 'Benefit_Options-2.pdf')"
223+
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options-2.pdf') or (sourcepage eq 'Benefit_Options-2.pdf'))"
224224
)
225225

226226

227227
@pytest.mark.asyncio
228-
async def test_check_path_auth_allowed(monkeypatch, mock_confidential_client_success, mock_validate_token_success):
228+
async def test_check_path_auth_allowed_sourcepage(
229+
monkeypatch, mock_confidential_client_success, mock_validate_token_success
230+
):
229231
auth_helper_require_access_control = create_authentication_helper(require_access_control=True)
230232
filter = None
231233

@@ -246,7 +248,88 @@ async def mock_search(self, *args, **kwargs):
246248
)
247249
assert (
248250
filter
249-
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and (sourcepage eq 'Benefit_Options-2.pdf')"
251+
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options-2.pdf') or (sourcepage eq 'Benefit_Options-2.pdf'))"
252+
)
253+
254+
255+
@pytest.mark.asyncio
256+
async def test_check_path_auth_allowed_sourcefile(
257+
monkeypatch, mock_confidential_client_success, mock_validate_token_success
258+
):
259+
auth_helper_require_access_control = create_authentication_helper(require_access_control=True)
260+
filter = None
261+
262+
async def mock_search(self, *args, **kwargs):
263+
nonlocal filter
264+
filter = kwargs.get("filter")
265+
return MockAsyncPageIterator(data=[{"sourcefile": "Benefit_Options.pdf"}])
266+
267+
monkeypatch.setattr(SearchClient, "search", mock_search)
268+
269+
assert (
270+
await auth_helper_require_access_control.check_path_auth(
271+
path="Benefit_Options.pdf",
272+
auth_claims={"oid": "OID_X", "groups": ["GROUP_Y", "GROUP_Z"]},
273+
search_client=create_search_client(),
274+
)
275+
is True
276+
)
277+
assert (
278+
filter
279+
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options.pdf') or (sourcepage eq 'Benefit_Options.pdf'))"
280+
)
281+
282+
283+
@pytest.mark.asyncio
284+
async def test_check_path_auth_allowed_empty(
285+
monkeypatch, mock_confidential_client_success, mock_validate_token_success
286+
):
287+
auth_helper_require_access_control = create_authentication_helper(require_access_control=True)
288+
filter = None
289+
290+
async def mock_search(self, *args, **kwargs):
291+
nonlocal filter
292+
filter = kwargs.get("filter")
293+
return MockAsyncPageIterator(data=[{"sourcefile": "Benefit_Options.pdf"}])
294+
295+
monkeypatch.setattr(SearchClient, "search", mock_search)
296+
297+
assert (
298+
await auth_helper_require_access_control.check_path_auth(
299+
path="",
300+
auth_claims={"oid": "OID_X", "groups": ["GROUP_Y", "GROUP_Z"]},
301+
search_client=create_search_client(),
302+
)
303+
is True
304+
)
305+
assert filter is None
306+
307+
308+
@pytest.mark.asyncio
309+
async def test_check_path_auth_allowed_fragment(
310+
monkeypatch, mock_confidential_client_success, mock_validate_token_success
311+
):
312+
auth_helper_require_access_control = create_authentication_helper(require_access_control=True)
313+
filter = None
314+
315+
async def mock_search(self, *args, **kwargs):
316+
nonlocal filter
317+
filter = kwargs.get("filter")
318+
return MockAsyncPageIterator(data=[{"sourcefile": "Benefit_Options.pdf"}])
319+
320+
monkeypatch.setattr(SearchClient, "search", mock_search)
321+
322+
assert (
323+
await auth_helper_require_access_control.check_path_auth(
324+
path="Benefit_Options.pdf#textafterfragment",
325+
auth_claims={"oid": "OID_X", "groups": ["GROUP_Y", "GROUP_Z"]},
326+
search_client=create_search_client(),
327+
)
328+
is True
329+
)
330+
assert (
331+
filter
332+
== "(oids/any(g:search.in(g, 'OID_X')) or groups/any(g:search.in(g, 'GROUP_Y, GROUP_Z'))) and ((sourcefile eq 'Benefit_Options.pdf') or (sourcepage eq 'Benefit_Options.pdf'))"
250333
)
251334

252335

0 commit comments

Comments
 (0)