Skip to content

Commit 0124725

Browse files
authored
Adds storageURL field to track file location (#1535)
* Adds storageURL field to track file location * Fix tests, update mocks * Adding script * Test adls2 strategy * Fixes to manageacl script to let you call update_storage_urls * Add PNG for document intelligence * Set facetable to false
1 parent 12ed699 commit 0124725

File tree

15 files changed

+363
-149
lines changed

15 files changed

+363
-149
lines changed

app/backend/app.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ async def upload(auth_claims: dict[str, Any]):
260260
file_io = io.BufferedReader(file_io)
261261
await file_client.upload_data(file_io, overwrite=True, metadata={"UploadedBy": user_oid})
262262
file_io.seek(0)
263-
ingester = current_app.config[CONFIG_INGESTER]
264-
await ingester.add_file(File(content=file_io, acls={"oids": [user_oid]}))
263+
ingester: UploadUserFileStrategy = current_app.config[CONFIG_INGESTER]
264+
await ingester.add_file(File(content=file_io, acls={"oids": [user_oid]}, url=file_client.url))
265265
return jsonify({"message": "File uploaded successfully"}), 200
266266

267267

app/backend/prepdocslib/blobmanager.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ async def upload_blob(self, file: File) -> Optional[List[str]]:
5353
await container_client.create_container()
5454

5555
# Re-open and upload the original file
56-
with open(file.content.name, "rb") as reopened_file:
57-
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
58-
logger.info("Uploading blob for whole file -> %s", blob_name)
59-
await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
56+
if file.url is None:
57+
with open(file.content.name, "rb") as reopened_file:
58+
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
59+
logger.info("Uploading blob for whole file -> %s", blob_name)
60+
blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
61+
file.url = blob_client.url
6062

6163
if self.store_page_images:
6264
if os.path.splitext(file.content.name)[1].lower() == ".pdf":

app/backend/prepdocslib/filestrategy.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ async def run(self):
8787
blob_image_embeddings: Optional[List[List[float]]] = None
8888
if self.image_embeddings and blob_sas_uris:
8989
blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris)
90-
await search_manager.update_content(sections, blob_image_embeddings)
90+
await search_manager.update_content(sections, blob_image_embeddings, url=file.url)
9191
finally:
9292
if file:
9393
file.close()
@@ -124,7 +124,7 @@ async def add_file(self, file: File):
124124
logging.warning("Image embeddings are not currently supported for the user upload feature")
125125
sections = await parse_file(file, self.file_processors)
126126
if sections:
127-
await self.search_manager.update_content(sections)
127+
await self.search_manager.update_content(sections, url=file.url)
128128

129129
async def remove_file(self, filename: str, oid: str):
130130
if filename is None or filename == "":

app/backend/prepdocslib/listfilestrategy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,10 @@ class File:
2222
This file might contain access control information about which users or groups can access it
2323
"""
2424

25-
def __init__(self, content: IO, acls: Optional[dict[str, list]] = None):
25+
def __init__(self, content: IO, acls: Optional[dict[str, list]] = None, url: Optional[str] = None):
2626
self.content = content
2727
self.acls = acls or {}
28+
self.url = url
2829

2930
def filename(self):
3031
return os.path.basename(self.content.name)
@@ -167,7 +168,7 @@ async def list(self) -> AsyncGenerator[File, None]:
167168
acls["oids"].append(acl_parts[1])
168169
if acl_parts[0] == "group" and "r" in acl_parts[2]:
169170
acls["groups"].append(acl_parts[1])
170-
yield File(content=open(temp_file_path, "rb"), acls=acls)
171+
yield File(content=open(temp_file_path, "rb"), acls=acls, url=file_client.url)
171172
except Exception as data_lake_exception:
172173
logger.error(f"\tGot an error while reading {path} -> {data_lake_exception} --> skipping file")
173174
try:

app/backend/prepdocslib/searchmanager.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,12 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
111111
filterable=True,
112112
facetable=True,
113113
),
114+
SimpleField(
115+
name="storageUrl",
116+
type="Edm.String",
117+
filterable=True,
118+
facetable=False,
119+
),
114120
]
115121
if self.use_acls:
116122
fields.append(
@@ -182,7 +188,9 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
182188
else:
183189
logger.info("Search index %s already exists", self.search_info.index_name)
184190

185-
async def update_content(self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None):
191+
async def update_content(
192+
self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None
193+
):
186194
MAX_BATCH_SIZE = 1000
187195
section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)]
188196

@@ -209,6 +217,9 @@ async def update_content(self, sections: List[Section], image_embeddings: Option
209217
}
210218
for section_index, section in enumerate(batch)
211219
]
220+
if url:
221+
for document in documents:
222+
document["storageUrl"] = url
212223
if self.embeddings:
213224
embeddings = await self.embeddings.create_embeddings(
214225
texts=[section.split_page.text for section in batch]

app/frontend/src/components/UploadFile/UploadFile.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ export const UploadFile: React.FC<Props> = ({ className, disabled }: Props) => {
120120
<div>
121121
<Label>Upload file:</Label>
122122
<input
123-
accept=".txt, .md, .json, .jpg, .jpeg, .bmp, .heic, .tiff, .pdf, .docx, .xlsx, .pptx, .html"
123+
accept=".txt, .md, .json, .png, .jpg, .jpeg, .bmp, .heic, .tiff, .pdf, .docx, .xlsx, .pptx, .html"
124124
className={styles.chooseFiles}
125125
type="file"
126126
onChange={handleUploadFile}

docs/deploy_features.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,21 @@ Then you'll need to run `azd up` to provision an Azure Data Lake Storage Gen2 ac
136136
When the user uploads a document, it will be stored in a directory in that account with the same name as the user's Entra object id,
137137
and will have ACLs associated with that directory. When the ingester runs, it will also set the `oids` of the indexed chunks to the user's Entra object id.
138138
139+
If you are enabling this feature on an existing index, you should also update your index to have the new `storageUrl` field:
140+
141+
```shell
142+
./scripts/manageacl.ps1 -v --acl-action enable_acls
143+
```
144+
145+
And then update existing search documents with the storage URL of the main Blob container:
146+
147+
```shell
148+
./scripts/manageacl.ps1 -v --acl-action update_storage_urls --url https://YOUR-MAIN-STORAGE-ACCOUNT.blob.core.windows.net/content/
149+
```
150+
151+
Going forward, all uploaded documents will have their `storageUrl` set in the search index.
152+
This is necessary to disambiguate user-uploaded documents from admin-uploaded documents.
153+
139154
140155
## Enabling CORS for an alternate frontend
141156

docs/login_and_acl.md

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,17 +150,46 @@ Manually enable document level access control on a search index and manually set
150150

151151
Run `azd up` or use `azd env set` to manually set `AZURE_SEARCH_SERVICE` and `AZURE_SEARCH_INDEX` environment variables prior to running the script.
152152

153-
The script supports the following commands. Note that the syntax is the same regardless of whether [manageacl.ps1](../scripts/manageacl.ps1) or [manageacl.sh](../scripts/manageacl.sh) is used.
154-
* `./scripts/manageacl.ps1 --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index. Does nothing if these fields already exist.
155-
* Example usage: `./scripts/manageacl.ps1 --acl-action enable_acls`
156-
* `./scripts/manageacl.ps1 --document [name-of-pdf.pdf] --acl-type [oids or groups]--acl-action view`: Prints access control values associated with either User IDs or Group IDs for a specific document.
157-
* Example to view all Group IDs from the Benefit_Options PDF: `./scripts/manageacl.ps1 --document Benefit_Options.pdf --acl-type oids --acl-action view`.
158-
* `./scripts/manageacl.ps1 --document [name-of-pdf.pdf] --acl-type [oids or groups]--acl-action add --acl [ID of group or user]`: Adds an access control value associated with either User IDs or Group IDs for a specific document.
159-
* Example to add a Group ID to the Benefit_Options PDF: `./scripts/manageacl.ps1 --document Benefit_Options.pdf --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`.
160-
* `./scripts/manageacl.ps1 --document [name-of-pdf.pdf] --acl-type [oids or groups]--acl-action remove_all`: Removes all access control values associated with either User IDs or Group IDs for a specific document.
161-
* Example to remove all Group IDs from the Benefit_Options PDF: `./scripts/manageacl.ps1 --document Benefit_Options.pdf --acl-type groups --acl-action remove_all`.
162-
* `./scripts/manageacl.ps1 --document [name-of-pdf.pdf] --acl-type [oids or groups]--acl-action remove --acl [ID of group or user]`: Removes an access control value associated with either User IDs or Group IDs for a specific document.
163-
* Example to remove a specific User ID from the Benefit_Options PDF: `./scripts/manageacl.ps1 --document Benefit_Options.pdf --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx`.
153+
The script supports the following commands. Note that the syntax is the same regardless of whether [manageacl.ps1](../scripts/manageacl.ps1) or [manageacl.sh](../scripts/manageacl.sh) is used. All commands support `-v` for verbose logging.
154+
* `./scripts/manageacl.ps1 --acl-action enable_acls`: Creates the required `oids` (User ID) and `groups` (Group IDs) [security filter](https://learn.microsoft.com/azure/search/search-security-trimming-for-azure-search) fields for document level access control on your index, as well as the `storageUrl` field for storing the Blob storage URL. Does nothing if these fields already exist.
155+
156+
Example usage:
157+
158+
```shell
159+
./scripts/manageacl.ps1 -v --acl-action enable_acls
160+
```
161+
162+
* `./scripts/manageacl.ps1 --acl-type [oids or groups] --acl-action view --url [https://url.pdf]`: Prints access control values associated with either User IDs or Group IDs for the document at the specified URL.
163+
164+
Example to view all Group IDs:
165+
166+
```shell
167+
./scripts/manageacl.ps1 -v --acl-type groups --acl-action view --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
168+
```
169+
170+
* `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups] --acl-action add --acl [ID of group or user]`: Adds an access control value associated with either User IDs or Group IDs for the document at the specified URL.
171+
172+
Example to add a Group ID:
173+
174+
```shell
175+
./scripts/manageacl.ps1 -v --acl-type groups --acl-action add --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
176+
```
177+
178+
* `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups] --acl-action remove_all`: Removes all access control values associated with either User IDs or Group IDs for the document at the specified URL.
179+
180+
Example to remove all Group IDs:
181+
182+
```shell
183+
./scripts/manageacl.ps1 -v --acl-type groups --acl-action remove_all --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
184+
```
185+
186+
* `./scripts/manageacl.ps1 --url [https://url.pdf] --acl-type [oids or groups] --acl-action remove --acl [ID of group or user]`: Removes an access control value associated with either User IDs or Group IDs for the document at the specified URL.
187+
188+
Example to remove a specific User ID:
189+
190+
```shell
191+
./scripts/manageacl.ps1 -v --acl-type oids --acl-action remove --acl xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx --url https://st12345.blob.core.windows.net/content/Benefit_Options.pdf
192+
```
164193

165194
### Azure Data Lake Storage Gen2 Setup
166195

0 commit comments

Comments
 (0)