|
1 |
| -import os |
2 | 1 | import argparse
|
| 2 | +import base64 |
3 | 3 | import glob
|
4 | 4 | import html
|
5 | 5 | import io
|
| 6 | +import os |
6 | 7 | import re
|
7 | 8 | import time
|
8 |
| -from pypdf import PdfReader, PdfWriter |
9 |
| -from azure.identity import AzureDeveloperCliCredential |
| 9 | + |
| 10 | +from azure.ai.formrecognizer import DocumentAnalysisClient |
10 | 11 | from azure.core.credentials import AzureKeyCredential
|
11 |
| -from azure.storage.blob import BlobServiceClient |
| 12 | +from azure.identity import AzureDeveloperCliCredential |
| 13 | +from azure.search.documents import SearchClient |
12 | 14 | from azure.search.documents.indexes import SearchIndexClient
|
13 | 15 | from azure.search.documents.indexes.models import *
|
14 |
| -from azure.search.documents import SearchClient |
15 |
| -from azure.ai.formrecognizer import DocumentAnalysisClient |
| 16 | +from azure.storage.blob import BlobServiceClient |
| 17 | +from pypdf import PdfReader, PdfWriter |
16 | 18 |
|
17 | 19 | MAX_SECTION_LENGTH = 1000
|
18 | 20 | SENTENCE_SEARCH_LIMIT = 100
|
19 | 21 | SECTION_OVERLAP = 100
|
20 | 22 |
|
21 |
| -parser = argparse.ArgumentParser( |
22 |
| - description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.", |
23 |
| - epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v" |
24 |
| - ) |
25 |
| -parser.add_argument("files", help="Files to be processed") |
26 |
| -parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run") |
27 |
| -parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage") |
28 |
| -parser.add_argument("--storageaccount", help="Azure Blob Storage account name") |
29 |
| -parser.add_argument("--container", help="Azure Blob Storage container name") |
30 |
| -parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)") |
31 |
| -parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)") |
32 |
| -parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)") |
33 |
| -parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)") |
34 |
| -parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)") |
35 |
| -parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index") |
36 |
| -parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index") |
37 |
| -parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents") |
38 |
| -parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)") |
39 |
| -parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)") |
40 |
| -parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output") |
41 |
| -args = parser.parse_args() |
42 |
| - |
43 |
| -# Use the current user identity to connect to Azure services unless a key is explicitly set for any of them |
44 |
| -azd_credential = AzureDeveloperCliCredential() if args.tenantid == None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60) |
45 |
| -default_creds = azd_credential if args.searchkey == None or args.storagekey == None else None |
46 |
| -search_creds = default_creds if args.searchkey == None else AzureKeyCredential(args.searchkey) |
47 |
| -if not args.skipblobs: |
48 |
| - storage_creds = default_creds if args.storagekey == None else args.storagekey |
49 |
| -if not args.localpdfparser: |
50 |
| - # check if Azure Form Recognizer credentials are provided |
51 |
| - if args.formrecognizerservice == None: |
52 |
| - print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.") |
53 |
| - exit(1) |
54 |
| - formrecognizer_creds = default_creds if args.formrecognizerkey == None else AzureKeyCredential(args.formrecognizerkey) |
55 | 23 |
|
56 | 24 | def blob_name_from_file_page(filename, page = 0):
|
57 | 25 | if os.path.splitext(filename)[1].lower() == ".pdf":
|
@@ -220,10 +188,16 @@ def find_page(offset):
|
220 | 188 | if start + SECTION_OVERLAP < end:
|
221 | 189 | yield (all_text[start:end], find_page(start))
|
222 | 190 |
|
def filename_to_id(filename):
    """Build the per-file prefix used for search document ids.

    The result has the form ``file-<sanitized>-<hex>``:

    * ``<sanitized>`` is *filename* with every character other than ASCII
      letters, digits, ``_`` and ``-`` replaced by ``_``.
    * ``<hex>`` is the upper-case base16 encoding of the UTF-8 bytes of the
      unmodified *filename*, so two names that sanitize to the same text
      still produce different ids.
    """
    # Encode the original name first; the sanitized form alone is lossy.
    encoded_name = base64.b16encode(filename.encode('utf-8')).decode('ascii')
    sanitized_name = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    return f"file-{sanitized_name}-{encoded_name}"
| 195 | + |
223 | 196 | def create_sections(filename, page_map):
|
| 197 | + file_id = filename_to_id(filename) |
224 | 198 | for i, (section, pagenum) in enumerate(split_text(page_map)):
|
225 | 199 | yield {
|
226 |
| - "id": re.sub("[^0-9a-zA-Z_-]","_",f"{filename}-{i}"), |
| 200 | + "id": f"{file_id}-page-{i}", |
227 | 201 | "content": section,
|
228 | 202 | "category": args.category,
|
229 | 203 | "sourcepage": blob_name_from_file_page(filename, pagenum),
|
@@ -291,25 +265,64 @@ def remove_from_index(filename):
|
291 | 265 | # It can take a few seconds for search results to reflect changes, so wait a bit
|
292 | 266 | time.sleep(2)
|
293 | 267 |
|
294 |
| -if args.removeall: |
295 |
| - remove_blobs(None) |
296 |
| - remove_from_index(None) |
297 |
| -else: |
298 |
| - if not args.remove: |
299 |
| - create_search_index() |
300 |
| - |
301 |
| - print(f"Processing files...") |
302 |
| - for filename in glob.glob(args.files): |
303 |
| - if args.verbose: print(f"Processing '{filename}'") |
304 |
| - if args.remove: |
305 |
| - remove_blobs(filename) |
306 |
| - remove_from_index(filename) |
307 |
| - elif args.removeall: |
308 |
| - remove_blobs(None) |
309 |
| - remove_from_index(None) |
310 |
| - else: |
311 |
| - if not args.skipblobs: |
312 |
| - upload_blobs(filename) |
313 |
| - page_map = get_document_text(filename) |
314 |
| - sections = create_sections(os.path.basename(filename), page_map) |
315 |
| - index_sections(os.path.basename(filename), sections) |
| 268 | + |
if __name__ == "__main__":
    # Script entry point: parse the command line, choose credentials for each
    # Azure service, then either remove content or extract/upload/index every
    # file matched by the `files` glob.
    #
    # NOTE: the names assigned in this block are module-level globals.
    # create_sections() reads `args` directly; the credential variables are
    # presumably read by the helper functions defined elsewhere in this file
    # (verify before renaming any of them).

    parser = argparse.ArgumentParser(
        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
        # Backslashes are escaped so the Windows-style example path is kept
        # literally without relying on the invalid "\d" and "\*" escapes.
        epilog="Example: prepdocs.py '..\\data\\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v",
    )
    parser.add_argument("files", help="Files to be processed")
    parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
    parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
    parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
    parser.add_argument("--container", help="Azure Blob Storage container name")
    parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)")
    parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)")
    parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)")
    parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
    parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
    parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents")
    parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)")
    parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Use the current user identity to connect to Azure services unless a key
    # is explicitly set for any of them.
    if args.tenantid is None:
        azd_credential = AzureDeveloperCliCredential()
    else:
        azd_credential = AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)

    # The identity credential is needed as soon as ANY service in use lacks an
    # explicit key. Form Recognizer is included in the check: otherwise passing
    # both --searchkey and --storagekey would leave it with None credentials.
    needs_identity = (
        args.searchkey is None
        or args.storagekey is None
        or (not args.localpdfparser and args.formrecognizerkey is None)
    )
    default_creds = azd_credential if needs_identity else None

    search_creds = default_creds if args.searchkey is None else AzureKeyCredential(args.searchkey)
    if not args.skipblobs:
        # The storage key is passed through as a plain string (not wrapped in
        # AzureKeyCredential), exactly as before.
        storage_creds = default_creds if args.storagekey is None else args.storagekey
    if not args.localpdfparser:
        # Form Recognizer is only required when the local pypdf parser is not used.
        if args.formrecognizerservice is None:
            print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.")
            # SystemExit instead of exit(): exit() is injected by the `site`
            # module and is not guaranteed to exist in every interpreter setup.
            raise SystemExit(1)
        formrecognizer_creds = default_creds if args.formrecognizerkey is None else AzureKeyCredential(args.formrecognizerkey)

    if args.removeall:
        # Passing None asks the helpers to remove everything rather than one file.
        remove_blobs(None)
        remove_from_index(None)
    else:
        if not args.remove:
            create_search_index()

        print("Processing files...")
        for filename in glob.glob(args.files):
            if args.verbose:
                print(f"Processing '{filename}'")
            if args.remove:
                remove_blobs(filename)
                remove_from_index(filename)
            else:
                # (--removeall cannot be set here; it is handled above.)
                if not args.skipblobs:
                    upload_blobs(filename)
                page_map = get_document_text(filename)
                # Sections are indexed under the base name, not the full path.
                doc_name = os.path.basename(filename)
                sections = create_sections(doc_name, page_map)
                index_sections(doc_name, sections)
0 commit comments