Handle non-ascii filenames (#418)

pamelafox · web-flow · commit 755188cf7c41 · 2023-07-12T20:32:45.000-07:00
* Handle non-ascii filenames

* Revert port change

* Pytest

* New line

* Rename tests
diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml
@@ -0,0 +1,30 @@
+name: Python check
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  test_package:
+    name: Test ${{ matrix.os }} Python ${{ matrix.python_version }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: ["ubuntu-20.04"]
+        python_version: ["3.10", "3.11"]
+    steps:
+        - uses: actions/checkout@v3
+        - name: Setup python
+          uses: actions/setup-python@v2
+          with:
+            python-version: ${{ matrix.python_version }}
+            architecture: x64
+        - name: Install dependencies
+          run: |
+            python -m pip install --upgrade pip
+            pip install -r requirements-dev.txt
+        - name: Run Python tests
+          run: python3 -m pytest
diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt
@@ -2,6 +2,6 @@ azure-identity==1.13.0
 Flask==2.2.5
 langchain==0.0.187
 openai==0.27.8
-tiktoken==0.3.0
+tiktoken==0.4.0
 azure-search-documents==11.4.0b3
 azure-storage-blob==12.14.1
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,3 @@
+-r app/backend/requirements.txt
+-r scripts/requirements.txt
+pytest
diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py
@@ -1,57 +1,25 @@
-import os
 import argparse
+import base64
 import glob
 import html
 import io
+import os
 import re
 import time
-from pypdf import PdfReader, PdfWriter
-from azure.identity import AzureDeveloperCliCredential
+
+from azure.ai.formrecognizer import DocumentAnalysisClient
 from azure.core.credentials import AzureKeyCredential
-from azure.storage.blob import BlobServiceClient
+from azure.identity import AzureDeveloperCliCredential
+from azure.search.documents import SearchClient
 from azure.search.documents.indexes import SearchIndexClient
 from azure.search.documents.indexes.models import *
-from azure.search.documents import SearchClient
-from azure.ai.formrecognizer import DocumentAnalysisClient
+from azure.storage.blob import BlobServiceClient
+from pypdf import PdfReader, PdfWriter
 
 MAX_SECTION_LENGTH = 1000
 SENTENCE_SEARCH_LIMIT = 100
 SECTION_OVERLAP = 100
 
-parser = argparse.ArgumentParser(
-    description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
-    epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v"
-    )
-parser.add_argument("files", help="Files to be processed")
-parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
-parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
-parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
-parser.add_argument("--container", help="Azure Blob Storage container name")
-parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
-parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)")
-parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)")
-parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)")
-parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
-parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
-parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
-parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents")
-parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)")
-parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)")
-parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
-args = parser.parse_args()
-
-# Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
-azd_credential = AzureDeveloperCliCredential() if args.tenantid == None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)
-default_creds = azd_credential if args.searchkey == None or args.storagekey == None else None
-search_creds = default_creds if args.searchkey == None else AzureKeyCredential(args.searchkey)
-if not args.skipblobs:
-    storage_creds = default_creds if args.storagekey == None else args.storagekey
-if not args.localpdfparser:
-    # check if Azure Form Recognizer credentials are provided
-    if args.formrecognizerservice == None:
-        print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.")
-        exit(1)
-    formrecognizer_creds = default_creds if args.formrecognizerkey == None else AzureKeyCredential(args.formrecognizerkey)
 
 def blob_name_from_file_page(filename, page = 0):
     if os.path.splitext(filename)[1].lower() == ".pdf":
@@ -220,10 +188,16 @@ def find_page(offset):
     if start + SECTION_OVERLAP < end:
         yield (all_text[start:end], find_page(start))
 
+def filename_to_id(filename):
+    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
+    filename_hash = base64.b16encode(filename.encode('utf-8')).decode('ascii')
+    return f"file-{filename_ascii}-{filename_hash}"
+
 def create_sections(filename, page_map):
+    file_id = filename_to_id(filename)
     for i, (section, pagenum) in enumerate(split_text(page_map)):
         yield {
-            "id": re.sub("[^0-9a-zA-Z_-]","_",f"{filename}-{i}"),
+            "id": f"{file_id}-page-{i}",
             "content": section,
             "category": args.category,
             "sourcepage": blob_name_from_file_page(filename, pagenum),
@@ -291,25 +265,64 @@ def remove_from_index(filename):
         # It can take a few seconds for search results to reflect changes, so wait a bit
         time.sleep(2)
 
-if args.removeall:
-    remove_blobs(None)
-    remove_from_index(None)
-else:
-    if not args.remove:
-        create_search_index()
-    
-    print(f"Processing files...")
-    for filename in glob.glob(args.files):
-        if args.verbose: print(f"Processing '{filename}'")
-        if args.remove:
-            remove_blobs(filename)
-            remove_from_index(filename)
-        elif args.removeall:
-            remove_blobs(None)
-            remove_from_index(None)
-        else:
-            if not args.skipblobs:
-                upload_blobs(filename)
-            page_map = get_document_text(filename)
-            sections = create_sections(os.path.basename(filename), page_map)
-            index_sections(os.path.basename(filename), sections)
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(
+        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
+        epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v"
+        )
+    parser.add_argument("files", help="Files to be processed")
+    parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
+    parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
+    parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
+    parser.add_argument("--container", help="Azure Blob Storage container name")
+    parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
+    parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)")
+    parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)")
+    parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)")
+    parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
+    parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
+    parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
+    parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents")
+    parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)")
+    parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)")
+    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
+    args = parser.parse_args()
+
+    # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
+    azd_credential = AzureDeveloperCliCredential() if args.tenantid == None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)
+    default_creds = azd_credential if args.searchkey == None or args.storagekey == None else None
+    search_creds = default_creds if args.searchkey == None else AzureKeyCredential(args.searchkey)
+    if not args.skipblobs:
+        storage_creds = default_creds if args.storagekey == None else args.storagekey
+    if not args.localpdfparser:
+        # check if Azure Form Recognizer credentials are provided
+        if args.formrecognizerservice == None:
+            print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.")
+            exit(1)
+        formrecognizer_creds = default_creds if args.formrecognizerkey == None else AzureKeyCredential(args.formrecognizerkey)
+
+
+    if args.removeall:
+        remove_blobs(None)
+        remove_from_index(None)
+    else:
+        if not args.remove:
+            create_search_index()
+        
+        print(f"Processing files...")
+        for filename in glob.glob(args.files):
+            if args.verbose: print(f"Processing '{filename}'")
+            if args.remove:
+                remove_blobs(filename)
+                remove_from_index(filename)
+            elif args.removeall:
+                remove_blobs(None)
+                remove_from_index(None)
+            else:
+                if not args.skipblobs:
+                    upload_blobs(filename)
+                page_map = get_document_text(filename)
+                sections = create_sections(os.path.basename(filename), page_map)
+                index_sections(os.path.basename(filename), sections)
diff --git a/scripts/test_prepdocs.py b/scripts/test_prepdocs.py
@@ -0,0 +1,9 @@
+from prepdocs import filename_to_id
+
+def test_filename_to_id():
+    # test ascii filename
+    assert filename_to_id("foo.pdf") == "file-foo_pdf-666F6F2E706466"
+    # test filename containing unicode
+    assert filename_to_id("foo\u00A9.txt") == "file-foo__txt-666F6FC2A92E747874"
+    # test filenaming starting with unicode
+    assert filename_to_id("ファイル名.pdf") == "file-______pdf-E38395E382A1E382A4E383ABE5908D2E706466"

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+-r app/backend/requirements.txt`
	`2`	`+-r scripts/requirements.txt`
	`3`	`+pytest`