Skip to content

Commit 755188c

Browse files
authored
Handle non-ascii filenames (#418)
* Handle non-ascii filenames * Revert port change * Pytest * New line * Rename tests
1 parent e3dc087 commit 755188c

File tree

5 files changed

+119
-64
lines changed

5 files changed

+119
-64
lines changed

.github/workflows/python-test.yaml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# CI workflow: installs the dev requirements and runs the pytest suite on every
# push to main and on every pull request that targets main.
name: Python check

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  test_package:
    name: Test ${{ matrix.os }} Python ${{ matrix.python_version }}
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining matrix entries finish even if one of them fails.
      fail-fast: false
      matrix:
        os: ["ubuntu-20.04"]
        python_version: ["3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
      - name: Setup python
        # v4 replaces the deprecated Node 12 based v2 release of this action.
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
          architecture: x64
      - name: Install dependencies
        # Invoke pip through the interpreter selected above so packages land in
        # the same environment that later runs the tests.
        run: |
          python -m pip install --upgrade pip
          python -m pip install -r requirements-dev.txt
      - name: Run Python tests
        run: python -m pytest

app/backend/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,6 @@ azure-identity==1.13.0
22
Flask==2.2.5
33
langchain==0.0.187
44
openai==0.27.8
5-
tiktoken==0.3.0
5+
tiktoken==0.4.0
66
azure-search-documents==11.4.0b3
77
azure-storage-blob==12.14.1

requirements-dev.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
-r app/backend/requirements.txt
2+
-r scripts/requirements.txt
3+
pytest

scripts/prepdocs.py

Lines changed: 76 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,25 @@
1-
import os
21
import argparse
2+
import base64
33
import glob
44
import html
55
import io
6+
import os
67
import re
78
import time
8-
from pypdf import PdfReader, PdfWriter
9-
from azure.identity import AzureDeveloperCliCredential
9+
10+
from azure.ai.formrecognizer import DocumentAnalysisClient
1011
from azure.core.credentials import AzureKeyCredential
11-
from azure.storage.blob import BlobServiceClient
12+
from azure.identity import AzureDeveloperCliCredential
13+
from azure.search.documents import SearchClient
1214
from azure.search.documents.indexes import SearchIndexClient
1315
from azure.search.documents.indexes.models import *
14-
from azure.search.documents import SearchClient
15-
from azure.ai.formrecognizer import DocumentAnalysisClient
16+
from azure.storage.blob import BlobServiceClient
17+
from pypdf import PdfReader, PdfWriter
1618

1719
MAX_SECTION_LENGTH = 1000
1820
SENTENCE_SEARCH_LIMIT = 100
1921
SECTION_OVERLAP = 100
2022

21-
parser = argparse.ArgumentParser(
22-
description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
23-
epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v"
24-
)
25-
parser.add_argument("files", help="Files to be processed")
26-
parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
27-
parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
28-
parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
29-
parser.add_argument("--container", help="Azure Blob Storage container name")
30-
parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
31-
parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)")
32-
parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)")
33-
parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)")
34-
parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
35-
parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
36-
parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
37-
parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents")
38-
parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)")
39-
parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)")
40-
parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
41-
args = parser.parse_args()
42-
43-
# Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
44-
azd_credential = AzureDeveloperCliCredential() if args.tenantid == None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)
45-
default_creds = azd_credential if args.searchkey == None or args.storagekey == None else None
46-
search_creds = default_creds if args.searchkey == None else AzureKeyCredential(args.searchkey)
47-
if not args.skipblobs:
48-
storage_creds = default_creds if args.storagekey == None else args.storagekey
49-
if not args.localpdfparser:
50-
# check if Azure Form Recognizer credentials are provided
51-
if args.formrecognizerservice == None:
52-
print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.")
53-
exit(1)
54-
formrecognizer_creds = default_creds if args.formrecognizerkey == None else AzureKeyCredential(args.formrecognizerkey)
5523

5624
def blob_name_from_file_page(filename, page = 0):
5725
if os.path.splitext(filename)[1].lower() == ".pdf":
@@ -220,10 +188,16 @@ def find_page(offset):
220188
if start + SECTION_OVERLAP < end:
221189
yield (all_text[start:end], find_page(start))
222190

191+
def filename_to_id(filename):
    """Build an ID string for *filename*.

    The result has the form ``file-<sanitized>-<hex>``:

    * ``<sanitized>`` is the filename with every character outside
      ``[0-9a-zA-Z_-]`` replaced by an underscore.
    * ``<hex>`` is the uppercase base16 encoding of the filename's UTF-8
      bytes. Because it encodes the full original name, two filenames that
      sanitize to the same text still produce different IDs.
    """
    utf8_bytes = filename.encode('utf-8')
    hex_suffix = base64.b16encode(utf8_bytes).decode('ascii')
    safe_name = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    return "file-" + safe_name + "-" + hex_suffix
195+
223196
def create_sections(filename, page_map):
197+
file_id = filename_to_id(filename)
224198
for i, (section, pagenum) in enumerate(split_text(page_map)):
225199
yield {
226-
"id": re.sub("[^0-9a-zA-Z_-]","_",f"{filename}-{i}"),
200+
"id": f"{file_id}-page-{i}",
227201
"content": section,
228202
"category": args.category,
229203
"sourcepage": blob_name_from_file_page(filename, pagenum),
@@ -291,25 +265,64 @@ def remove_from_index(filename):
291265
# It can take a few seconds for search results to reflect changes, so wait a bit
292266
time.sleep(2)
293267

294-
if args.removeall:
295-
remove_blobs(None)
296-
remove_from_index(None)
297-
else:
298-
if not args.remove:
299-
create_search_index()
300-
301-
print(f"Processing files...")
302-
for filename in glob.glob(args.files):
303-
if args.verbose: print(f"Processing '{filename}'")
304-
if args.remove:
305-
remove_blobs(filename)
306-
remove_from_index(filename)
307-
elif args.removeall:
308-
remove_blobs(None)
309-
remove_from_index(None)
310-
else:
311-
if not args.skipblobs:
312-
upload_blobs(filename)
313-
page_map = get_document_text(filename)
314-
sections = create_sections(os.path.basename(filename), page_map)
315-
index_sections(os.path.basename(filename), sections)
268+
269+
if __name__ == "__main__":
    # Command-line entry point: parse arguments, set up the credentials for
    # each Azure service, then either remove content or process every file
    # matched by the "files" glob pattern.
    # NOTE: the names assigned here (args, search_creds, storage_creds,
    # formrecognizer_creds, ...) are module-level globals; create_sections
    # reads args.category, so they must not be moved into a function scope.

    parser = argparse.ArgumentParser(
        description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
        # Raw string: the Windows-style example path contains backslashes that
        # would otherwise be parsed as (invalid) escape sequences.
        epilog=r"Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v"
        )
    parser.add_argument("files", help="Files to be processed")
    parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
    parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
    parser.add_argument("--storageaccount", help="Azure Blob Storage account name")
    parser.add_argument("--container", help="Azure Blob Storage container name")
    parser.add_argument("--storagekey", required=False, help="Optional. Use this Azure Blob Storage account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--tenantid", required=False, help="Optional. Use this to define the Azure directory where to authenticate)")
    parser.add_argument("--searchservice", help="Name of the Azure Cognitive Search service where content should be indexed (must exist already)")
    parser.add_argument("--index", help="Name of the Azure Cognitive Search index where content should be indexed (will be created if it doesn't exist)")
    parser.add_argument("--searchkey", required=False, help="Optional. Use this Azure Cognitive Search account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--remove", action="store_true", help="Remove references to this document from blob storage and the search index")
    parser.add_argument("--removeall", action="store_true", help="Remove all blobs from blob storage and documents from the search index")
    parser.add_argument("--localpdfparser", action="store_true", help="Use PyPdf local PDF parser (supports only digital PDFs) instead of Azure Form Recognizer service to extract text, tables and layout from the documents")
    parser.add_argument("--formrecognizerservice", required=False, help="Optional. Name of the Azure Form Recognizer service which will be used to extract text, tables and layout from the documents (must exist already)")
    parser.add_argument("--formrecognizerkey", required=False, help="Optional. Use this Azure Form Recognizer account key instead of the current user identity to login (use az login to set current user for Azure)")
    parser.add_argument("--verbose", "-v", action="store_true", help="Verbose output")
    args = parser.parse_args()

    # Use the current user identity to connect to Azure services unless a key is explicitly set for any of them
    azd_credential = AzureDeveloperCliCredential() if args.tenantid is None else AzureDeveloperCliCredential(tenant_id=args.tenantid, process_timeout=60)
    # The identity credential is needed whenever at least one service in use
    # has no explicit key. Form Recognizer is included so that supplying only
    # the search and storage keys does not leave it with a None credential.
    default_creds = azd_credential if (
        args.searchkey is None
        or args.storagekey is None
        or (not args.localpdfparser and args.formrecognizerkey is None)
    ) else None
    search_creds = default_creds if args.searchkey is None else AzureKeyCredential(args.searchkey)
    if not args.skipblobs:
        storage_creds = default_creds if args.storagekey is None else args.storagekey
    if not args.localpdfparser:
        # check if Azure Form Recognizer credentials are provided
        if args.formrecognizerservice is None:
            print("Error: Azure Form Recognizer service is not provided. Please provide formrecognizerservice or use --localpdfparser for local pypdf parser.")
            raise SystemExit(1)
        formrecognizer_creds = default_creds if args.formrecognizerkey is None else AzureKeyCredential(args.formrecognizerkey)

    if args.removeall:
        # None is passed instead of a filename; presumably the helpers treat
        # that as "remove everything" (matches the --removeall help text).
        remove_blobs(None)
        remove_from_index(None)
    else:
        if not args.remove:
            create_search_index()

        print("Processing files...")
        for filename in glob.glob(args.files):
            if args.verbose:
                print(f"Processing '{filename}'")
            if args.remove:
                remove_blobs(filename)
                remove_from_index(filename)
            else:
                # --removeall is handled before the loop, so only the
                # upload/index path remains here.
                if not args.skipblobs:
                    upload_blobs(filename)
                page_map = get_document_text(filename)
                sections = create_sections(os.path.basename(filename), page_map)
                index_sections(os.path.basename(filename), sections)

scripts/test_prepdocs.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
from prepdocs import filename_to_id
2+
3+
def test_filename_to_id():
    """Check filename_to_id against known IDs for ASCII and non-ASCII names."""
    expected_ids = {
        # plain ASCII filename
        "foo.pdf": "file-foo_pdf-666F6F2E706466",
        # non-ASCII character inside the filename
        "foo\u00A9.txt": "file-foo__txt-666F6FC2A92E747874",
        # filename starting with non-ASCII characters
        "ファイル名.pdf": "file-______pdf-E38395E382A1E382A4E383ABE5908D2E706466",
    }
    for name, expected in expected_ids.items():
        assert filename_to_id(name) == expected

0 commit comments

Comments
 (0)