Skip to content

Commit 69e03a7

Browse files
RFP deployment update
1 parent 75259d9 commit 69e03a7

11 files changed

+170
-202
lines changed
Binary file not shown.
Binary file not shown.
Binary file not shown.
127 KB
Binary file not shown.
127 KB
Binary file not shown.
Binary file not shown.

docs/DeploymentGuide.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,4 +491,4 @@ To debug the python server in the frontend directory (frontend_server.py) and re
491491
"args": ["frontend_server:app", "--port", "3000", "--reload"],
492492
"jinja": true
493493
}
494-
```
494+
```

infra/scripts/index_datasets.py

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,49 @@
55
from azure.storage.blob import BlobServiceClient
66
import sys
77

8+
# PDF text extraction function
9+
def extract_pdf_text(pdf_bytes):
    """Extract the text content of a PDF supplied as raw bytes using PyPDF2.

    The function never raises: every outcome, including failures, is reported
    through the returned string.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The text of all readable pages joined with newlines, or one of these
        sentinel messages:

        * ``"PDF_PROTECTED: ..."`` when the reader reports the document as
          encrypted, or the extracted text contains a known protection notice.
        * ``"PDF_NO_TEXT: ..."`` when no page yields any non-blank text.
        * ``"PDF_ERROR: ..."`` when PyPDF2 is not installed or the document
          cannot be read.
    """
    try:
        # Imported lazily so that a missing PyPDF2 is reported through the
        # ImportError handler below instead of failing at module import time.
        import PyPDF2
        import io

        pdf_file = io.BytesIO(pdf_bytes)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        # Check if PDF is encrypted/protected
        if pdf_reader.is_encrypted:
            return "PDF_PROTECTED: This PDF document is password-protected or encrypted and cannot be processed."

        text_content = []
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            try:
                page_text = page.extract_text()
            except Exception as page_error:
                # Extraction stays best-effort (one bad page must not discard
                # the whole document), but the skipped page is reported rather
                # than silently dropped.
                print(f"Warning: skipping PDF page {page_number}, text extraction failed: {page_error}")
                continue
            if page_text and page_text.strip():
                text_content.append(page_text)

        full_text = "\n".join(text_content).strip()

        # Notices that protected documents render in place of real content.
        protection_indicators = (
            "protected by Microsoft Office",
            "You'll need a different reader",
            "Download a compatible PDF reader",
            "This PDF Document has been protected",
        )

        # Lower-case the (potentially large) document text once, not once per
        # indicator.
        lowered_text = full_text.lower()
        if any(indicator.lower() in lowered_text for indicator in protection_indicators):
            return "PDF_PROTECTED: This PDF document appears to be protected or encrypted."

        return full_text if full_text else "PDF_NO_TEXT: No readable text content found in PDF."

    except ImportError:
        return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
    except Exception as e:
        return f"PDF_ERROR: Error reading PDF content: {e}"
50+
851
if len(sys.argv) < 4:
952
print("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint> [<ai_search_index_name>]")
1053
sys.exit(1)
@@ -51,11 +94,19 @@
5194
#if blob.name.endswith(".csv"):
5295
title = blob.name.replace(".csv", "")
5396
title = blob.name.replace(".json", "")
97+
title = blob.name.replace(".pdf", "") # Also handle PDF extension
5498
data = container_client.download_blob(blob.name).readall()
5599

56100
try:
57101
print(f"Reading data from blob: {blob.name}...")
58-
text = data.decode('utf-8')
102+
103+
# Check if this is a PDF file and process accordingly
104+
if blob.name.lower().endswith('.pdf'):
105+
text = extract_pdf_text(data)
106+
else:
107+
# Original processing for non-PDF files
108+
text = data.decode('utf-8')
109+
59110
data_list.append({
60111
"content": text,
61112
"id": str(idx),

infra/scripts/index_rfp_data.py

Lines changed: 0 additions & 100 deletions
This file was deleted.

infra/scripts/process_sample_data.sh

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,14 @@ fi
123123

124124

125125
#Upload sample CSV files to blob storage
126-
echo "Uploading CSV sample files to blob storage..."
126+
echo "Uploading CSV and JSON sample files to blob storage..."
127127
az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.csv' --overwrite --output none
128+
az storage blob upload-batch --account-name "$storageAccount" --destination "$blobContainer" --source "data/datasets" --auth-mode login --pattern '*.json' --overwrite --output none
128129
if [ $? -ne 0 ]; then
129-
echo "Error: Failed to upload CSV files to blob storage."
130+
echo "Error: Failed to upload CSV and JSON files to blob storage."
130131
exit 1
131132
fi
132-
echo "CSV files uploaded successfully to blob storage."
133+
echo "CSV and JSON files uploaded successfully to blob storage."
133134

134135
#Upload PDF files from RFP_dataset to blob storage
135136
echo "Uploading PDF files from RFP_dataset to blob storage..."
@@ -194,14 +195,14 @@ if [ "$has_csv" = true ]; then
194195
fi
195196
fi
196197

197-
if [ "$has_pdf" = true ]; then
198-
echo "Running the python script to index PDF data"
199-
$PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
200-
if [ $? -ne 0 ]; then
201-
echo "Error: PDF indexing python script execution failed."
202-
exit 1
203-
fi
204-
fi
198+
# if [ "$has_pdf" = true ]; then
199+
# echo "Running the python script to index PDF data"
200+
# $PYTHON_CMD infra/scripts/index_rfp_data.py "$storageAccount" "$blobContainer" "$aiSearch" "$aiSearchIndex"
201+
# if [ $? -ne 0 ]; then
202+
# echo "Error: PDF indexing python script execution failed."
203+
# exit 1
204+
# fi
205+
# fi
205206

206207
if [ "$has_csv" = false ] && [ "$has_pdf" = false ]; then
207208
echo "No CSV or PDF files found to index."
@@ -228,4 +229,4 @@ if [ "$srchIsPublicAccessDisabled" = true ]; then
228229
echo "Public access disabled for search service: $aiSearch"
229230
fi
230231

231-
echo "Script executed successfully. Sample Data Processed Successfully."
232+
echo "Script executed successfully. Sample Data Processed Successfully."

0 commit comments

Comments
 (0)