Skip to content

Commit 1242f44

Browse files
committed
sample data docx upload
1 parent b336e93 commit 1242f44

File tree

2 files changed

+35
-0
lines changed

2 files changed

+35
-0
lines changed

infra/scripts/index_datasets.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,39 @@ def extract_pdf_text(pdf_bytes):
4848
return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
4949
except Exception as e:
5050
return f"PDF_ERROR: Error reading PDF content: {str(e)}"
51+
52+
53+
# DOCX text extraction function
54+
def extract_docx_text(docx_bytes):
    """Extract text content from DOCX bytes using python-docx.

    Text is collected from the document's paragraphs first and then from
    its tables (row by row, cell by cell), so all paragraph text precedes
    all table text in the result. Entries are joined with newlines.

    Args:
        docx_bytes: Raw bytes of a .docx file.

    Returns:
        The extracted text, or a sentinel string instead of raising:
        - "DOCX_NO_TEXT: ..." when the document contains no readable text,
        - "DOCX_ERROR: ..." when python-docx is missing or parsing fails.
    """
    try:
        # Imported lazily so a missing python-docx install is reported as a
        # DOCX_ERROR string rather than breaking the whole script at import.
        from docx import Document
        import io

        doc = Document(io.BytesIO(docx_bytes))

        # Extract text from paragraphs, skipping whitespace-only ones
        text_content = [
            paragraph.text
            for paragraph in doc.paragraphs
            if paragraph.text.strip()
        ]

        # Extract text from tables
        for table in doc.tables:
            # python-docx yields a merged cell once per grid position it
            # spans; track the underlying cell elements so the text of a
            # merged cell is only emitted once per table.
            seen_cells = set()
            for row in table.rows:
                for cell in row.cells:
                    cell_element = getattr(cell, "_tc", None)
                    if cell_element is not None:
                        if cell_element in seen_cells:
                            continue
                        seen_cells.add(cell_element)
                    if cell.text.strip():
                        text_content.append(cell.text)

        full_text = "\n".join(text_content).strip()
        return full_text if full_text else "DOCX_NO_TEXT: No readable text content found in DOCX."

    except ImportError:
        return "DOCX_ERROR: python-docx library not available. Install with: pip install python-docx"
    except Exception as e:
        # Best-effort by design: callers receive an error marker string
        # instead of an exception so one bad file does not stop indexing.
        return f"DOCX_ERROR: Error reading DOCX content: {str(e)}"
5184

5285
if len(sys.argv) < 4:
5386
print("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint> [<ai_search_index_name>]")
@@ -106,6 +139,8 @@ def extract_pdf_text(pdf_bytes):
106139
# Check if this is a PDF or DOCX file and process accordingly
107140
if blob.name.lower().endswith('.pdf'):
108141
text = extract_pdf_text(data)
142+
elif blob.name.lower().endswith('.docx'):
143+
text = extract_docx_text(data)
109144
else:
110145
# Original processing for files that are neither PDF nor DOCX
111146
text = data.decode('utf-8')

0 commit comments

Comments
 (0)