@@ -48,6 +48,39 @@ def extract_pdf_text(pdf_bytes):
4848 return "PDF_ERROR: PyPDF2 library not available. Install with: pip install PyPDF2"
4949 except Exception as e :
5050 return f"PDF_ERROR: Error reading PDF content: { str (e )} "
51+
52+
53+ # DOCX text extraction function
def extract_docx_text(docx_bytes):
    """Extract the readable text of an in-memory DOCX document using python-docx.

    Non-blank body paragraphs are collected first, followed by the text of
    non-blank table cells (row by row); the pieces are joined with newlines.

    Args:
        docx_bytes: Raw bytes of a ``.docx`` file.

    Returns:
        The extracted text. This function never raises; failures are reported
        through sentinel strings instead:

        * ``"DOCX_NO_TEXT: ..."`` when the document has no readable text.
        * ``"DOCX_ERROR: ..."`` when python-docx is not installed or the
          bytes cannot be parsed as a DOCX document.
    """
    # Keep the import guard separate so that only a genuinely missing
    # dependency produces the "library not available" message.
    try:
        import io

        from docx import Document
    except ImportError:
        return "DOCX_ERROR: python-docx library not available. Install with: pip install python-docx"

    try:
        doc = Document(io.BytesIO(docx_bytes))

        # Body paragraphs, skipping blank ones.
        text_content = [
            paragraph.text for paragraph in doc.paragraphs if paragraph.text.strip()
        ]

        # Table cells. python-docx returns a merged cell once for every grid
        # position it spans (horizontally and vertically), so track the
        # underlying cell elements already emitted to avoid repeating text.
        # NOTE(review): relies on the private ``_tc`` attribute of python-docx
        # cells; if it is absent the cell object itself is used as the key and
        # the loop degrades to the previous (non-deduplicating) behaviour.
        for table in doc.tables:
            seen_cells = set()
            for row in table.rows:
                for cell in row.cells:
                    cell_key = getattr(cell, "_tc", cell)
                    if cell_key in seen_cells:
                        continue
                    seen_cells.add(cell_key)
                    cell_text = cell.text
                    if cell_text.strip():
                        text_content.append(cell_text)
    except Exception as e:
        # Best-effort extraction: report the failure to the caller as text
        # rather than raising.
        return f"DOCX_ERROR: Error reading DOCX content: {e}"

    full_text = "\n".join(text_content).strip()
    return full_text if full_text else "DOCX_NO_TEXT: No readable text content found in DOCX."
5184
5285if len (sys .argv ) < 4 :
5386 print ("Usage: python index_datasets.py <storage_account_name> <blob_container_name> <ai_search_endpoint> [<ai_search_index_name>]" )
@@ -106,6 +139,8 @@ def extract_pdf_text(pdf_bytes):
106139 # Check if this is a PDF file and process accordingly
107140 if blob .name .lower ().endswith ('.pdf' ):
108141 text = extract_pdf_text (data )
142+ elif blob .name .lower ().endswith ('.docx' ):
143+ text = extract_docx_text (data )
109144 else :
110145 # Fallback for files that are neither PDF nor DOCX: decode the blob as UTF-8 text
111146 text = data .decode ('utf-8' )
0 commit comments