Skip to content

Commit f6419a2

Browse files
committed
Fix docx linter temp file lifecycle for lazy-loading
1 parent 4363e33 commit f6419a2

File tree

2 files changed

+73
-46
lines changed

2 files changed

+73
-46
lines changed

services/docx_linter.py

Lines changed: 64 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ async def lint_docx_file(
112112
errors = []
113113
warnings = []
114114

115+
docxtpl_temp_file_path: Optional[str] = None
115116
try:
116117
# Stage 1: Extract plaintext using python-docx (independent of docxtpl)
117118
logger.info(f"Step 1: Extracting plaintext from {filename} using python-docx")
@@ -155,7 +156,7 @@ async def lint_docx_file(
155156

156157
# Stage 4: If syntax is clean, proceed with docxtpl processing
157158
logger.info(f"Step 4: Syntax clean, proceeding with docxtpl processing")
158-
doc_template, raw_xml = self._extract_xml_with_docxtpl(file_content, filename)
159+
doc_template, raw_xml, docxtpl_temp_file_path = self._extract_xml_with_docxtpl(file_content, filename)
159160

160161
# Stage 5: Use docxtpl to process extended docx tags
161162
logger.info(f"Step 5: Processing extended docx tags with docxtpl")
@@ -209,8 +210,15 @@ async def lint_docx_file(
209210
except Exception as e:
210211
logger.error(f"Linting failed for {filename}: {str(e)}")
211212
return self._create_error_result(e, filename, start_time)
213+
finally:
214+
# The docxtpl temp file was kept on disk for all lazy operations (patch_xml, variable detection, etc.); delete it only now that they have finished
215+
if docxtpl_temp_file_path and os.path.exists(docxtpl_temp_file_path):
216+
try:
217+
os.unlink(docxtpl_temp_file_path)
218+
except Exception as e:
219+
logger.warning(f"Failed to delete temp file {docxtpl_temp_file_path}: {e}")
212220

213-
def _extract_xml_with_docxtpl(self, file_content: bytes, filename: str) -> Tuple[DocxTemplate, str]:
221+
def _extract_xml_with_docxtpl(self, file_content: bytes, filename: str) -> Tuple[DocxTemplate, str, str]:
214222
"""
215223
Use docxtpl to extract XML from docx (invoked from Stage 4 of lint_docx_file).
216224
@@ -219,30 +227,34 @@ def _extract_xml_with_docxtpl(self, file_content: bytes, filename: str) -> Tuple
219227
filename: Original filename for error reporting
220228
221229
Returns:
222-
Tuple of (DocxTemplate instance, raw XML string)
230+
Tuple of (DocxTemplate instance, raw XML string, temp file path).
231+
NOTE: The caller is responsible for deleting the returned temp file path
232+
after all docxtpl/python-docx lazy operations are complete.
223233
"""
234+
temp_file_path: Optional[str] = None
224235
try:
225-
# Create temporary file
236+
# Create temporary file - must remain on disk while docxtpl lazily reads it
226237
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
227238
temp_file.write(file_content)
228239
temp_file_path = temp_file.name
229-
230-
try:
231-
# Create DocxTemplate instance
232-
doc_template = DocxTemplate(temp_file_path)
233-
doc_template.init_docx()
234-
235-
# Extract raw XML
236-
raw_xml = doc_template.get_xml()
237-
238-
logger.debug(f"Successfully extracted XML from {filename}: {len(raw_xml)} characters")
239-
return doc_template, raw_xml
240-
241-
finally:
242-
if os.path.exists(temp_file_path):
243-
os.unlink(temp_file_path)
244-
240+
241+
# Create DocxTemplate instance
242+
doc_template = DocxTemplate(temp_file_path)
243+
doc_template.init_docx()
244+
245+
# Extract raw XML
246+
raw_xml = doc_template.get_xml()
247+
248+
logger.debug(f"Successfully extracted XML from {filename}: {len(raw_xml)} characters")
249+
return doc_template, raw_xml, temp_file_path
250+
245251
except Exception as e:
252+
# Clean up the temp file on failure to avoid leaking /tmp files
253+
if temp_file_path and os.path.exists(temp_file_path):
254+
try:
255+
os.unlink(temp_file_path)
256+
except Exception as cleanup_err:
257+
logger.warning(f"Failed to delete temp file {temp_file_path}: {cleanup_err}")
246258
raise DocumentExtractionException(
247259
f"Failed to extract XML from {filename} using docxtpl: {str(e)}"
248260
)
@@ -281,40 +293,46 @@ def _extract_structured_text(self, file_content: bytes, filename: str) -> str:
281293
Returns:
282294
Structured text with proper line breaks
283295
"""
296+
temp_file_path: Optional[str] = None
284297
try:
285-
# Create temporary file
298+
# Create temporary file - must remain on disk while python-docx lazily reads it
286299
with tempfile.NamedTemporaryFile(delete=False, suffix='.docx') as temp_file:
287300
temp_file.write(file_content)
288301
temp_file_path = temp_file.name
289-
290-
try:
291-
doc = Document(temp_file_path)
292-
full_text = []
293-
294-
# Extract paragraph text
295-
for paragraph in doc.paragraphs:
296-
if paragraph.text.strip(): # Skip empty paragraphs
297-
full_text.append(paragraph.text)
298-
299-
# Extract table text
300-
for table in doc.tables:
301-
for row in table.rows:
302-
row_text = []
303-
for cell in row.cells:
304-
row_text.append(cell.text.strip())
302+
303+
doc = Document(temp_file_path)
304+
full_text = []
305+
306+
# Extract paragraph text
307+
for paragraph in doc.paragraphs:
308+
if paragraph.text.strip(): # Skip empty paragraphs
309+
full_text.append(paragraph.text)
310+
311+
# Extract table text
312+
for table in doc.tables:
313+
for row in table.rows:
314+
row_text = []
315+
for cell in row.cells:
316+
text = cell.text.strip()
317+
if text:
318+
row_text.append(text)
319+
if row_text:
305320
full_text.append(' | '.join(row_text))
306-
307-
structured_text = '\n'.join(full_text)
308-
logger.debug(f"Extracted structured text: {len(structured_text)} characters, {len(full_text)} lines")
309-
return structured_text
310-
311-
finally:
312-
if os.path.exists(temp_file_path):
313-
os.unlink(temp_file_path)
314-
321+
322+
structured_text = '\n'.join(full_text)
323+
logger.debug(f"Extracted structured text: {len(structured_text)} characters, {len(full_text)} lines")
324+
return structured_text
325+
315326
except Exception as e:
316327
logger.error(f"Failed to extract structured text from {filename}: {str(e)}")
317328
return ""
329+
finally:
330+
# Clean up AFTER all operations complete
331+
if temp_file_path and os.path.exists(temp_file_path):
332+
try:
333+
os.unlink(temp_file_path)
334+
except Exception as cleanup_err:
335+
logger.warning(f"Failed to delete temp file {temp_file_path}: {cleanup_err}")
318336

319337
def _create_input_data(self, raw_xml: str, processed_xml: str, structured_text: str, filename: str) -> dict:
320338
"""

tests/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
import sys
2+
from pathlib import Path
3+
4+
# Ensure repository root (where `main.py` lives) is importable during pytest collection.
5+
ROOT = Path(__file__).resolve().parents[1]
6+
if str(ROOT) not in sys.path:
7+
sys.path.insert(0, str(ROOT))
8+
9+

0 commit comments

Comments
 (0)