Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 44 additions & 19 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,33 @@ def embed_font_in_html(font_path, font_name, html_content):
font-family: '{font_name}', Arial, sans-serif;
margin: 0;
padding: 0;
background-color: white;
background-color: #f5f5f5;
line-height: 1.6;
}}
.page {{
position: relative;
width: 8.5in;
min-height: 11in;
margin: 20px auto;
padding: 20px;
padding: 1in;
box-sizing: border-box;
background-color: white;
box-shadow: 0 0 10px rgba(0,0,0,0.1);
box-shadow: 0 2px 8px rgba(0,0,0,0.1);
}}
.paragraph {{
margin: 0;
margin-bottom: 12pt;
padding: 0;
position: relative;
line-height: 1.5;
}}
.paragraph > div {{
line-height: inherit;
}}
.image-container {{
display: inline-block;
position: relative;
vertical-align: middle;
margin: 10px 0;
}}
img {{
max-width: 100%;
Expand All @@ -58,12 +65,13 @@ def embed_font_in_html(font_path, font_name, html_content):
table {{
border-collapse: collapse;
width: 100%;
margin: 10px 0;
margin: 15px 0;
}}
td, th {{
border: 1px solid black;
padding: 8px;
border: 1px solid #ddd;
padding: 10px 12px;
position: relative;
line-height: 1.4;
}}
</style>
"""
Expand Down Expand Up @@ -112,24 +120,28 @@ def get_image_position(element):

def process_paragraph(paragraph, images_dict):
html_content = '<div class="paragraph">'


# Determine text alignment
if paragraph.alignment == WD_ALIGN_PARAGRAPH.CENTER:
html_content += '<div style="text-align: center;">'
elif paragraph.alignment == WD_ALIGN_PARAGRAPH.RIGHT:
html_content += '<div style="text-align: right;">'
else:
html_content += '<div>'


# Check if paragraph is empty
has_content = False

for run in paragraph.runs:
style = []
if run.bold: style.append('font-weight: bold')
if run.italic: style.append('font-style: italic')
if run.underline: style.append('text-decoration: underline')
if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')

drawing_elements = run._element.findall('.//w:drawing',
{'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})

for drawing in drawing_elements:
blip = drawing.find('.//a:blip',
{'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
Expand All @@ -146,11 +158,19 @@ def process_paragraph(paragraph, images_dict):
html_content += f'<div class="image-container">'
html_content += f'<img src="{images_dict[image_rel_id]}" alt="Document Image"/>'
html_content += '</div>'

has_content = True

style_str = '; '.join(style)
if run.text.strip():
# Preserve all text including whitespace for proper spacing
if run.text:
html_content += f'<span style="{style_str}">{run.text}</span>'

if run.text.strip():
has_content = True

# If paragraph is empty, add a line break to preserve spacing
if not has_content:
html_content += '<br/>'

html_content += '</div></div>'
return html_content

Expand All @@ -160,17 +180,21 @@ def process_table(table, images_dict):
html_content += '<tr>'
for cell in row.cells:
html_content += '<td>'
for paragraph in cell.paragraphs:
for para_idx, paragraph in enumerate(cell.paragraphs):
# Add spacing between multiple paragraphs in a cell
if para_idx > 0:
html_content += '<br/>'

for run in paragraph.runs:
style = []
if run.bold: style.append('font-weight: bold')
if run.italic: style.append('font-style: italic')
if run.underline: style.append('text-decoration: underline')
if run.font.size: style.append(f'font-size: {run.font.size.pt}pt')

drawing_elements = run._element.findall('.//w:drawing',
{'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})

for drawing in drawing_elements:
blip = drawing.find('.//a:blip',
{'a': 'http://schemas.openxmlformats.org/drawingml/2006/main'})
Expand All @@ -180,9 +204,10 @@ def process_table(table, images_dict):
html_content += f'<div class="image-container">'
html_content += f'<img src="{images_dict[image_rel_id]}" alt="Table Cell Image"/>'
html_content += '</div>'

style_str = '; '.join(style)
if run.text.strip():
# Preserve text with proper spacing
if run.text:
html_content += f'<span style="{style_str}">{run.text}</span>'
html_content += '</td>'
html_content += '</tr>'
Expand Down