-
Notifications
You must be signed in to change notification settings - Fork 683
Description
Original issue: - #3653:
My question:
Hi @JorjMcKie, I have reviewed your recovered-lines script. My question is how to use it: does it edit the PDF so that it contains the recovered lines, or do we need to call the recovered-lines function after reading the PDF? I need to recover the lines and make them a single block in the PDF, so that I can later save all the properties in a data frame. Below is the code that builds my data frame. Please help with saving the block text as recovered lines with the HTML tags applied. Could you please help?
# Step 5: Group by block_id and concatenate HTML-formatted text
#
# Builds `rows_with_html`, one tuple per text block:
#     (page_num, block_id, html_text, original_text)
# `block_dict` maps a page number to a list of block dicts, and `tag` maps a
# rounded font size to an HTML tag name; both are defined outside this
# fragment.
# NOTE(review): the pasted source had lost its indentation; the nesting below
# is reconstructed from the data flow -- confirm against the original script.

# Imported locally because the file's import section is not part of this
# fragment.
from html import escape as html_escape

rows_with_html = []
for page_num, blocks in block_dict.items():
    for block in blocks:
        if block['type'] != 0:  # Only text blocks
            continue

        block_id = block['number']
        block_text = []     # HTML fragments, one per non-empty span
        original_text = []  # Raw span texts, before unidecode / markup

        for line in block['lines']:
            for span in line['spans']:
                text = span['text'].strip().replace('\n', '').replace('\r', '')
                if not text.replace(" ", ""):
                    continue  # Nothing visible in this span

                font_size = span['size']
                is_bold = "bold" in span['font'].lower()

                # Validate and format the color value as '#rrggbb'.
                color = span["color"]
                if isinstance(color, int):
                    font_color = f'#{color:06x}'  # Ensure it's a 6-digit hex
                elif isinstance(color, tuple) and len(color) >= 3:
                    font_color = f'#{color[0]:02x}{color[1]:02x}{color[2]:02x}'
                else:
                    font_color = '#000000'  # Fallback to black if invalid
                # Out-of-range ints (negative or > 0xFFFFFF) and out-of-range
                # tuple components do not format to exactly 7 characters.
                if len(font_color) != 7 or not font_color.startswith('#'):
                    font_color = '#000000'

                # Pick the tag: large text is always <h1>; any other heading
                # tag from `tag` becomes <h2> when bold and <h3> otherwise.
                tag_for_text = tag.get(round(font_size), 'span')
                if font_size > 14:
                    tag_for_text = 'h1'
                elif tag_for_text.startswith('h'):
                    tag_for_text = 'h2' if is_bold else 'h3'

                original_text.append(text)

                # Escape AFTER unidecode so that any '<', '>' or '&' present
                # in the final text (including characters unidecode may
                # introduce while transliterating) cannot be read as markup
                # or fool the "<h" heading check below.
                text = html_escape(unidecode(text), quote=False)

                # NOTE(review): an earlier comment said bold was meant only
                # for non-headings, but the code wraps every bold span; that
                # behavior is kept.
                if is_bold:
                    text = f"<b>{text}</b>"

                if tag_for_text != 'p':
                    block_text.append(
                        f"<{tag_for_text} style='display:inline; color:{font_color};'>"
                        f"{text}</{tag_for_text}>\n"
                    )
                else:
                    # Plain paragraph text; the <p> wrapper is added per block.
                    block_text.append(text)

        joined_html = ' '.join(block_text)
        if block_text and block_text[0].startswith('<h'):
            # Block opens with a heading: keep it unwrapped and add an empty
            # paragraph after it.
            html_text = joined_html + "<p></p>"
        else:
            html_text = f"<p>{joined_html}</p>"
        rows_with_html.append(
            (page_num, block_id, html_text, ' '.join(original_text))
        )
Create the final DataFrame
grouped_df = pd.DataFrame(rows_with_html, columns=['page_num', 'block_id', 'text', 'originalText'])
grouped_df.to_excel('test.xlsx')
return grouped_df