@@ -47,6 +47,37 @@ def extract_image(self, document, image_id):
4747 )
4848 return cropped
4949
def insert_block_id(self, soup, block_id: BlockId):
    """
    Stamp a rendered fragment with its block ID as a ``data-block-id`` attribute.

    Line and Span blocks are never annotated, and nothing is changed unless
    ``self.add_block_ids`` is truthy. Otherwise the first top-level child of
    ``soup`` that has a non-empty ``name`` receives the attribute. When no
    such child exists but the fragment is not empty, all top-level children
    are moved into a new ``<span>`` that carries the attribute.

    The fragment is modified in place and the same ``soup`` object is returned.
    """
    excluded_types = (BlockTypes.Line, BlockTypes.Span)
    if block_id.block_type in excluded_types or not self.add_block_ids:
        return soup

    # Text nodes have no usable ``name``, so this selects the first real tag.
    first_tag = next(
        (node for node in soup.contents if getattr(node, "name", None)),
        None,
    )
    if first_tag:
        first_tag["data-block-id"] = str(block_id)
        return soup

    if not soup.contents:
        # Empty fragment: there is nothing to annotate or wrap.
        return soup

    # Only tag-less content remains: wrap it so the ID has an element to live on.
    span = soup.new_tag("span")
    span["data-block-id"] = str(block_id)
    # Iterate over a snapshot because extracting nodes mutates ``soup.contents``.
    for node in list(soup.contents):
        span.append(node.extract())
    soup.append(span)
    return soup
80+
5081 def extract_html (self , document , document_output , level = 0 ):
5182 soup = BeautifulSoup (document_output .html , "html.parser" )
5283
@@ -69,22 +100,24 @@ def extract_html(self, document, document_output, level=0):
69100 image = self .extract_image (document , ref_block_id )
70101 image_name = f"{ ref_block_id .to_path ()} .{ settings .OUTPUT_IMAGE_FORMAT .lower ()} "
71102 images [image_name ] = image
72- ref .replace_with (
73- BeautifulSoup (
74- f"<p>{ content } <img src='{ image_name } '></p>" , "html.parser"
75- )
103+ element = BeautifulSoup (
104+ f"<p>{ content } <img src='{ image_name } '></p>" , "html.parser"
76105 )
106+ ref .replace_with (self .insert_block_id (element , ref_block_id ))
77107 else :
78108 # This will be the image description if using llm mode, or empty if not
79- ref .replace_with (BeautifulSoup (f"{ content } " , "html.parser" ))
109+ element = BeautifulSoup (f"{ content } " , "html.parser" )
110+ ref .replace_with (self .insert_block_id (element , ref_block_id ))
80111 elif ref_block_id .block_type in self .page_blocks :
81112 images .update (sub_images )
82113 if self .paginate_output :
83114 content = f"<div class='page' data-page-id='{ ref_block_id .page_id } '>{ content } </div>"
84- ref .replace_with (BeautifulSoup (f"{ content } " , "html.parser" ))
115+ element = BeautifulSoup (f"{ content } " , "html.parser" )
116+ ref .replace_with (self .insert_block_id (element , ref_block_id ))
85117 else :
86118 images .update (sub_images )
87- ref .replace_with (BeautifulSoup (f"{ content } " , "html.parser" ))
119+ element = BeautifulSoup (f"{ content } " , "html.parser" )
120+ ref .replace_with (self .insert_block_id (element , ref_block_id ))
88121
89122 output = str (soup )
90123 if level == 0 :
0 commit comments