1+ import json
2+ import re
13from tempfile import SpooledTemporaryFile
24from typing import IO , BinaryIO , List , Optional , Union , cast
35
1517 Title ,
1618 process_metadata ,
1719)
20+ from bisheng_unstructured .documents .markdown import transform_html_table_to_md
1821from bisheng_unstructured .file_utils .filetype import FileType , add_metadata_with_filetype
1922from bisheng_unstructured .partition .common import (
2023 convert_ms_office_table_to_text ,
3134
3235OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
3336
# Pre-compiled whitespace normalisers used when cleaning slide text.
# None of the patterns contain ".", so re.DOTALL would have no effect and is
# intentionally not passed.

# One or more consecutive newlines (collapsed to a single newline by callers).
RE_MULTLINES = re.compile(r"\n+")
# Runs of horizontal/other whitespace, deliberately excluding "\n" so that
# line structure survives the substitution.
RE_SPACES = re.compile(r"[ \t\r\f\v]+")
# Runs of any whitespace, newlines included (used to flatten text to one line).
RE_NORMAL_SPACES = re.compile(r"\s+")
40+
3441
3542@process_metadata ()
3643@add_metadata_with_filetype (FileType .PPTX )
@@ -85,6 +92,10 @@ def partition_pptx(
8592 elements : List [Element ] = []
8693 metadata = ElementMetadata (filename = metadata_filename or filename )
8794 num_slides = len (presentation .slides )
95+ slide_height = presentation .slide_height
96+ slide_width = presentation .slide_width
97+ page_bbox = [slide_width , slide_height ]
98+ sel_i = 30
8899 for i , slide in enumerate (presentation .slides ):
89100 metadata = ElementMetadata .from_dict (metadata .to_dict ())
90101 metadata .last_modified = metadata_last_modified or last_modification_date
@@ -97,11 +108,16 @@ def partition_pptx(
97108 if notes_text .strip () != "" :
98109 elements .append (NarrativeText (text = notes_text , metadata = metadata ))
99110
111+ shape_infos = []
112+ shape_index = - 1
100113 for shape in _order_shapes (slide .shapes ):
114+ shape_index += 1
101115 if shape .has_table :
102116 table : pptx .table .Table = shape .table
103117 html_table = convert_ms_office_table_to_text (table , as_html = True )
104- text_table = convert_ms_office_table_to_text (table , as_html = False )
118+ # text_table = convert_ms_office_table_to_text(table, as_html=False)
119+ text_table = transform_html_table_to_md (html_table )["text" ]
120+ # print('---table---', html_table, text_table)
105121 if (text_table := text_table .strip ()) != "" :
106122 metadata = ElementMetadata (
107123 filename = metadata_filename or filename ,
@@ -113,24 +129,46 @@ def partition_pptx(
113129 continue
114130 if not shape .has_text_frame :
115131 continue
116- # NOTE(robinson) - avoid processing shapes that are not on the actual slide
117- # NOTE - skip check if no top or left position (shape displayed top left)
118- if (shape .top and shape .left ) and (shape .top < 0 or shape .left < 0 ):
132+
133+ bbox = [shape .left , shape .top , shape .width , shape .height ]
134+ shape_info = []
135+ shape_infos .append ({"runs" : shape_info , "bbox" : bbox })
136+ metadata = {"bbox" : bbox , "page_bbox" : page_bbox }
137+ metadata = ElementMetadata (
138+ page_number = i , text_as_html = json .dumps (metadata ), page_name = "paragraph"
139+ )
140+
141+ TITLE_AREA_THRESHOLD = 0.2
142+ ratio = abs (bbox [3 ] - bbox [1 ]) * 1.0 / page_bbox [1 ]
143+ # print('bbox', bbox, page_bbox, ratio)
144+
145+ is_title = False
146+ text = None
147+ if shape_index == 0 and ratio <= TITLE_AREA_THRESHOLD :
148+ text = re .sub (RE_NORMAL_SPACES , " " , shape .text_frame .text )
149+ is_title = is_possible_title (
150+ text , language = "zh" , title_max_word_length = 30 , is_pptx = True
151+ )
152+
153+ if not is_title :
154+ text = shape .text_frame .text .replace ("\x0b " , "\n " )
155+ text = re .sub (RE_MULTLINES , "\n " , text ).strip ()
156+ text = re .sub (RE_SPACES , " " , text )
157+
158+ if text == "" :
119159 continue
120- for paragraph in shape .text_frame .paragraphs :
121- text = paragraph .text
122- if text .strip () == "" :
123- continue
124- if _is_bulleted_paragraph (paragraph ):
125- elements .append (ListItem (text = text , metadata = metadata ))
126- elif is_email_address (text ):
127- elements .append (EmailAddress (text = text ))
128- elif is_possible_narrative_text (text ):
129- elements .append (NarrativeText (text = text , metadata = metadata ))
130- elif is_possible_title (text ):
131- elements .append (Title (text = text , metadata = metadata ))
132- else :
133- elements .append (Text (text = text , metadata = metadata ))
160+
161+ # for paragraph in shape.text_frame.paragraphs:
162+ # print('is_bulleted', _is_bulleted_paragraph(paragraph))
163+
164+ if is_email_address (text ):
165+ elements .append (EmailAddress (text = text ))
166+ elif is_possible_narrative_text (text ):
167+ elements .append (NarrativeText (text = text , metadata = metadata ))
168+ elif is_title :
169+ elements .append (Title (text = text , metadata = metadata ))
170+ else :
171+ elements .append (Text (text = text , metadata = metadata ))
134172
135173 if include_page_breaks and i < num_slides - 1 :
136174 elements .append (PageBreak (text = "" ))
0 commit comments