@@ -110,24 +110,51 @@ def get_image_id(image_id):
110110 return get_image_id
111111
112112
# Font-size ranges used to guess a heading level for paragraphs that carry no
# heading style. Each row is [min_pt, max_pt): inclusive lower bound, exclusive
# upper bound. The 1-based row position is the heading level, so the largest
# fonts map to level 1 and the smallest to level 6.
title_font_list = [
    [36, 100],
    [26, 36],
    [24, 26],
    [22, 24],
    [18, 22],
    [16, 18],
]


def get_title_level(paragraph: 'Paragraph'):
    """Return the heading level of *paragraph*, or ``None`` if it is not a heading.

    Two strategies are tried in order:

    1. Style name: when the paragraph style name starts with ``Heading``,
       ``TOC 标题`` or ``标题``, those markers are stripped and the remainder is
       parsed as an integer (``"Heading 2"`` -> ``2``). A heading style whose
       remainder is not a number (for example a bare ``"TOC 标题"``) yields
       ``None``; the font-size fallback is not attempted in that case.
    2. Font size: when the paragraph consists of exactly one run with an
       explicit font size, the size in points is looked up in
       ``title_font_list``. Sizes outside every range (below 16pt, or 100pt
       and above) yield ``None``.

    :param paragraph: object exposing ``style`` (with ``name``) and ``runs``
        (each with ``font.size.pt``), as the code below reads them.
    :return: ``int`` heading level, or ``None``.
    """
    try:
        style = paragraph.style
        if style is not None:
            style_name = style.name
            if style_name.startswith(('Heading', 'TOC 标题', '标题')):
                level_text = (style_name
                              .replace("Heading ", '')
                              .replace('TOC 标题', '')
                              .replace('标题', ''))
                return int(level_text)

        runs = paragraph.runs
        if len(runs) != 1:
            return None
        font_size = runs[0].font.size
        if font_size is None:
            # No explicit size on the run, so there is nothing to classify.
            return None
        pt = font_size.pt
        for level, (min_pt, max_pt) in enumerate(title_font_list, start=1):
            if min_pt <= pt < max_pt:
                return level
    except (AttributeError, KeyError, TypeError, ValueError):
        # Best-effort detection: a style without a name, a non-numeric heading
        # suffix, or an otherwise odd paragraph means "not a heading" rather
        # than a failure of the whole conversion.
        return None
    return None
140+
141+
113142class DocSplitHandle (BaseSplitHandle ):
114143 @staticmethod
115144 def paragraph_to_md (paragraph : Paragraph , doc : Document , images_list , get_image_id ):
116145 try :
117- psn = paragraph .style .name
118- if psn .startswith ('Heading' ) or psn .startswith ('TOC 标题' ) or psn .startswith ('标题' ):
119- title = "" .join (["#" for i in range (
120- int (psn .replace ("Heading " , '' ).replace ('TOC 标题' , '' ).replace ('标题' ,
121- '' )))]) + " " + paragraph .text
146+ title_level = get_title_level (paragraph )
147+ if title_level is not None :
148+ title = "" .join (["#" for i in range (title_level )]) + " " + paragraph .text
122149 images = reduce (lambda x , y : [* x , * y ],
123150 [get_paragraph_element_images (e , doc , images_list , get_image_id ) for e in
124151 paragraph ._element ],
125152 [])
126-
127153 if len (images ) > 0 :
128154 return title + '\n ' + images_to_string (images , doc , images_list , get_image_id ) if len (
129155 paragraph .text ) > 0 else images_to_string (images , doc , images_list , get_image_id )
130156 return title
157+
131158 except Exception as e :
132159 traceback .print_exc ()
133160 return paragraph .text
0 commit comments