Skip to content

Commit e680580

Browse files
authored
support office files
support office files
2 parents c0118ee + 32a8b5e commit e680580

File tree

15 files changed

+322
-33
lines changed

15 files changed

+322
-33
lines changed

docker/prepare.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ function install_deps() {
3131
}
3232

3333
function twine_upload() {
34-
twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi
34+
twine upload dist/* -u __token__ -p ${PYPI_PASSWORD} --repository pypi --skip-existing
3535
}
3636

3737

examples/docs/maoxuan_sample.docx

389 Bytes
Binary file not shown.

examples/docs/test.xlsx

10.1 KB
Binary file not shown.

src/bisheng_unstructured/documents/html_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@ def visualize_html(elements, output_file):
4141
text = el.metadata.text_as_html
4242
text = text.replace("\n", " ")
4343
else:
44-
text = f"<p {styles[idx % 2]}>{el.text}</p>"
44+
text = el.text.replace("\n", "<br>")
45+
text = f"<p {styles[idx % 2]}>{text}</p>"
4546
idx += 1
4647

4748
if text:
@@ -57,19 +58,24 @@ def save_to_txt(elements, output_file):
5758
text_elem_sep = "\n"
5859
content_page = []
5960
is_first_elem = True
61+
last_label = ""
6062
for el in elements:
6163
label, text = el.category, el.text
6264
if is_first_elem:
6365
f_text = text + "\n" if label == "Title" else text
6466
content_page.append(f_text)
6567
is_first_elem = False
6668
else:
67-
if label == "Title":
69+
if last_label == "Title" and label == "Title":
70+
content_page.append("\n" + text + "\n")
71+
elif label == "Title":
6872
content_page.append("\n\n" + text + "\n")
6973
elif label == "Table":
7074
content_page.append("\n\n" + text + "\n")
7175
else:
7276
content_page.append(text_elem_sep + text)
7377

78+
last_label = label
79+
7480
with open(output_file, "w") as fout:
7581
fout.write("".join(content_page))

src/bisheng_unstructured/nlp/patterns.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,3 +143,6 @@
143143
# For zh variables
144144
ZH_PUNC_NOT_IN_TITLE_PATTERN = r"[,。、;!|,;!]"
145145
ZH_PUNC_NOT_IN_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_TITLE_PATTERN)
146+
147+
ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN = r"[。;!|;!]"
148+
ZH_PUNC_NOT_IN_PPTX_TITLE_RE = re.compile(ZH_PUNC_NOT_IN_PPTX_TITLE_PATTERN)

src/bisheng_unstructured/partition/docx.py

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from docx.table import Table as DocxTable
99
from docx.text.paragraph import Paragraph
1010
from docx.text.run import Run
11+
from lxml import etree
1112

1213
from bisheng_unstructured.cleaners.core import clean_bullets
1314
from bisheng_unstructured.documents.elements import (
@@ -25,6 +26,7 @@
2526
Title,
2627
process_metadata,
2728
)
29+
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
2830
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
2931
from bisheng_unstructured.partition.common import (
3032
convert_ms_office_table_to_text,
@@ -140,6 +142,7 @@ def partition_docx(
140142

141143
# Verify that only one of the arguments was provided
142144
exactly_one(filename=filename, file=file)
145+
language = kwargs.get("language", "zh")
143146

144147
last_modification_date = None
145148
if filename is not None:
@@ -168,14 +171,16 @@ def partition_docx(
168171
section = 0
169172
is_list = False
170173
for element_item in document.element.body:
174+
# print("---element_item---", element_item, element_item.tag, element_item.xml)
171175
if element_item.tag.endswith("tbl"):
172176
table = document.tables[table_index]
173177
emphasized_texts = _get_emphasized_texts_from_table(table)
174178
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
175179
emphasized_texts,
176180
)
177181
html_table = convert_ms_office_table_to_text(table, as_html=True)
178-
text_table = convert_ms_office_table_to_text(table, as_html=False)
182+
# text_table = convert_ms_office_table_to_text(table, as_html=False)
183+
text_table = transform_html_table_to_md(html_table)["text"]
179184
element = Table(text_table)
180185
if element is not None:
181186
element.metadata = ElementMetadata(
@@ -196,7 +201,7 @@ def partition_docx(
196201
emphasized_text_contents, emphasized_text_tags = _extract_contents_and_tags(
197202
emphasized_texts,
198203
)
199-
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list)
204+
para_element: Optional[Text] = _paragraph_to_element(paragraph, is_list, language)
200205
if para_element is not None:
201206
para_element.metadata = ElementMetadata(
202207
filename=metadata_filename,
@@ -207,6 +212,14 @@ def partition_docx(
207212
)
208213
elements.append(para_element)
209214
is_list = False
215+
# print(
216+
# "---p---",
217+
# emphasized_texts,
218+
# emphasized_text_contents,
219+
# emphasized_text_tags,
220+
# para_element,
221+
# paragraph.style,
222+
# )
210223
elif element_item.tag.endswith("sectPr"):
211224
if len(headers_and_footers) > section:
212225
footers = headers_and_footers[section][1]
@@ -226,26 +239,30 @@ def partition_docx(
226239

227240

228241
def _paragraph_to_element(
229-
paragraph: docx.text.paragraph.Paragraph,
230-
is_list=False,
242+
paragraph: docx.text.paragraph.Paragraph, is_list=False, language="eng"
231243
) -> Optional[Text]:
232244
"""Converts a docx Paragraph object into the appropriate unstructured document element.
233245
If the paragraph style is "Normal" or unknown, we try to predict the element type from the
234246
raw text."""
235247
text = paragraph.text
248+
# normalize the text
249+
text = text.strip("\n")
236250
style_name = paragraph.style and paragraph.style.name # .style can be None
237251

238252
if len(text.strip()) == 0:
239253
return None
240254

255+
if "Heading" in paragraph.style.name:
256+
return Title(text)
257+
241258
element_class = STYLE_TO_ELEMENT_MAPPING.get(style_name)
242259

243260
# NOTE(robinson) - The "Normal" style name will return None since it's in the mapping.
244261
# Unknown style names will also return None
245262
if is_list:
246-
return _text_to_element(text, is_list)
263+
return _text_to_element(text, is_list, language=language)
247264
elif element_class is None:
248-
return _text_to_element(text)
265+
return _text_to_element(text, language=language)
249266
else:
250267
return element_class(text)
251268

@@ -266,7 +283,7 @@ def _element_contains_pagebreak(element) -> bool:
266283
return False
267284

268285

269-
def _text_to_element(text: str, is_list=False) -> Optional[Text]:
286+
def _text_to_element(text: str, is_list=False, language="eng") -> Optional[Text]:
270287
"""Converts raw text into an unstructured Text element."""
271288
if is_bulleted_text(text) or is_list:
272289
clean_text = clean_bullets(text).strip()
@@ -280,8 +297,8 @@ def _text_to_element(text: str, is_list=False) -> Optional[Text]:
280297
return None
281298
elif is_possible_narrative_text(text):
282299
return NarrativeText(text)
283-
elif is_possible_title(text):
284-
return Title(text)
300+
# elif is_possible_title(text, title_max_word_length=20, language=language):
301+
# return Title(text)
285302
else:
286303
return Text(text)
287304

src/bisheng_unstructured/partition/pptx.py

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import json
2+
import re
13
from tempfile import SpooledTemporaryFile
24
from typing import IO, BinaryIO, List, Optional, Union, cast
35

@@ -15,6 +17,7 @@
1517
Title,
1618
process_metadata,
1719
)
20+
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
1821
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
1922
from bisheng_unstructured.partition.common import (
2023
convert_ms_office_table_to_text,
@@ -31,6 +34,10 @@
3134

3235
OPENXML_SCHEMA_NAME = "{http://schemas.openxmlformats.org/drawingml/2006/main}"
3336

37+
RE_MULTLINES = re.compile(pattern=r"\n+", flags=re.DOTALL)
38+
RE_SPACES = re.compile(pattern=r"[ \t\r\f\v]+", flags=re.DOTALL)
39+
RE_NORMAL_SPACES = re.compile(pattern=r"\s+", flags=re.DOTALL)
40+
3441

3542
@process_metadata()
3643
@add_metadata_with_filetype(FileType.PPTX)
@@ -85,6 +92,10 @@ def partition_pptx(
8592
elements: List[Element] = []
8693
metadata = ElementMetadata(filename=metadata_filename or filename)
8794
num_slides = len(presentation.slides)
95+
slide_height = presentation.slide_height
96+
slide_width = presentation.slide_width
97+
page_bbox = [slide_width, slide_height]
98+
sel_i = 30
8899
for i, slide in enumerate(presentation.slides):
89100
metadata = ElementMetadata.from_dict(metadata.to_dict())
90101
metadata.last_modified = metadata_last_modified or last_modification_date
@@ -97,11 +108,16 @@ def partition_pptx(
97108
if notes_text.strip() != "":
98109
elements.append(NarrativeText(text=notes_text, metadata=metadata))
99110

111+
shape_infos = []
112+
shape_index = -1
100113
for shape in _order_shapes(slide.shapes):
114+
shape_index += 1
101115
if shape.has_table:
102116
table: pptx.table.Table = shape.table
103117
html_table = convert_ms_office_table_to_text(table, as_html=True)
104-
text_table = convert_ms_office_table_to_text(table, as_html=False)
118+
# text_table = convert_ms_office_table_to_text(table, as_html=False)
119+
text_table = transform_html_table_to_md(html_table)["text"]
120+
# print('---table---', html_table, text_table)
105121
if (text_table := text_table.strip()) != "":
106122
metadata = ElementMetadata(
107123
filename=metadata_filename or filename,
@@ -113,24 +129,46 @@ def partition_pptx(
113129
continue
114130
if not shape.has_text_frame:
115131
continue
116-
# NOTE(robinson) - avoid processing shapes that are not on the actual slide
117-
# NOTE - skip check if no top or left position (shape displayed top left)
118-
if (shape.top and shape.left) and (shape.top < 0 or shape.left < 0):
132+
133+
bbox = [shape.left, shape.top, shape.width, shape.height]
134+
shape_info = []
135+
shape_infos.append({"runs": shape_info, "bbox": bbox})
136+
metadata = {"bbox": bbox, "page_bbox": page_bbox}
137+
metadata = ElementMetadata(
138+
page_number=i, text_as_html=json.dumps(metadata), page_name="paragraph"
139+
)
140+
141+
TITLE_AREA_THRESHOLD = 0.2
142+
ratio = abs(bbox[3] - bbox[1]) * 1.0 / page_bbox[1]
143+
# print('bbox', bbox, page_bbox, ratio)
144+
145+
is_title = False
146+
text = None
147+
if shape_index == 0 and ratio <= TITLE_AREA_THRESHOLD:
148+
text = re.sub(RE_NORMAL_SPACES, " ", shape.text_frame.text)
149+
is_title = is_possible_title(
150+
text, language="zh", title_max_word_length=30, is_pptx=True
151+
)
152+
153+
if not is_title:
154+
text = shape.text_frame.text.replace("\x0b", "\n")
155+
text = re.sub(RE_MULTLINES, "\n", text).strip()
156+
text = re.sub(RE_SPACES, " ", text)
157+
158+
if text == "":
119159
continue
120-
for paragraph in shape.text_frame.paragraphs:
121-
text = paragraph.text
122-
if text.strip() == "":
123-
continue
124-
if _is_bulleted_paragraph(paragraph):
125-
elements.append(ListItem(text=text, metadata=metadata))
126-
elif is_email_address(text):
127-
elements.append(EmailAddress(text=text))
128-
elif is_possible_narrative_text(text):
129-
elements.append(NarrativeText(text=text, metadata=metadata))
130-
elif is_possible_title(text):
131-
elements.append(Title(text=text, metadata=metadata))
132-
else:
133-
elements.append(Text(text=text, metadata=metadata))
160+
161+
# for paragraph in shape.text_frame.paragraphs:
162+
# print('is_bulleted', _is_bulleted_paragraph(paragraph))
163+
164+
if is_email_address(text):
165+
elements.append(EmailAddress(text=text))
166+
elif is_possible_narrative_text(text):
167+
elements.append(NarrativeText(text=text, metadata=metadata))
168+
elif is_title:
169+
elements.append(Title(text=text, metadata=metadata))
170+
else:
171+
elements.append(Text(text=text, metadata=metadata))
134172

135173
if include_page_breaks and i < num_slides - 1:
136174
elements.append(PageBreak(text=""))

src/bisheng_unstructured/partition/text_type.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
UNICODE_BULLETS_RE,
1919
US_CITY_STATE_ZIP_RE,
2020
US_PHONE_NUMBERS_RE,
21+
ZH_PUNC_NOT_IN_PPTX_TITLE_RE,
2122
ZH_PUNC_NOT_IN_TITLE_RE,
2223
)
2324
from bisheng_unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize
@@ -99,6 +100,7 @@ def is_possible_title(
99100
non_alpha_threshold: float = 0.5,
100101
language: str = "en",
101102
language_checks: bool = False,
103+
is_pptx: bool = False,
102104
) -> bool:
103105
"""Checks to see if the text passes all of the checks for a valid title.
104106
@@ -127,7 +129,8 @@ def is_possible_title(
127129
return False
128130

129131
if language == "zh":
130-
if ZH_PUNC_NOT_IN_TITLE_RE.search(text) is not None:
132+
PUNK_RE = ZH_PUNC_NOT_IN_PPTX_TITLE_RE if is_pptx else ZH_PUNC_NOT_IN_TITLE_RE
133+
if PUNK_RE.search(text) is not None:
131134
return False
132135

133136
title_max_word_length = int(

src/bisheng_unstructured/partition/xlsx.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
Table,
1111
process_metadata,
1212
)
13+
from bisheng_unstructured.documents.markdown import transform_html_table_to_md
1314
from bisheng_unstructured.file_utils.filetype import FileType, add_metadata_with_filetype
1415
from bisheng_unstructured.partition.common import (
1516
exactly_one,
@@ -63,7 +64,9 @@ def partition_xlsx(
6364
for sheet_name, table in sheets.items():
6465
page_number += 1
6566
html_text = table.to_html(index=False, header=include_header, na_rep="")
66-
text = soupparser_fromstring(html_text).text_content()
67+
text = transform_html_table_to_md(html_text)["text"]
68+
69+
# text = soupparser_fromstring(html_text).text_content()
6770

6871
if include_metadata:
6972
metadata = ElementMetadata(

tests/test_doc.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from bisheng_unstructured.documents.html_utils import save_to_txt, visualize_html
2+
from bisheng_unstructured.partition.docx import partition_docx
3+
4+
5+
def test_docx():
6+
filename = "./examples/docs/maoxuan_sample.docx"
7+
elements = partition_docx(filename=filename)
8+
9+
output_file = "./data/maoxuan_sample_docx.html"
10+
output_file2 = "./data/maoxuan_sample_docx.txt"
11+
visualize_html(elements, output_file)
12+
save_to_txt(elements, output_file2)
13+
14+
15+
def test_docx2():
16+
filename = "./examples/docs/handbook-1p.docx"
17+
elements = partition_docx(filename=filename)
18+
19+
output_file = "./data/handbook-1p.html"
20+
visualize_html(elements, output_file)
21+
22+
23+
def test_docx3():
24+
import docx
25+
26+
filename = "./examples/docs/handbook-1p.docx"
27+
output = "./examples/docs/handbook-1p.pdf"
28+
29+
# Open the .docs file
30+
doc = docx.Document(filename)
31+
# Save the file under a .pdf name (python-docx still writes DOCX content; no PDF conversion happens)
32+
doc.save(output)
33+
34+
35+
def test4():
36+
inp = "./examples/docs/handbook-1p.docx"
37+
outp = "./examples/docs/handbook-1p.pdf"
38+
39+
import pypandoc
40+
41+
pypandoc.convert_file(inp, "pdf", outputfile=outp)
42+
43+
44+
def test5():
45+
inp = "./examples/docs/maoxuan_sample.docx"
46+
outp = "./data/maoxuan_sample.pdf"
47+
48+
49+
test_docx()
50+
# test_docx2()
51+
# test_docx3()
52+
# test4()
53+
# test5()

0 commit comments

Comments
 (0)