Skip to content

Commit 7130a24

Browse files
committed
Better splitting
1 parent 65e5616 commit 7130a24

File tree

9 files changed

+44
-53
lines changed

9 files changed

+44
-53
lines changed

app/backend/prepdocslib/blobmanager.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def sourcepage_from_file_page(cls, filename, page=0) -> str:
171171

172172
@classmethod
173173
def blob_image_name_from_file_page(cls, filename, page=0) -> str:
174-
return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".png"
174+
return os.path.splitext(os.path.basename(filename))[0] + f"-{page+1}" + ".png"
175175

176176
@classmethod
177177
def blob_name_from_file_name(cls, filename) -> str:

app/backend/prepdocslib/cu_image.py

Lines changed: 5 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
PATH_ANALYZER_INFERENCE = "/analyzers/{analyzerId}:analyze"
1818
PATH_ANALYZER_INFERENCE_GET_IMAGE = "/analyzers/{analyzerId}/results/{operationId}/images/{imageId}"
1919

20-
analyzer_name = "image_schema_analyzer"
20+
analyzer_name = "image_analyzer"
2121
image_schema = {
2222
"analyzerId": analyzer_name,
2323
"name": "Image understanding",
@@ -27,33 +27,11 @@
2727
"config": {"returnDetails": False},
2828
"fieldSchema": {
2929
"name": "ImageInformation",
30-
"descriptions": "Structured information from images.",
30+
"descriptions": "Description of image.",
3131
"fields": {
32-
"Title": {
32+
"Description": {
3333
"type": "string",
34-
"description": "Title for the image (either taken from the image directly or a good short title based off content)",
35-
},
36-
"ImageType": {
37-
"type": "string",
38-
"description": "The type of image.",
39-
"kind": "classify",
40-
"enum": [
41-
"chart",
42-
"diagram",
43-
"table",
44-
"figure",
45-
"photo",
46-
"screenshot",
47-
"logo",
48-
"icon",
49-
"map",
50-
"infographic",
51-
"other",
52-
],
53-
},
54-
"MarkdownDescription": {
55-
"type": "string",
56-
"description": "Description of the image in markdown format. Start with a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in tabular markdown format, with valid syntax and accurate numbers. If the image is a chart, describe any axis or legends.",
34+
"description": "Description of the image. If the image has a title, start with the title. Include a 2-sentence summary. If the image is a chart, diagram, or table, include the underlying data in an HTML table tag, with accurate numbers. If the image is a chart, describe any axis or legends. The only allowed HTML tags are the table/thead/tr/td/tbody tags.",
5735
},
5836
},
5937
},
@@ -133,4 +111,4 @@ async def poll():
133111

134112
results = await poll()
135113
fields = results["result"]["contents"][0]["fields"]
136-
return f"Title: {fields['Title']['valueString']}\n\nType: {fields['ImageType']['valueString']}\n\nDescription: {fields['MarkdownDescription']['valueString']}"
114+
return fields["DescriptionHTML"]["valueString"]

app/backend/prepdocslib/figure_output.json

Whitespace-only changes.

app/backend/prepdocslib/page.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ class Page:
33
A single page from a document
44
55
Attributes:
6-
page_num (int): Page number
6+
page_num (int): Page number (0-indexed)
77
offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 (the index of "w" in "helloworld")
88
text (str): The text of the page
99
"""
@@ -17,6 +17,10 @@ def __init__(self, page_num: int, offset: int, text: str):
1717
class SplitPage:
1818
"""
1919
A section of a page that has been split into a smaller chunk.
20+
21+
Attributes:
22+
page_num (int): Page number (0-indexed)
23+
text (str): The text of the section
2024
"""
2125

2226
def __init__(self, page_num: int, text: str):

app/backend/prepdocslib/pdfparser.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import html
22
import io
3+
import json
34
import logging
45
from enum import Enum
56
from typing import IO, AsyncGenerator, Union
@@ -8,6 +9,7 @@
89
from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
910
from azure.ai.documentintelligence.models import (
1011
AnalyzeDocumentRequest,
12+
AnalyzeResult,
1113
DocumentFigure,
1214
DocumentTable,
1315
)
@@ -83,19 +85,20 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
8385
poller = await document_intelligence_client.begin_analyze_document(
8486
model_id=self.model_id, analyze_request=content, content_type="application/octet-stream"
8587
)
86-
form_recognizer_results = await poller.result()
88+
form_recognizer_results: AnalyzeResult = await poller.result()
8789

8890
offset = 0
89-
for page_num, page in enumerate(form_recognizer_results.pages):
91+
pages_json = []
92+
for page in form_recognizer_results.pages:
9093
tables_on_page = [
9194
table
9295
for table in (form_recognizer_results.tables or [])
93-
if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
96+
if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number
9497
]
9598
figures_on_page = [
9699
figure
97100
for figure in (form_recognizer_results.figures or [])
98-
if figure.bounding_regions and figure.bounding_regions[0].page_number == page_num + 1
101+
if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number
99102
]
100103

101104
class ObjectType(Enum):
@@ -135,13 +138,26 @@ class ObjectType(Enum):
135138
added_objects.add(mask_char)
136139
elif object_type == ObjectType.FIGURE:
137140
if mask_char not in added_objects:
138-
page_text += await DocumentAnalysisParser.figure_to_html(
141+
figure_html = await DocumentAnalysisParser.figure_to_html(
139142
doc_for_pymupdf, cu_manager, figures_on_page[object_idx]
140143
)
144+
page_text += figure_html
141145
added_objects.add(mask_char)
142-
# TODO: reset page numbers based on the mask
143-
yield Page(page_num=page_num, offset=offset, text=page_text)
146+
# We remove these comments since they are not needed and skew the page numbers
147+
page_text = page_text.replace("<!-- PageBreak -->", "")
148+
# We remove excess newlines at the beginning and end of the page
149+
page_text = page_text.strip()
150+
yield Page(page_num=page.page_number - 1, offset=offset, text=page_text)
151+
# Serialize the page text to a JSON and save it locally
152+
page_json = {
153+
"page_num": page.page_number - 1,
154+
"offset": offset,
155+
"text": page_text,
156+
}
157+
pages_json.append(page_json)
144158
offset += len(page_text)
159+
with open("pages.json", "w") as f:
160+
json.dump(pages_json, f)
145161

146162
@staticmethod
147163
async def figure_to_html(
@@ -158,12 +174,12 @@ async def figure_to_html(
158174
page_number = figure.bounding_regions[0]["pageNumber"] # 1-indexed
159175
cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box)
160176
figure_description = await cu_manager.verbalize_figure(cropped_img)
161-
# TODO: add DI's original figcaption to this caption - figure.caption.content
162-
return f"<figure><figcaption>{figure_description}</figcaption></figure>"
177+
figure_title = (figure.caption and figure.caption.content) or ""
178+
return f"<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>"
163179

164180
@staticmethod
165181
def table_to_html(table: DocumentTable):
166-
table_html = "<table>"
182+
table_html = "<figure><table>"
167183
rows = [
168184
sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
169185
for i in range(table.row_count)
@@ -179,7 +195,7 @@ def table_to_html(table: DocumentTable):
179195
cell_spans += f" rowSpan={cell.row_span}"
180196
table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
181197
table_html += "</tr>"
182-
table_html += "</table>"
198+
table_html += "</table></figure>"
183199
return table_html
184200

185201
@staticmethod

app/backend/prepdocslib/textsplitter.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitP
103103
tokens = bpe.encode(text)
104104
if len(tokens) <= self.max_tokens_per_section:
105105
# Section is already within max tokens, return
106+
print(f"Page {page_num}: {text}")
106107
yield SplitPage(page_num=page_num, text=text)
107108
else:
108109
# Start from the center and try to find the closest sentence ending by spiralling outward.
@@ -192,24 +193,15 @@ def find_page(offset):
192193
section_text = all_text[start:end]
193194
yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
194195

195-
last_table_start = section_text.rfind("<table")
196196
last_figure_start = section_text.rfind("<figure")
197-
if last_table_start > 2 * self.sentence_search_limit and last_table_start > section_text.rfind("</table"):
198-
# If the section ends with an unclosed table, we need to start the next section with the table.
199-
# If table starts inside sentence_search_limit, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
200-
# If last table starts inside section_overlap, keep overlapping
201-
logger.info(
202-
f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}"
203-
)
204-
start = min(end - self.section_overlap, start + last_table_start)
205-
elif last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
197+
if last_figure_start > 2 * self.sentence_search_limit and last_figure_start > section_text.rfind(
206198
"</figure"
207199
):
208200
# If the section ends with an unclosed figure, we need to start the next section with the figure.
201+
start = min(end - self.section_overlap, start + last_figure_start)
209202
logger.info(
210203
f"Section ends with unclosed figure, starting next section with the figure at page {find_page(start)} offset {start} figure start {last_figure_start}"
211204
)
212-
start = min(end - self.section_overlap, start + last_figure_start)
213205
else:
214206
start = end - self.section_overlap
215207

docs/data_ingestion.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull
6969

7070
You may want to remove documents from the index. For example, if you're using the sample data, you may want to remove the documents that are already in the index before adding your own.
7171

72-
To remove all documents, use `scripts/prepdocs.sh --removeall` or `scripts/prepdocs.ps1 --removeall`.
72+
To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/prepdocs.ps1 --removeall`.
7373

7474
You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `./scripts/prepdocs.sh --remove` or `./scripts/prepdocs.ps1 --remove`.
7575

0 commit comments

Comments
 (0)