Skip to content

Commit fddf111

Browse files
authored
Merge pull request #1776 from myhloli/dev
perf(model): optimize batch analyze process
2 parents b22e87c + 315adbc commit fddf111

File tree

2 files changed

+33
-7
lines changed

2 files changed

+33
-7
lines changed

magic_pdf/dict2md/ocr_mkcontent.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,11 +126,35 @@ def detect_language(text):
126126
return 'empty'
127127

128128

129+
# Translation table built once at import time instead of re-deriving the
# mapping for every character of every call:
#   * full-width ASCII variants U+FF01..U+FF5E sit exactly 0xFEE0 code points
#     above their ASCII counterparts U+0021..U+007E;
#   * the ideographic (full-width) space U+3000 maps to a regular space.
_FULL_TO_HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
_FULL_TO_HALF_TABLE[0x3000] = 0x20


def full_to_half(text: str) -> str:
    """Convert full-width characters to their half-width equivalents.

    Args:
        text: String that may contain full-width characters.

    Returns:
        A new string in which every full-width ASCII variant
        (U+FF01-U+FF5E) is replaced by the matching ASCII character and
        every full-width space (U+3000) is replaced by ' '. All other
        characters are returned unchanged.
    """
    # str.translate performs the whole substitution in a single pass and
    # leaves any code point missing from the table untouched.
    return text.translate(_FULL_TO_HALF_TABLE)
150+
151+
129152
def merge_para_with_text(para_block):
130153
block_text = ''
131154
for line in para_block['lines']:
132155
for span in line['spans']:
133156
if span['type'] in [ContentType.Text]:
157+
span['content'] = full_to_half(span['content'])
134158
block_text += span['content']
135159
block_lang = detect_lang(block_text)
136160

magic_pdf/model/doc_analyze_by_custom_model.py

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ def doc_analyze(
157157
)
158158

159159
batch_analyze = False
160+
batch_ratio = 1
160161
device = get_device()
161162

162163
npu_support = False
@@ -181,7 +182,6 @@ def doc_analyze(
181182
batch_ratio = 2
182183

183184
logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
184-
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
185185
batch_analyze = True
186186

187187
model_json = []
@@ -190,24 +190,26 @@ def doc_analyze(
190190
if batch_analyze:
191191
# batch analyze
192192
images = []
193+
page_wh_list = []
193194
for index in range(len(dataset)):
194195
if start_page_id <= index <= end_page_id:
195196
page_data = dataset.get_page(index)
196197
img_dict = page_data.get_image()
197198
images.append(img_dict['img'])
199+
page_wh_list.append((img_dict['width'], img_dict['height']))
200+
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
198201
analyze_result = batch_model(images)
199202

200203
for index in range(len(dataset)):
201-
page_data = dataset.get_page(index)
202-
img_dict = page_data.get_image()
203-
page_width = img_dict['width']
204-
page_height = img_dict['height']
205204
if start_page_id <= index <= end_page_id:
206205
result = analyze_result.pop(0)
206+
page_width, page_height = page_wh_list.pop(0)
207207
else:
208208
result = []
209+
page_height = 0
210+
page_width = 0
209211

210-
page_info = {'page_no': index, 'height': page_height, 'width': page_width}
212+
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
211213
page_dict = {'layout_dets': result, 'page_info': page_info}
212214
model_json.append(page_dict)
213215

@@ -227,7 +229,7 @@ def doc_analyze(
227229
else:
228230
result = []
229231

230-
page_info = {'page_no': index, 'height': page_height, 'width': page_width}
232+
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
231233
page_dict = {'layout_dets': result, 'page_info': page_info}
232234
model_json.append(page_dict)
233235

0 commit comments

Comments
 (0)