Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions magic_pdf/dict2md/ocr_mkcontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,11 +126,35 @@ def detect_language(text):
return 'empty'


# Translation table built once at import time. The full-width ASCII variants
# U+FF01..U+FF5E sit at a constant offset of 0xFEE0 above their ASCII
# counterparts U+0021..U+007E; the full-width (ideographic) space U+3000 is
# the one exception and maps to an ordinary space.
_FULL_TO_HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
_FULL_TO_HALF_TABLE[0x3000] = 0x20


def full_to_half(text: str) -> str:
    """Convert full-width characters to half-width characters.

    Characters in the range U+FF01..U+FF5E are shifted down by 0xFEE0 into
    the ASCII range, and U+3000 is replaced by a plain space. Every other
    character is returned unchanged.

    Args:
        text: String that may contain full-width characters.

    Returns:
        A new string with full-width characters converted to half-width.
    """
    # str.translate performs the whole mapping in a single pass instead of a
    # Python-level loop that appends one character at a time.
    return text.translate(_FULL_TO_HALF_TABLE)


def merge_para_with_text(para_block):
block_text = ''
for line in para_block['lines']:
for span in line['spans']:
if span['type'] in [ContentType.Text]:
span['content'] = full_to_half(span['content'])
block_text += span['content']
block_lang = detect_lang(block_text)

Expand Down
16 changes: 9 additions & 7 deletions magic_pdf/model/doc_analyze_by_custom_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def doc_analyze(
)

batch_analyze = False
batch_ratio = 1
device = get_device()

npu_support = False
Expand All @@ -181,7 +182,6 @@ def doc_analyze(
batch_ratio = 2

logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
batch_analyze = True

model_json = []
Expand All @@ -190,24 +190,26 @@ def doc_analyze(
if batch_analyze:
# batch analyze
images = []
page_wh_list = []
for index in range(len(dataset)):
if start_page_id <= index <= end_page_id:
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
images.append(img_dict['img'])
page_wh_list.append((img_dict['width'], img_dict['height']))
batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
analyze_result = batch_model(images)

for index in range(len(dataset)):
page_data = dataset.get_page(index)
img_dict = page_data.get_image()
page_width = img_dict['width']
page_height = img_dict['height']
if start_page_id <= index <= end_page_id:
result = analyze_result.pop(0)
page_width, page_height = page_wh_list.pop(0)
else:
result = []
page_height = 0
page_width = 0

page_info = {'page_no': index, 'height': page_height, 'width': page_width}
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)

Expand All @@ -227,7 +229,7 @@ def doc_analyze(
else:
result = []

page_info = {'page_no': index, 'height': page_height, 'width': page_width}
page_info = {'page_no': index, 'width': page_width, 'height': page_height}
page_dict = {'layout_dets': result, 'page_info': page_info}
model_json.append(page_dict)

Expand Down