Merge pull request #1776 from myhloli/dev

myhloli · web-flow · commit fddf111f35b1 · 2025-02-25T18:15:35.000+08:00
perf(model): optimize batch analyze process
diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -126,11 +126,35 @@ def detect_language(text):
         return 'empty'
 
 
+# Full-width ASCII variants U+FF01..U+FF5E sit exactly 0xFEE0 above ASCII
+# U+0021..U+007E; the ideographic space U+3000 maps to an ordinary space.
+_FULL_TO_HALF_TABLE = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
+_FULL_TO_HALF_TABLE[0x3000] = 0x20
+
+
+def full_to_half(text: str) -> str:
+    """Convert full-width characters to their half-width equivalents.
+
+    Full-width ASCII variants (U+FF01-U+FF5E) are shifted down to the
+    matching ASCII characters and the ideographic space (U+3000) becomes a
+    regular space; every other character is returned unchanged.
+
+    Args:
+        text: String that may contain full-width characters.
+
+    Returns:
+        A new string with the full-width characters replaced.
+    """
+    # One C-level pass via a precomputed table instead of a per-char loop.
+    return text.translate(_FULL_TO_HALF_TABLE)
+
+
 def merge_para_with_text(para_block):
     block_text = ''
     for line in para_block['lines']:
         for span in line['spans']:
             if span['type'] in [ContentType.Text]:
+                span['content'] = full_to_half(span['content'])
                 block_text += span['content']
     block_lang = detect_lang(block_text)
 
diff --git a/magic_pdf/model/doc_analyze_by_custom_model.py b/magic_pdf/model/doc_analyze_by_custom_model.py
@@ -157,6 +157,7 @@ def doc_analyze(
     )
 
     batch_analyze = False
+    batch_ratio = 1
     device = get_device()
 
     npu_support = False
@@ -181,7 +182,6 @@ def doc_analyze(
                 batch_ratio = 2
 
             logger.info(f'gpu_memory: {gpu_memory} GB, batch_ratio: {batch_ratio}')
-            batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
             batch_analyze = True
 
     model_json = []
@@ -190,24 +190,26 @@ def doc_analyze(
     if batch_analyze:
         # batch analyze
         images = []
+        page_wh_list = []
         for index in range(len(dataset)):
             if start_page_id <= index <= end_page_id:
                 page_data = dataset.get_page(index)
                 img_dict = page_data.get_image()
                 images.append(img_dict['img'])
+                page_wh_list.append((img_dict['width'], img_dict['height']))
+        batch_model = BatchAnalyze(model=custom_model, batch_ratio=batch_ratio)
         analyze_result = batch_model(images)
 
         for index in range(len(dataset)):
-            page_data = dataset.get_page(index)
-            img_dict = page_data.get_image()
-            page_width = img_dict['width']
-            page_height = img_dict['height']
             if start_page_id <= index <= end_page_id:
                 result = analyze_result.pop(0)
+                page_width, page_height = page_wh_list.pop(0)
             else:
                 result = []
+                page_height = 0
+                page_width = 0
 
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
             page_dict = {'layout_dets': result, 'page_info': page_info}
             model_json.append(page_dict)
 
@@ -227,7 +229,7 @@ def doc_analyze(
             else:
                 result = []
 
-            page_info = {'page_no': index, 'height': page_height, 'width': page_width}
+            page_info = {'page_no': index, 'width': page_width, 'height': page_height}
             page_dict = {'layout_dets': result, 'page_info': page_info}
             model_json.append(page_dict)