Add demo_zh.png, HTML output option, and LaTeX code postprocessing

PrinceVictor · PrinceVictor · commit 1f60d91f2414 · 2024-07-25T20:42:24.000+08:00
diff --git a/demo/demo.py b/demo/demo.py
@@ -1,3 +1,4 @@
+import time
 import torch
 import argparse
 
@@ -9,23 +10,38 @@ def parse_config():
     parser = argparse.ArgumentParser(description='arg parser')
     parser.add_argument('--image_path', type=str, default='demo.png', help='data path for table image')
     parser.add_argument('--ckpt_path', type=str, default='', help='ckpt path for table model')
+    parser.add_argument('--cpu', action='store_true', default=False, help='using cpu for inference')
+    parser.add_argument('--html', action='store_true', default=False, help='output html format table code')
     args = parser.parse_args()
     return args
 
 def main():
     args = parse_config()
+    if args.html:
+        from pypandoc import convert_text
 
     # build model
-    model = build_model(args.ckpt_path, max_new_tokens=4096, max_time=120)
+    model = build_model(args.ckpt_path, max_new_tokens=4096, max_time=120, use_gpu=(not args.cpu))
+    if not args.cpu:
+        model = model.cuda()
 
     # model inference
     raw_image = Image.open(args.image_path)
+    
+    start_time = time.time()
     with torch.no_grad():
         output = model(raw_image)
 
     # show output latex code of table
+    cost_time = time.time() - start_time
+    print(f"total cost time: {cost_time:.2f}s")
     for i, latex_code in enumerate(output):
-        print(f"Table {i}:\n{latex_code}")
+        if args.html:
+            html_code = convert_text(latex_code, 'html', format='latex')
+            print(f"Table {i} HTML code:\n{html_code}")
+        else:
+            print(f"Table {i} LaTex code:\n{latex_code}")
+
 
 if __name__ == '__main__':
     main()
diff --git a/demo/demo_zh.png b/demo/demo_zh.png
diff --git a/struct_eqtable/model.py b/struct_eqtable/model.py
@@ -1,24 +1,33 @@
+import re
 import torch
 
 from torch import nn
 from transformers import AutoModelForVision2Seq, AutoProcessor
 
 
 class StructTable(nn.Module):
-    def __init__(self, model_path, max_new_tokens=2048, max_time=60):
+    def __init__(self, model_path, max_new_tokens=2048, max_time=60, use_gpu=True):
         super().__init__()
         self.model_path = model_path
         self.max_new_tokens = max_new_tokens
         self.max_generate_time = max_time
+        self.use_gpu = use_gpu
 
         # init model and image processor from ckpt path
         self.init_image_processor(model_path)
         self.init_model(model_path)
-    
+
+        self.special_str_list = ['\\midrule', '\\hline']
+
+    def postprocess_latex_code(self, code):
+        for special_str in self.special_str_list:
+            code = code.replace(special_str, special_str + ' ')
+        return code
+
     def init_model(self, model_path):
         self.model = AutoModelForVision2Seq.from_pretrained(model_path)
         self.model.eval()
-    
+
     def init_image_processor(self, image_processor_path):
         self.data_processor = AutoProcessor.from_pretrained(image_processor_path)
 
@@ -28,6 +37,9 @@ def forward(self, image):
             images=image,
             return_tensors='pt',
         )
+        if self.use_gpu:
+            for k, v in image_tokens.items():
+                image_tokens[k] = v.cuda()
 
         # generate text from image tokens
         model_output = self.model.generate(
@@ -37,5 +49,8 @@ def forward(self, image):
             max_time=self.max_generate_time
         )
         latex_codes = self.data_processor.batch_decode(model_output, skip_special_tokens=True)
+        # postprocess
+        for i, code in enumerate(latex_codes):
+            latex_codes[i] = self.postprocess_latex_code(code)
 
         return latex_codes