Skip to content

Commit 1cc86c1

Browse files
committed
初始化仓库文件
1 parent d7996e2 commit 1cc86c1

File tree

17 files changed

+17184
-0
lines changed

17 files changed

+17184
-0
lines changed

OCR.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
import wcocr
2+
import os
3+
from docx import Document
4+
from docx.shared import Pt
5+
from docx.oxml.ns import qn
6+
from colorama import init, Fore, Style
7+
8+
def find_wechat_path():
    """Locate the bundled ``path`` directory that sits next to this script.

    Returns:
        The absolute path of the ``path`` directory, or ``None`` (after
        printing a diagnostic) when no such directory exists.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    common_paths = os.path.join(script_dir, 'path')
    # Require a real directory: a stray regular file named "path" would pass
    # an exists() check but cannot serve as the runtime folder.
    if os.path.isdir(common_paths):
        return common_paths
    print(f"The path folder does not exist at {common_paths}.")
    return None
16+
17+
def find_wechatocr_exe():
    """Locate the bundled ``WeChatOCR.exe`` relative to this script.

    The executable is looked up at ``path/WeChatOCR/WeChatOCR.exe`` beside
    the script file.

    Returns:
        The absolute path of the executable, or ``None`` (after printing a
        diagnostic) when that file is not present.
    """
    here = os.path.abspath(__file__)
    wechatocr_path = os.path.join(
        os.path.dirname(here), 'path', 'WeChatOCR', 'WeChatOCR.exe'
    )
    # Guard clause: report the missing executable and bail out early.
    if not os.path.isfile(wechatocr_path):
        print(f"The WeChatOCR.exe does not exist at {wechatocr_path}.")
        return None
    return wechatocr_path
25+
26+
def wechat_ocr(image_path):
    """Run the WeChat OCR engine on one image and collect its text entries.

    Args:
        image_path: Path of the image file passed to ``wcocr.ocr``.

    Returns:
        A list with the ``'text'`` value of every entry in the result's
        ``'ocr_response'`` list (bytes values are decoded as UTF-8). The list
        is empty when the bundled runtime cannot be located or when the
        result carries no ``'ocr_response'`` entries.
    """
    wechat_path = find_wechat_path()
    wechatocr_path = find_wechatocr_exe()
    if not wechat_path or not wechatocr_path:
        # The finder functions have already printed what is missing.
        return []

    # NOTE(review): the engine is initialised on every call; whether wcocr
    # tolerates or needs repeated init is not visible here - confirm.
    wcocr.init(wechatocr_path, wechat_path)
    result = wcocr.ocr(image_path)

    # Presumably a failed recognition can come back without an
    # 'ocr_response' list (TODO confirm against wcocr). Treat that as
    # "no text" so one bad image does not abort the whole batch.
    try:
        entries = result['ocr_response']
    except (KeyError, TypeError):
        return []

    texts = []
    for entry in entries or []:
        text = entry['text']
        if isinstance(text, bytes):
            # Undecodable bytes are dropped rather than raising.
            text = text.decode('utf-8', errors='ignore')
        texts.append(text)

    return texts
43+
44+
def save_to_docx(texts, output_path):
    """Write each text entry as its own paragraph in a new .docx file.

    Every paragraph holds a single run formatted as SimSun ('宋体') at
    10.5 pt.

    Args:
        texts: Iterable of strings, one per paragraph.
        output_path: Destination path handed to ``Document.save``.
    """
    font_name = '宋体'
    font_size = Pt(10.5)  # 10.5 pt, the Chinese "size five" font size

    document = Document()
    for line in texts:
        run = document.add_paragraph().add_run(line)
        run.font.size = font_size
        run.font.name = font_name
        # The East Asian font attribute must be set as well, otherwise
        # Chinese characters would not pick up the font chosen above.
        run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)

    document.save(output_path)
61+
62+
def process_all_images():
    """OCR every supported image under ``src`` and save results under ``docx``.

    The ``src`` folder beside this script is walked recursively. Each image
    is passed to ``wechat_ocr`` and the resulting text is written with
    ``save_to_docx`` to ``<stem>_OCR.docx`` in the matching sub-folder of
    ``docx``. Folders and files are visited in sorted order so repeated runs
    behave the same way.

    When two images in one folder share a stem (``a.jpg`` and ``a.png``),
    the later one is written to ``<stem>_<ext>_OCR.docx`` instead of
    overwriting the first result.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    src_folder = os.path.join(script_dir, 'src')
    docx_folder = os.path.join(script_dir, 'docx')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(docx_folder, exist_ok=True)

    # Supported image formats ('.tiff' is the long spelling of '.tif').
    image_extensions = ('.jpg', '.jpeg', '.png', '.bmp', '.tif', '.tiff')

    # Output paths already produced in this run, normalised so that
    # case-insensitive filesystems detect collisions too.
    used_outputs = set()

    # Walk the src folder and all of its sub-folders.
    for root, dirs, files in os.walk(src_folder):
        dirs.sort()  # in-place sort makes os.walk descend deterministically
        relative_path = os.path.relpath(root, src_folder)
        docx_folder_path = os.path.join(docx_folder, relative_path)

        for file in sorted(files):
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)

            # Create the mirrored output folder only once an image needs it.
            os.makedirs(docx_folder_path, exist_ok=True)

            # Process the image file.
            print(Fore.GREEN + f"正在处理: {os.path.relpath(image_path, script_dir)}" + Style.RESET_ALL)
            texts = wechat_ocr(image_path)

            image_name, extension = os.path.splitext(file)
            output_docx = os.path.join(docx_folder_path, f'{image_name}_OCR.docx')
            if os.path.normcase(output_docx) in used_outputs:
                # Same stem as an earlier image: add the source extension so
                # the earlier result is not silently overwritten.
                image_name = f'{image_name}_{extension[1:]}'
                output_docx = os.path.join(docx_folder_path, f'{image_name}_OCR.docx')
            used_outputs.add(os.path.normcase(output_docx))

            save_to_docx(texts, output_docx)

            # Report the output location relative to the script folder.
            relative_docx_path = os.path.relpath(output_docx, script_dir)
            print(f"OCR 结果已保存到: {relative_docx_path}\n")
94+
95+
if __name__ == '__main__':
    # Script entry point: enable colorama, run the batch, then wait for
    # Enter so a console window opened by double-click stays visible.
    init(autoreset=True)
    process_all_images()
    farewell = Fore.RED + "全部文件处理完成,请按 Enter 键退出……" + Style.RESET_ALL
    print(farewell)
    input()

README.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,12 @@
11
# WeChatOCR
22
这是一个采用Python调用微信OCR功能,进行批处理图片OCR的代码。
3+
4+
首先非常感谢swigger,52PJ的FeiyuYip,nulptr以及其他对此做出贡献的朋友。
5+
6+
基于他们的工作,改动如下:
7+
1. 将WeChatOCR.exe做了本地化,不再依赖微信的安装路径。
8+
2. 将图片处理的格式多样化,增加了jpg,jpeg,bmp,tif格式的处理,只需要将文件放入src文件夹中即可。
9+
3. OCR的处理结果将以docx格式保存到docx文件夹中。
10+
11+
关于源文件的问题:
12+
我感觉WeChatOCR对png格式的处理能力比较好,所以建议将图片格式转换为png以后再做OCR处理。
9.13 MB
Binary file not shown.
1.53 MB
Binary file not shown.
2.7 MB
Binary file not shown.
7.29 MB
Binary file not shown.

0 commit comments

Comments
 (0)