Skip to content

Commit 3edaf85

Browse files
committed
feat: 增强文档分析功能,添加详细的文本分析和可读性评分
1 parent dbe5c34 commit 3edaf85

File tree

2 files changed

+208
-0
lines changed

2 files changed

+208
-0
lines changed
Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import re
from collections import Counter
from typing import Any, Dict, List, Optional

import markdown
import nltk
import PyPDF2
from bs4 import BeautifulSoup
from docx import Document
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
11+
12+
class DocumentAnalyzer:
    """Analyze raw text and PDF / Word / Markdown / plain-text documents.

    The analysis reports word and sentence counts, the most frequent
    non-stop-words and a Flesch Reading Ease score. Tokenization and the
    stop-word list are English-specific (NLTK ``punkt`` / ``stopwords``).
    """

    # (resource path probed with nltk.data.find, package passed to nltk.download)
    # ``punkt_tab`` is what newer NLTK releases load for tokenizing; ``punkt``
    # is kept for older releases.
    _NLTK_RESOURCES = (
        ('tokenizers/punkt', 'punkt'),
        ('tokenizers/punkt_tab', 'punkt_tab'),
        ('corpora/stopwords', 'stopwords'),
    )

    def __init__(self):
        """Download any NLTK data package the analyzer needs that is missing."""
        for resource_path, package in self._NLTK_RESOURCES:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package)

    def analyze_text(self, text: str) -> Dict[str, Any]:
        """
        Analyze a piece of text.

        Args:
            text: The text to analyze.

        Returns:
            A dictionary with the keys ``word_count``, ``sentence_count``,
            ``unique_words``, ``avg_sentence_length``, ``top_words`` (the ten
            most common non-stop-words mapped to their frequency) and
            ``readability_score``. Punctuation tokens are not counted as words.
        """
        sentences = sent_tokenize(text)
        words = self._tokenize_words(text.lower())

        # Frequency statistics ignore English stop words and any token that
        # is not purely alphanumeric (e.g. contraction fragments like "n't").
        stop_words = set(stopwords.words('english'))
        content_words = [
            word for word in words
            if word.isalnum() and word not in stop_words
        ]
        word_freq = Counter(content_words)

        avg_sentence_length = len(words) / len(sentences) if sentences else 0

        return {
            'word_count': len(words),
            'sentence_count': len(sentences),
            'unique_words': len(word_freq),
            'avg_sentence_length': round(avg_sentence_length, 2),
            'top_words': dict(word_freq.most_common(10)),
            # Reuse the tokens computed above instead of tokenizing twice.
            'readability_score': self._flesch_reading_ease(words, len(sentences)),
        }

    def analyze_document(self, file_path: str) -> Dict[str, Any]:
        """
        Analyze a document file.

        Args:
            file_path: Path of the document to analyze.

        Returns:
            ``{'error': None, 'analysis': <analyze_text result>}`` on success,
            or ``{'error': <message>, 'analysis': None}`` when no text could
            be extracted (unsupported type, unreadable or empty file).
        """
        text = self._extract_text(file_path)
        if not text:
            return {
                'error': 'Failed to extract text from document',
                'analysis': None
            }

        return {
            'error': None,
            'analysis': self.analyze_text(text)
        }

    def _extract_text(self, file_path: str) -> Optional[str]:
        """Extract text from a file, choosing the reader by file extension.

        Returns ``None`` for an unsupported extension or when reading fails.
        """
        try:
            ext = file_path.lower().split('.')[-1]
            extractors = {
                'pdf': self._extract_from_pdf,
                # NOTE(review): python-docx documents support for .docx only;
                # a legacy binary .doc is expected to fail and yield None.
                'doc': self._extract_from_docx,
                'docx': self._extract_from_docx,
                'md': self._extract_from_markdown,
                'txt': self._extract_from_txt,
            }
            extractor = extractors.get(ext)
            if extractor is None:
                return None
            return extractor(file_path)
        except Exception:
            # Deliberately best-effort: analyze_document turns None into an
            # error result instead of propagating reader-specific exceptions.
            return None

    def _extract_from_pdf(self, file_path: str) -> str:
        """Return the concatenated text of every page of a PDF file."""
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # ``or ""`` guards against a page whose extraction yields None.
            return "".join(page.extract_text() or "" for page in reader.pages)

    def _extract_from_docx(self, file_path: str) -> str:
        """Return the paragraphs of a Word document joined by single spaces."""
        doc = Document(file_path)
        return ' '.join(paragraph.text for paragraph in doc.paragraphs)

    def _extract_from_markdown(self, file_path: str) -> str:
        """Render a UTF-8 Markdown file to HTML and return its visible text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            md_text = file.read()
        html = markdown.markdown(md_text)
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text()

    def _extract_from_txt(self, file_path: str) -> str:
        """Return the content of a UTF-8 text file."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    @staticmethod
    def _tokenize_words(text: str) -> List[str]:
        """Tokenize *text* and drop tokens without any letter or digit."""
        return [
            token for token in word_tokenize(text)
            if any(char.isalnum() for char in token)
        ]

    def _calculate_readability(self, text: str) -> float:
        """Return the Flesch Reading Ease score of *text*, clamped to 0-100.

        Syllables are estimated heuristically, so the score is approximate.
        Returns 0.0 when the text contains no words or no sentences.
        """
        return self._flesch_reading_ease(
            self._tokenize_words(text), len(sent_tokenize(text))
        )

    def _flesch_reading_ease(self, words: List[str], sentence_count: int) -> float:
        """Compute the clamped Flesch Reading Ease score from word tokens."""
        if not words or sentence_count <= 0:
            return 0.0

        avg_sentence_length = len(words) / sentence_count
        syllable_count = sum(self._count_syllables(word) for word in words)
        avg_syllables_per_word = syllable_count / len(words)

        # Flesch Reading Ease Score
        score = 206.835 - (1.015 * avg_sentence_length) - (84.6 * avg_syllables_per_word)
        # Float bounds keep the return type a float even when clamped.
        return round(max(0.0, min(100.0, score)), 2)

    def _count_syllables(self, word: str) -> int:
        """Estimate the number of syllables in an English word.

        Counts groups of consecutive vowels (``y`` included), then discounts
        a silent trailing ``e``. Returns 0 for an empty string and at least 1
        for anything else.
        """
        word = word.lower()
        if not word:
            return 0

        vowels = "aeiouy"
        count = 0
        previous_is_vowel = False
        for char in word:
            is_vowel = char in vowels
            if is_vowel and not previous_is_vowel:
                count += 1
            previous_is_vowel = is_vowel

        # A final "e" after a consonant is usually silent ("make"), so undo
        # its count. A final "e" after a vowel ("agree") was never counted on
        # its own, and consonant + "le" ("example", "table") is pronounced.
        ends_with_consonant_e = (
            len(word) > 1 and word.endswith("e") and word[-2] not in vowels
        )
        ends_with_consonant_le = (
            len(word) > 2 and word.endswith("le") and word[-3] not in vowels
        )
        if ends_with_consonant_e and not ends_with_consonant_le:
            count -= 1

        return max(count, 1)

AIDocGenius/tests/test_analyzer.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import pytest
2+
from aidocgenius.analyzer import DocumentAnalyzer
3+
4+
def test_analyze_text():
    """analyze_text reports sane statistics for a three-sentence sample."""
    sample = """
This is a sample text for testing. It contains multiple sentences and words.
We will use this text to test our document analyzer functionality.
The analyzer should be able to count words, sentences, and calculate readability scores.
"""
    report = DocumentAnalyzer().analyze_text(sample)
    top_words = report['top_words']

    assert report['word_count'] > 0
    assert report['sentence_count'] == 3
    assert report['unique_words'] > 0
    assert 0 <= report['readability_score'] <= 100
    assert isinstance(top_words, dict)
    assert len(top_words) <= 10
20+
21+
def test_empty_text():
    """An empty string yields zero for every statistic and no top words."""
    report = DocumentAnalyzer().analyze_text("")

    assert report['word_count'] == 0
    assert report['sentence_count'] == 0
    assert report['unique_words'] == 0
    assert report['readability_score'] == 0
    assert len(report['top_words']) == 0
30+
31+
def test_syllable_counting():
    """The syllable heuristic matches the known counts of common words."""
    counter = DocumentAnalyzer()

    # (word, expected syllable count), checked in this order.
    expectations = (
        ('hello', 2),
        ('world', 1),
        ('beautiful', 3),
        ('example', 3),
        ('test', 1),
    )

    for sample_word, syllables in expectations:
        assert counter._count_syllables(sample_word) == syllables
44+
45+
def test_text_extraction():
    """A path that does not exist produces an error result, not an exception."""
    # Invalid file path: extraction must fail gracefully.
    outcome = DocumentAnalyzer().analyze_document("nonexistent_file.pdf")

    assert outcome['error'] is not None
    assert outcome['analysis'] is None

0 commit comments

Comments
 (0)