Skip to content

Commit 11c721d

Browse files
committed
Fix CJK title handling, add a test
1 parent be72501 commit 11c721d

File tree

4 files changed

+63
-12
lines changed

4 files changed

+63
-12
lines changed

readability/encoding.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import re
22
try:
3-
import cchardet
3+
import cchardet as chardet
44
except ImportError:
55
import chardet
6-
import sys
76

87

98
RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)

readability/htmls.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,8 @@ def shorten_title(doc):
123123
if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
124124
title = p0
125125
break
126-
elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
127-
title = p1
126+
elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
127+
title = pl
128128
break
129129
else:
130130
if ": " in title:
@@ -134,11 +134,12 @@ def shorten_title(doc):
134134
else:
135135
title = orig.split(": ", 1)[1]
136136

137-
if cjk.search(title) and not (4 <= len(title) < 100):
138-
return orig
137+
if cjk.search(title):
138+
if not (4 <= len(title) < 100): # Accept CJK titles of 4 to 99 characters
139+
return orig
139140
elif not 15 < len(title) < 150:
140141
return orig
141-
142+
142143
return title
143144

144145

readability/readability.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,11 @@
4242
"divToPElementsRe": re.compile(
4343
r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
4444
),
45-
#'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
46-
#'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
47-
#'trimRe': re.compile(r'^\s+|\s+$/'),
48-
#'normalizeRe': re.compile(r'\s{2,}/'),
49-
#'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
45+
# 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
46+
# 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
47+
# 'trimRe': re.compile(r'^\s+|\s+$/'),
48+
# 'normalizeRe': re.compile(r'\s{2,}/'),
49+
# 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
5050
"videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
5151
# skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
5252
}

tests/test_article_only.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def test_utf8_kanji(self):
149149
sample = load_sample("utf-8-kanji.sample.html")
150150
doc = Document(sample)
151151
res = doc.summary()
152+
assert 0 < len(res) < 10000
152153

153154
def test_author_present(self):
154155
sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
@@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
180181
doc = Document(sample)
181182

182183
assert "<img" not in doc.summary()
184+
185+
def test_cjk_summary(self):
186+
"""Check we can extract CJK text correctly."""
187+
html = """
188+
<html>
189+
<head>
190+
<title>这是标题</title>
191+
</head>
192+
<body>
193+
<div>一些无关紧要的内容</div>
194+
<div class="article-content">
195+
<h1>主要文章标题</h1>
196+
<p>这是主要内容的第一段。</p>
197+
<p>これはコンテンツの第2段落です。</p>
198+
<p>이것은 콘텐츠의 세 번째 단락입니다.</p>
199+
<p>This is the fourth paragraph.</p>
200+
</div>
201+
<div>More irrelevant stuff</div>
202+
</body>
203+
</html>
204+
"""
205+
doc = Document(html)
206+
summary = doc.summary()
207+
# Check that the main CJK content is present in the summary
208+
self.assertTrue("这是主要内容的第一段" in summary)
209+
self.assertTrue("これはコンテンツの第2段落です" in summary)
210+
self.assertTrue("이것은 콘텐츠의 세 번째 단락입니다" in summary)
211+
# Check that the irrelevant leading content has been removed
212+
self.assertFalse("一些无关紧要的内容" in summary)
213+
214+
def test_shorten_title_delimiter_bug(self):
215+
"""Test that shorten_title handles delimiters correctly when the last part is valid.
216+
217+
This specifically targets a bug where the misspelled name 'p1' was used instead of 'pl'.
218+
"""
219+
html = """
220+
<html>
221+
<head>
222+
<title>Short Part | これは長いです</title>
223+
</head>
224+
<body>
225+
<div>Content</div>
226+
</body>
227+
</html>
228+
"""
229+
doc = Document(html)
230+
# With the bug, this call might raise NameError: name 'p1' is not defined
231+
# With the fix, it should correctly return the last part.
232+
short_title = doc.short_title()
233+
self.assertEqual(short_title, "これは長いです")

0 commit comments

Comments
 (0)