Fix CJK title shortening and add tests

buriy · buriy · commit 11c721d920c6 · 2025-05-04T03:57:01.000+07:00
diff --git a/readability/encoding.py b/readability/encoding.py
@@ -1,9 +1,8 @@
 import re
 try:
-    import cchardet
+    import cchardet as chardet
 except ImportError:
     import chardet
-import sys
 
 
 RE_CHARSET = re.compile(r'<meta.*?charset=["\']*(.+?)["\'>]', flags=re.I)
diff --git a/readability/htmls.py b/readability/htmls.py
@@ -123,8 +123,8 @@ def shorten_title(doc):
                 if (len(p0.split()) >= 4) or (len(p0) >= 4 and cjk.search(p0)):
                     title = p0
                     break
-                elif (len(p1.split()) >= 4) or (len(p1) >= 4 and cjk.search(p1)):
-                    title = p1
+                elif (len(pl.split()) >= 4) or (len(pl) >= 4 and cjk.search(pl)):
+                    title = pl
                     break
         else:
             if ": " in title:
@@ -134,11 +134,12 @@ def shorten_title(doc):
                 else:
                     title = orig.split(": ", 1)[1]
 
-    if cjk.search(title) and not (4 <= len(title) < 100):
-        return orig
+    if cjk.search(title):
+        if not (4 <= len(title) < 100):  # Allow length >= 4, cap at 100
+            return orig
     elif not 15 < len(title) < 150:
         return orig
-    
+
     return title
 
 
diff --git a/readability/readability.py b/readability/readability.py
@@ -42,11 +42,11 @@
     "divToPElementsRe": re.compile(
         r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)", re.I
     ),
-    #'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    #'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
-    #'trimRe': re.compile(r'^\s+|\s+$/'),
-    #'normalizeRe': re.compile(r'\s{2,}/'),
-    #'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
+    # 'replaceBrsRe': re.compile(r'(<br[^>]*>[ \n\r\t]*){2,}',re.I),
+    # 'replaceFontsRe': re.compile(r'<(\/?)font[^>]*>',re.I),
+    # 'trimRe': re.compile(r'^\s+|\s+$/'),
+    # 'normalizeRe': re.compile(r'\s{2,}/'),
+    # 'killBreaksRe': re.compile(r'(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
     "videoRe": re.compile(r"https?:\/\/(www\.)?(youtube|vimeo)\.com", re.I),
     # skipFootnoteLink:      /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i,
 }
diff --git a/tests/test_article_only.py b/tests/test_article_only.py
@@ -149,6 +149,7 @@ def test_utf8_kanji(self):
         sample = load_sample("utf-8-kanji.sample.html")
         doc = Document(sample)
         res = doc.summary()
+        assert 0 < len(res) < 10000
 
     def test_author_present(self):
         sample = load_sample("the-hurricane-rubin-carter-denzel-washington.html")
@@ -180,3 +181,53 @@ def test_keep_images_absent_by_defautl(self):
         doc = Document(sample)
 
         assert "<img" not in doc.summary()
+
+    def test_cjk_summary(self):
+        """Check CJK paragraphs are extracted and unrelated text is dropped."""
+        html = """
+        <html>
+            <head>
+                <title>这是标题</title>
+            </head>
+            <body>
+                <div>一些无关紧要的内容</div>
+                <div class="article-content">
+                    <h1>主要文章标题</h1>
+                    <p>这是主要内容的第一段。</p>
+                    <p>これはコンテンツの第2段落です。</p>
+                    <p>이것은 콘텐츠의 세 번째 단락입니다.</p>
+                    <p>This is the fourth paragraph.</p>
+                </div>
+                <div>More irrelevant stuff</div>
+            </body>
+        </html>
+        """
+        doc = Document(html)
+        summary = doc.summary()
+        # assertIn/assertNotIn show both operands on failure, unlike assertTrue(x in y)
+        self.assertIn("这是主要内容的第一段", summary)
+        self.assertIn("これはコンテンツの第2段落です", summary)
+        self.assertIn("이것은 콘텐츠의 세 번째 단락입니다", summary)
+        # The unrelated leading <div> text must be absent from the summary
+        self.assertNotIn("一些无关紧要的内容", summary)
+
+    def test_shorten_title_delimiter_bug(self):
+        """Verify short_title() picks the last delimiter-separated part when it qualifies.
+
+        Regression guard for the 'p1' vs 'pl' variable mix-up in shorten_title.
+        """
+        expected = "これは長いです"
+        html = """
+        <html>
+            <head>
+                <title>Short Part | これは長いです</title>
+            </head>
+            <body>
+                <div>Content</div>
+            </body>
+        </html>
+        """
+        # Before the fix this call could fail with NameError: name 'p1' is not defined;
+        # with the fix the trailing CJK segment is returned as the short title.
+        actual = Document(html).short_title()
+        self.assertEqual(actual, expected)