Skip to content

Commit 1a3a6c1

Browse files
committed
optimize extract_text
1 parent c703d49 commit 1a3a6c1

File tree

1 file changed

+16
-0
lines changed

1 file changed

+16
-0
lines changed

inoreader/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,20 @@ def extract_text(html_content):
1818
return html_content
1919

2020
content = html.fromstring(html_content)
21+
for img in content.iter('img'):
22+
img_src = img.get('src')
23+
img_alt = img.get('alt') or img_src
24+
if not img_src:
25+
continue
26+
27+
img.text = '![%s](%s)' % (img_alt, img_src)
28+
29+
for link in content.iter('a'):
30+
url = link.get('href')
31+
text = link.text or url
32+
if not url:
33+
continue
34+
35+
link.text = '[%s](%s)' % (text, url)
36+
2137
return content.text_content().replace('\xa0', '').strip()

0 commit comments

Comments
 (0)