Skip to content

Commit b5cd26a

Browse files
authored
Merge pull request #14 from TeamHG-Memex/fix-webpage-tests
Fix webpage tests
2 parents 8696f80 + 6244867 commit b5cd26a

File tree

1 file changed

+36
-10
lines changed

1 file changed

+36
-10
lines changed

tests/test_html_text.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
# -*- coding: utf-8 -*-
2-
import pytest
32
import glob
3+
import os
4+
5+
import six
6+
import pytest
47

58
from html_text import (extract_text, parse_html, cleaned_selector,
69
selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
710

811

12+
ROOT = os.path.dirname(os.path.abspath(__file__))
13+
14+
915
@pytest.fixture(params=[
1016
{'guess_punct_space': True, 'guess_layout': False},
1117
{'guess_punct_space': False, 'guess_layout': False},
@@ -107,6 +113,14 @@ def test_selectors(all_options):
107113
assert selector_to_text(subsel, **all_options) == ''
108114

109115

116+
def test_nbsp():
117+
if six.PY2:
118+
raise pytest.xfail("&nbsp; produces '\xa0' in Python 2, "
119+
"but ' ' in Python 3")
120+
html = "<h1>Foo&nbsp;Bar</h1>"
121+
assert extract_text(html) == "Foo Bar"
122+
123+
110124
def test_guess_layout():
111125
html = (u'<title> title </title><div>text_1.<p>text_2 text_3</p>'
112126
'<p id="demo"></p><ul><li>text_4</li><li>text_5</li></ul>'
@@ -149,12 +163,24 @@ def test_personalize_newlines_sets():
149163
assert text == 'text\n\nmore\n\nand more text\n\nand some more'
150164

151165

152-
def test_webpages():
153-
webpages = sorted(glob.glob('./test_webpages/*.html'))
154-
extracted = sorted(glob.glob('./test_webpages/*.txt'))
155-
for page, extr in zip(webpages, extracted):
156-
with open(page, 'r', encoding='utf8') as f_in:
157-
html = f_in.read()
158-
with open(extr, 'r', encoding='utf8') as f_in:
159-
expected = f_in.read()
160-
assert extract_text(html) == expected
166+
def _webpage_paths():
167+
webpages = sorted(glob.glob(os.path.join(ROOT, 'test_webpages', '*.html')))
168+
extracted = sorted(glob.glob(os.path.join(ROOT, 'test_webpages','*.txt')))
169+
return list(zip(webpages, extracted))
170+
171+
172+
def _load_file(path):
173+
with open(path, 'rb') as f:
174+
return f.read().decode('utf8')
175+
176+
177+
@pytest.mark.parametrize(['page', 'extracted'], _webpage_paths())
178+
def test_webpages(page, extracted):
179+
html = _load_file(page)
180+
if not six.PY3:
181+
# FIXME: &nbsp; produces '\xa0' in Python 2, but ' ' in Python 3
182+
# this difference is ignored in this test.
183+
# What is the correct behavior?
184+
html = html.replace('&nbsp;', ' ')
185+
expected = _load_file(extracted)
186+
assert extract_text(html) == expected

0 commit comments

Comments
 (0)