Skip to content

Commit 59b99ff

Browse files
authored
Merge pull request #105 from pypt/many_repeated_spaces_timeout
Trim many repeated spaces to make clean() faster
2 parents dca6e21 + 494b19e commit 59b99ff

File tree

5 files changed

+27
-3
lines changed

5 files changed

+27
-3
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ before_install:
2424

2525
install:
2626
- travis_retry pip install -U pip wheel tox
27-
- travis_retry pip install -U -r requirements.txt -e .
27+
- travis_retry pip install -U -r requirements.txt -e ".[test]"
2828

2929
script:
3030
- tox -e $TOX_ENV

readability/readability.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,9 @@ def to_int(x):
5454

5555

5656
def clean(text):
    """Normalise the whitespace in ``text`` and strip both ends.

    The substitutions are applied in this order:

    1. Any run of 255 or more whitespace characters is replaced by exactly
       255 spaces.
    2. A newline together with the whitespace around it becomes a single
       newline.
    3. Each tab becomes a space; otherwise a run of two or more spaces/tabs
       becomes a single space.

    Returns the resulting string with leading and trailing whitespace removed.
    """
    # Cap very long whitespace runs first: on a huge run the patterns below
    # backtrack heavily and take an extremely long time to finish.
    text = re.sub(r'\s{255,}', ' ' * 255, text)

    # Raw strings keep ``\s`` from being read as an (invalid) string escape.
    text = re.sub(r'\s*\n\s*', '\n', text)
    text = re.sub(r'\t|[ \t]{2,}', ' ', text)
    return text.strip()

setup.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,14 @@
1616
print("Using lxml<2.4")
1717
lxml_requirement = "lxml<2.4"
1818

19+
test_deps = [
20+
# Test timeouts
21+
"timeout_decorator",
22+
]
23+
24+
extras = {
25+
'test': test_deps,
26+
}
1927

2028
# Adapted from https://github.com/pypa/pip/blob/master/setup.py
2129
def find_version(*file_paths):
@@ -35,7 +43,6 @@ def find_version(*file_paths):
3543

3644
raise RuntimeError("Unable to find version string.")
3745

38-
3946
setup(
4047
name="readability-lxml",
4148
version=find_version("readability", "__init__.py"),
@@ -52,6 +59,8 @@ def find_version(*file_paths):
5259
lxml_requirement,
5360
"cssselect"
5461
],
62+
tests_require=test_deps,
63+
extras_require=extras,
5564
classifiers=[
5665
"Environment :: Web Environment",
5766
"Intended Audience :: Developers",

tests/test_article_only.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import unittest
33

44
from readability import Document
5+
import timeout_decorator
56

67

78
SAMPLES = os.path.join(os.path.dirname(__file__), 'samples')
@@ -94,3 +95,14 @@ def test_correct_cleanup(self):
9495
assert('punctuation' in s)
9596
assert(not 'comment' in s)
9697
assert(not 'aside' in s)
98+
99+
# Regression test: a very long run of spaces used to make the regexes in clean() take an extremely long time
100+
@timeout_decorator.timeout(seconds=3, use_signals=False)
def test_many_repeated_spaces(self):
    # Build a paragraph whose text is followed by one million spaces, then
    # check that summarising it still yields the paragraph text. The
    # decorator above is configured with a 3 second timeout.
    padding = ' ' * 1000000
    html = '<html><body><p>foo' + padding + '</p></body></html>'

    summary = Document(html).summary()

    assert 'foo' in summary

tox.ini

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@ deps=pytest
1616
# $PYTHONDIR\Scripts\pip.exe install *.whl
1717
sitepackages=True
1818
commands =
19-
pip install -r requirements.txt
19+
pip install -r requirements.txt -e ".[test]"
2020
py.test

0 commit comments

Comments
 (0)