Skip to content

Commit 3fb1a1d

Browse files
author
theblackcat102
committed
Add a default value when the HTML structure is insufficient
1 parent f02c54d commit 3fb1a1d

File tree

6 files changed

+15
-5
lines changed

6 files changed

+15
-5
lines changed

extractnet/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
from extractnet.pipeline import Extractor
22

3-
__version__ = '2.0.5'
3+
__version__ = '2.0.6'
44

55

66
_LOADED_MODELS = {}

extractnet/metadata_extraction/utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,7 @@
2828
AUTHOR_REMOVE_SPECIAL, AUTHOR_REPLACE_JOIN, AUTHOR_REMOVE_NICKNAME,
2929
AUTHOR_REMOVE_NUMBERS, AUTHOR_REMOVE_PREPOSITION, AUTHOR_TWITTER
3030
)
31+
3132
LOGGER = logging.getLogger(__name__)
3233

3334

@@ -149,7 +150,6 @@ def load_html(htmlobject, encoding='utf-8'):
149150
# further test: is it (well-formed) HTML at all?
150151
if tree is not None and check_flag is True:
151152
if len(tree) < 2:
152-
LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree))
153153
tree = None
154154
#if tree is None:
155155
# if isinstance(htmlobject, bytes) or isinstance(htmlobject, str):

extractnet/nn_models.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from .util import get_and_union_features, get_module_res, fix_encoding
77
from .blocks import TagCountReadabilityBlockifier
88

9-
9+
EMPTY_HTML = "<article><p>content</p><p>blocked</p><p>404</p></article>"
1010

1111
class NewsNet():
1212
'''
@@ -30,6 +30,10 @@ def __init__(self, model_weight=None, cls_threshold=0.1, binary_threshold=0.5):
3030

3131
def preprocess(self, html):
3232
blocks = TagCountReadabilityBlockifier.blockify(html, encoding='utf-8')
33+
if len(blocks) == 0: # warning failed extraction
34+
blocks = TagCountReadabilityBlockifier.blockify(EMPTY_HTML, encoding='utf-8')
35+
elif len(blocks) < 3: # pad block
36+
blocks = [blocks[0]]+blocks + [blocks[-1]]
3337
blocks = np.array(blocks)
3438
feat = self.feature_transform.transform(blocks).astype(np.float32)
3539
return feat, blocks

extractnet/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -121,5 +121,5 @@ def postprocess(self, html, output, meta):
121121
sanity_check_params = {}
122122
if 'url' in results:
123123
sanity_check_params['url'] = results['url']
124-
print(sanity_check_params)
124+
125125
return attribute_sanity_check(results, **sanity_check_params)

extractnet/util.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -219,7 +219,7 @@ def attribute_sanity_check(content, **kwargs):
219219
if isinstance(date, str):
220220
content['date'] = dateparser.parse(date)
221221

222-
if 'url' in kwargs:
222+
if 'url' in kwargs and 'date' in content:
223223
url = kwargs['url']
224224
content['date'] = validate_date(url, content['date'])
225225

test/test_readability.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,3 +19,9 @@ def test_readability_make_readability_features():
1919
expected_features = np.array(json.loads(fin.read()))
2020
assert np.allclose(actual_features.flatten(), expected_features, rtol=0.0005)
2121
assert actual_features.shape[1] == 1
22+
23+
24+
def test_default_features():
25+
EMPTY_HTML = "<article><p>content</p><p>blocked</p><p>404</p></article>"
26+
blks = TagCountReadabilityBlockifier.blockify(EMPTY_HTML)
27+
print(blks)

0 commit comments

Comments
 (0)