Add default blocks when the HTML structure does not yield enough blocks

theblackcat102 · theblackcat102 · commit 3fb1a1d87188 · 2022-10-29T22:00:03.000+08:00
diff --git a/extractnet/__init__.py b/extractnet/__init__.py
@@ -1,6 +1,6 @@
 from extractnet.pipeline import Extractor
 
-__version__ = '2.0.5'
+__version__ = '2.0.6'
 
 
 _LOADED_MODELS = {}
diff --git a/extractnet/metadata_extraction/utils.py b/extractnet/metadata_extraction/utils.py
@@ -28,6 +28,7 @@
     AUTHOR_REMOVE_SPECIAL, AUTHOR_REPLACE_JOIN, AUTHOR_REMOVE_NICKNAME, 
     AUTHOR_REMOVE_NUMBERS, AUTHOR_REMOVE_PREPOSITION, AUTHOR_TWITTER
 )
+
 LOGGER = logging.getLogger(__name__)
 
 
@@ -149,7 +150,6 @@ def load_html(htmlobject, encoding='utf-8'):
     # further test: is it (well-formed) HTML at all?
     if tree is not None and check_flag is True:
         if len(tree) < 2:
-            LOGGER.error('parsed tree length: %s, wrong data type or not valid HTML', len(tree))
             tree = None
     #if tree is None:
     #    if isinstance(htmlobject, bytes) or isinstance(htmlobject, str):
diff --git a/extractnet/nn_models.py b/extractnet/nn_models.py
@@ -6,7 +6,7 @@
 from .util import get_and_union_features, get_module_res, fix_encoding
 from .blocks import TagCountReadabilityBlockifier
 
-
+EMPTY_HTML = "<article><p>content</p><p>blocked</p><p>404</p></article>"
 
 class NewsNet():
     '''
@@ -30,6 +30,10 @@ def __init__(self, model_weight=None, cls_threshold=0.1, binary_threshold=0.5):
 
     def preprocess(self, html):
         blocks = TagCountReadabilityBlockifier.blockify(html, encoding='utf-8')
+        if len(blocks) == 0: # warning failed extraction
+            blocks = TagCountReadabilityBlockifier.blockify(EMPTY_HTML, encoding='utf-8')
+        elif len(blocks) < 3: # pad block
+            blocks = [blocks[0]]+blocks + [blocks[-1]]
         blocks = np.array(blocks)
         feat = self.feature_transform.transform(blocks).astype(np.float32)
         return feat, blocks
diff --git a/extractnet/pipeline.py b/extractnet/pipeline.py
@@ -121,5 +121,5 @@ def postprocess(self, html, output, meta):
         sanity_check_params = {}
         if 'url' in results:
             sanity_check_params['url'] = results['url']
-            print(sanity_check_params)
+
         return attribute_sanity_check(results, **sanity_check_params)
diff --git a/extractnet/util.py b/extractnet/util.py
@@ -219,7 +219,7 @@ def attribute_sanity_check(content, **kwargs):
         if isinstance(date, str):
             content['date'] = dateparser.parse(date)
 
-    if 'url' in kwargs:
+    if 'url' in kwargs and 'date' in content:
         url = kwargs['url']
         content['date'] = validate_date(url, content['date'])
 
diff --git a/test/test_readability.py b/test/test_readability.py
@@ -19,3 +19,9 @@ def test_readability_make_readability_features():
         expected_features = np.array(json.loads(fin.read()))
     assert np.allclose(actual_features.flatten(), expected_features, rtol=0.0005)
     assert actual_features.shape[1] == 1
+
+
+def test_default_features():
+    EMPTY_HTML = "<article><p>content</p><p>blocked</p><p>404</p></article>"
+    blks = TagCountReadabilityBlockifier.blockify(EMPTY_HTML)
+    print(blks)