@@ -53,7 +53,14 @@ def extract_one_meta(document):
def __call__(self, html, **kwargs):
    """Make the instance callable by delegating to ``self.extract``.

    Args:
        html: Passed through unchanged as the first argument of
            ``self.extract``.
        **kwargs: Keyword options forwarded verbatim to ``self.extract``.

    Returns:
        Whatever ``self.extract(html, **kwargs)`` returns.
    """
    return self.extract(html, **kwargs)
5555
56- def extract (self , html , encoding = None , as_blocks = False , extract_target = None , debug = True , metadata_mining = True ):
56+ def extract (self , html ,
57+ encoding = None ,
58+ as_blocks = False ,
59+ extract_target = None ,
60+ debug = False ,
61+ metadata_mining = True ,
62+ ** kwargs ):
63+
5764 if isinstance (html , (str , bytes , unicode_ , np .unicode_ )):
5865 documents_meta_data = {}
5966 if metadata_mining :
@@ -78,11 +85,11 @@ def extract(self, html, encoding=None, as_blocks=False, extract_target=None, deb
7885
7986 output = self .content_extractor .predict (html )
8087 if isinstance (output , dict ):
81- return self .postprocess (html , output , documents_meta_data )
88+ return self .postprocess (html , output , documents_meta_data , ** kwargs )
8289
83- return [ self .postprocess (h , o , meta ) for h , o , meta in zip (html , output , documents_meta_data )]
90+ return [ self .postprocess (h , o , meta , ** kwargs ) for h , o , meta in zip (html , output , documents_meta_data )]
8491
85- def postprocess (self , html , output , meta ):
92+ def postprocess (self , html , output , meta , ** kwargs ):
8693 results = {}
8794 if 'author' in output and len (output ['author' ]) > 0 :
8895 author_text , confidence = output ['author' ][0 ]
@@ -119,7 +126,9 @@ def postprocess(self, html, output, meta):
119126 results = priority_merge (post_ml_results_ , results )
120127
121128 sanity_check_params = {}
122- if 'url' in results :
129+ if 'url' in kwargs :
130+ sanity_check_params ['url' ] = kwargs ['url' ]
131+ elif 'url' in results :
123132 sanity_check_params ['url' ] = results ['url' ]
124133
125134 return attribute_sanity_check (results , ** sanity_check_params )
0 commit comments