|
3 | 3 | import numpy as np |
4 | 4 | from sklearn.base import BaseEstimator, ClassifierMixin |
5 | 5 | from sklearn.ensemble import ExtraTreesClassifier |
| 6 | +import os |
| 7 | +import dateparser |
| 8 | +import joblib |
6 | 9 |
|
7 | 10 | from .compat import string_, str_cast, unicode_ |
8 | 11 | from .util import get_and_union_features, convert_segmentation_to_text |
9 | 12 | from .blocks import TagCountReadabilityBlockifier |
10 | 13 | from .features.author import AuthorFeatures |
11 | 14 | from .sequence_tagger.models import word2features |
12 | 15 |
|
13 | | -import dateparser |
14 | | -import joblib |
15 | | - |
16 | 16 | from sklearn.base import clone |
17 | 17 |
|
# Absolute path of the directory that contains this module. It is joined with
# relative 'models/...' paths to locate the tokenizer files.
# Computed with os.path rather than by stripping a literal '/extractor.py'
# suffix, which silently does nothing on Windows path separators, on compiled
# (.pyc) paths, or if the module is ever renamed.
BASE_EXTRACTOR_DIR = os.path.dirname(os.path.abspath(__file__))
| 19 | + |
18 | 20 | class MultiExtractor(BaseEstimator, ClassifierMixin): |
19 | 21 | """ |
20 | 22 | An sklearn-style classifier that extracts the main content (and/or comments) |
@@ -62,10 +64,15 @@ def state_dict(self): |
62 | 64 | def __init__(self, blockifier=TagCountReadabilityBlockifier, |
63 | 65 | features=('kohlschuetter', 'weninger', 'readability'), |
64 | 66 | model=None, |
65 | | - css_tokenizer_path='extractnet/models/css_tokenizer.pkl.gz', |
66 | | - text_tokenizer_path='extractnet/models/text_tokenizer.pkl.gz', |
| 67 | + css_tokenizer_path=None, |
| 68 | + text_tokenizer_path=None, |
67 | 69 | num_labels=2, prob_threshold=0.5, max_block_weight=200, |
68 | 70 | features_type=None, author_feature_transforms=None): |
| 71 | + if css_tokenizer_path is None: |
| 72 | + css_tokenizer_path = os.path.join(BASE_EXTRACTOR_DIR, 'models/css_tokenizer.pkl.gz') |
| 73 | + if text_tokenizer_path is None: |
| 74 | + text_tokenizer_path = os.path.join(BASE_EXTRACTOR_DIR, 'models/text_tokenizer.pkl.gz') |
| 75 | + |
69 | 76 | self.params = { |
70 | 77 | 'features': features, |
71 | 78 | 'num_labels': num_labels, |
|
0 commit comments