diff --git a/newspaper/configuration.py b/newspaper/configuration.py
index 94688e705..7ef508457 100644
--- a/newspaper/configuration.py
+++ b/newspaper/configuration.py
@@ -11,6 +11,7 @@ __copyright__ = 'Copyright 2014, Lucas Ou-Yang'
 
 import logging
 
+import requests
 from .parsers import Parser
 from .text import (StopWords, StopWordsArabic, StopWordsChinese,
@@ -63,6 +64,7 @@ def __init__(self):
         # Unique stopword classes for oriental languages, don't toggle
         self.stopwords_class = StopWords
 
+        self.session = requests.Session()
         self.browser_user_agent = 'newspaper/%s' % __version__
         self.headers = {}
         self.request_timeout = 7
diff --git a/newspaper/extractors.py b/newspaper/extractors.py
index 962554014..1e4729783 100644
--- a/newspaper/extractors.py
+++ b/newspaper/extractors.py
@@ -681,18 +681,29 @@ def get_category_urls(self, source_url, doc):
                                'subdomain' % p_url))
                     continue
                 else:
-                    valid_categories.append(scheme + '://' + domain)
-                    # TODO account for case where category is in form
-                    # http://subdomain.domain.tld/category/ <-- still legal!
+                    if subdomain_contains:
+                        valid_categories.append(scheme + '://' + domain)
+                        # TODO account for case where category is in form
+                        # http://subdomain.domain.tld/category/ <-- still legal!
+                    else:
+                        # support for urls like
+                        # http://domain.tld/category/[category]
+                        path_chunks = [x for x in path.split('/') if len(x) > 0]
+                        path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                        path_chunks = [x for x in path_chunks if not x.isnumeric()]
+
+                        if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
+                            valid_categories.append('//' + domain + path)
             else:
                 # we want a path with just one subdir
                 # cnn.com/world and cnn.com/world/ are both valid_categories
+                # carbon-pulse.com/category/international/ is a valid_categories
                 path_chunks = [x for x in path.split('/') if len(x) > 0]
-                if 'index.html' in path_chunks:
-                    path_chunks.remove('index.html')
+                path_chunks = [x for x in path_chunks if x not in set(['index.html', 'category'])]
+                path_chunks = [x for x in path_chunks if not x.isnumeric()]
 
                 if len(path_chunks) == 1 and len(path_chunks[0]) < 14:
-                    valid_categories.append(domain + path)
+                    valid_categories.append(path)
                 else:
                     if self.config.verbose:
                         print(('elim category url %s for >1 path chunks '
@@ -709,8 +720,9 @@ def get_category_urls(self, source_url, doc):
             'tickets', 'coupons', 'forum', 'board', 'archive', 'browse',
             'howto', 'how to', 'faq', 'terms', 'charts', 'services',
             'contact', 'plus', 'admin', 'login', 'signup', 'register',
-            'developer', 'proxy']
+            'developer', 'proxy', 'what-we-offer', 'staff']
+        valid_categories = list(set(valid_categories))
 
         _valid_categories = []
 
         # TODO Stop spamming urlparse and tldextract calls...
@@ -747,8 +759,7 @@ def get_category_urls(self, source_url, doc):
 
         _valid_categories = list(set(_valid_categories))
 
-        category_urls = [urls.prepare_url(p_url, source_url)
-                         for p_url in _valid_categories]
+        category_urls = [urls.prepare_url(p_url, source_url) for p_url in _valid_categories]
         category_urls = [c for c in category_urls if c is not None]
         return category_urls
 
diff --git a/newspaper/network.py b/newspaper/network.py
index 29f0e699d..c91706fab 100644
--- a/newspaper/network.py
+++ b/newspaper/network.py
@@ -55,11 +55,12 @@ def get_html_2XX_only(url, config=None, response=None):
     timeout = config.request_timeout
     proxies = config.proxies
     headers = config.headers
+    session = config.session
 
     if response is not None:
         return _get_html_from_response(response, config)
 
-    response = requests.get(
+    response = session.get(
         url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
 
     html = _get_html_from_response(response, config)
@@ -102,11 +103,12 @@ def __init__(self, url, config=None):
         self.timeout = config.request_timeout
         self.proxies = config.proxies
         self.headers = config.headers
+        self.session = config.session
         self.resp = None
 
     def send(self):
         try:
-            self.resp = requests.get(self.url, **get_request_kwargs(
+            self.resp = self.session.get(self.url, **get_request_kwargs(
                 self.timeout, self.useragent, self.proxies, self.headers))
             if self.config.http_success_only:
                 self.resp.raise_for_status()
diff --git a/newspaper/urls.py b/newspaper/urls.py
index 2126aaf3c..bee1e1e08 100644
--- a/newspaper/urls.py
+++ b/newspaper/urls.py
@@ -211,6 +211,11 @@ def valid_url(url, verbose=False, test=False):
         if verbose: print('%s verified for being a slug' % url)
         return True
 
+    # Allow for paths like /[numeric] (eg: https://carbon-pulse.com/226570/)
+    if len(path_chunks) == 1 and path_chunks[0].isnumeric():
+        if verbose: print('%s verified for isnumeric' % url)
+        return True
+
     # There must be at least 2 subpaths
     if len(path_chunks) <= 1:
         if verbose: print('%s caught for path chunks too small' % url)
diff --git a/newspaper/utils.py b/newspaper/utils.py
index bfa441482..1f2fbe0cd 100644
--- a/newspaper/utils.py
+++ b/newspaper/utils.py
@@ -235,6 +235,7 @@ def inner_function(*args, **kwargs):
             # call the decorated function...
             result = function(*args, **kwargs)
             # ... and save the cached object for next time
+            os.makedirs(cache_folder, exist_ok=True)
             pickle.dump(result, open(filepath, "wb"))
             return result
         return inner_function
@@ -324,6 +325,7 @@ def memoize_articles(source, articles):
         memo_text = ''
 
     # TODO if source: source.write_upload_times(prev_length, new_length)
+    os.makedirs(settings.MEMO_DIR, exist_ok=True)
     ff = codecs.open(d_pth, 'w', 'utf-8')
     ff.write(memo_text)
     ff.close()
diff --git a/requirements.txt b/requirements.txt
index 619746017..4348fccfd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,11 +4,11 @@ feedfinder2>=0.0.4
 feedparser>=5.2.1
 jieba3k>=0.35.1
 lxml>=3.6.0
-nltk>=3.2.1
-Pillow>=3.3.0
+nltk>=3.6.6
+Pillow>=10.0.1
 pythainlp>=1.7.2
 python-dateutil>=2.5.3
 PyYAML>=3.11
 requests>=2.10.0
-tinysegmenter==0.3 # TODO(codelucas): Investigate making this >=0.3
+tinysegmenter>=0.4
 tldextract>=2.0.1
\ No newline at end of file