Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 2 additions & 19 deletions indexer/storyapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
from typing import Dict, List, Optional, TypeAlias, TypedDict
from urllib.parse import urlsplit

from mcmetadata.urls import NON_NEWS_DOMAINS
from mcmetadata.urls import is_non_news_domain
from pika import BasicProperties
from pika.adapters.blocking_connection import BlockingChannel

Expand Down Expand Up @@ -55,23 +55,6 @@ def url_fqdn(url: str) -> str:
return hn.lower()


def non_news_fqdn(fqdn: str) -> bool:
    """
    Return True if a fully qualified domain name (DNS hostname)
    exactly matches, or is a subdomain of, any entry in the
    "non-news" embargo list (NON_NEWS_DOMAINS).

    NOTE: candidate for relocation into mcmetadata.

    Written as an explicit loop rather than any(...) over a
    comprehension: measured ~15% faster on Python 3.10 and,
    per the original author, easier to read.
    """
    candidate = fqdn.lower()
    for domain in NON_NEWS_DOMAINS:
        # match the bare domain itself, or anything ending in ".domain"
        if candidate == domain:
            return True
        if candidate.endswith("." + domain):
            return True
    return False


class StoryMixin(AppProtocol):
"""
The place for Story-specific methods for both
Expand Down Expand Up @@ -161,7 +144,7 @@ def check_story_url(self, url: str) -> bool:

# check for schema?

if non_news_fqdn(hostname):
if is_non_news_domain(hostname):
self.incr_stories("non-news", url)
return False

Expand Down
6 changes: 3 additions & 3 deletions indexer/workers/fetcher/tqfetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

import requests
from mcmetadata.requests_arcana import insecure_requests_session
from mcmetadata.urls import is_non_news_domain
from mcmetadata.webpages import MEDIA_CLOUD_USER_AGENT
from requests.exceptions import RequestException

Expand All @@ -56,7 +57,6 @@
from indexer.storyapp import (
MultiThreadStoryWorker,
StorySender,
non_news_fqdn,
url_fqdn,
)
from indexer.worker import InputMessage, QuarantineException, RequeueException
Expand Down Expand Up @@ -351,7 +351,7 @@ def fetch(self, sess: requests.Session, url: str) -> FetchReturn:
# NOTE: adding a counter here would count each story fetch attempt more than once

logger.info("redirect (%d) => %s", resp.status_code, url)
if non_news_fqdn(fqdn):
if is_non_news_domain(fqdn):
return FetchReturn(None, "non-news2", False) # in redirect

# end infinite redirect loop
Expand Down Expand Up @@ -396,7 +396,7 @@ def get_id(self, story: BaseStory) -> GetIdReturn:
except (TypeError, ValueError):
return GetIdReturn("badurl1", url, fqdn)

if non_news_fqdn(fqdn):
if is_non_news_domain(fqdn):
# unlikely, if queuer does their job!
return GetIdReturn("non-news", url, fqdn)

Expand Down