
Commit 6e337ea

Merge pull request #772 from flairNLP/add-training-disclaimer
Add `skip_publishers_disallowing_training`
2 parents 1b86722 + 669c285 commit 6e337ea

File tree: 12 files changed (+503, -97 lines)


README.md

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,11 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet
 
 ---
 
+***Disclaimer**: Although we try to provide an indication of whether a publisher has not explicitly objected to the training of AI models on its data, we would like to point out that this information must be verified independently before its content is used.
+More details can be found [here](docs/5_advanced_topics.md#filtering-publishers-for-ai-training).*
+
+
+---
 Fundus is:
 
 * **A static news crawler.**

docs/5_advanced_topics.md

Lines changed: 8 additions & 0 deletions
@@ -4,6 +4,7 @@
 * [How to search for publishers](#how-to-search-for-publishers)
 * [Using `search()`](#using-search)
 * [Working with deprecated publishers](#working-with-deprecated-publishers)
+* [Filtering publishers for AI training](#filtering-publishers-for-ai-training)
 
 # Advanced Topics
 
@@ -33,4 +34,11 @@ When we notice that a publisher is uncrawlable for whatever reason, we will mark
 This mostly has internal usages, since the default value for the `Crawler` `ignore_deprecated` flag is `False`.
 You can alter this behaviour when initiating the `Crawler` and setting the `ignore_deprecated` flag.
 
+## Filtering publishers for AI training
+
+Some publishers explicitly disallow the use of their content for AI training purposes.
+We _try_ to respect these wishes by introducing the `skip_publishers_disallowing_training` parameter in the `crawl()` function.
+Users intending to use Fundus to gather training data for AI models should set this parameter to `True` to avoid collecting articles from publishers that do not want their content used in this way.
+However, since publishers are not required to state this in their robots.txt file, users should additionally check the terms of use of the publishers they want to crawl and set the `disallows_training` attribute of the `Publisher` class accordingly.
+
 In the [next section](6_logging.md) we introduce you to Fundus logging mechanics.
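Below is a minimal usage sketch of the flag described in this documentation change, assuming the usual Fundus entry points (`Crawler`, `PublisherCollection`); exact names may differ between versions:

```python
from fundus import Crawler, PublisherCollection

# Build a crawler over the US publisher collection (any collection works the same way).
crawler = Crawler(PublisherCollection.us)

# Skip publishers whose robots.txt (or manually set `disallows_training` flag)
# indicates that their content should not be used for AI training.
for article in crawler.crawl(max_articles=10, skip_publishers_disallowing_training=True):
    print(article.title)
```

Publishers that neither flag training in their robots.txt nor have `disallows_training` set will still be crawled, so the manual check of their terms of use remains necessary.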

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ dependencies = [
     "dill>=0.3, <1",
     "dict2xml>=1.7.6, <2",
     "xmltodict>=0.13.0, <1",
+    "bidict>=0.23, <1"
 ]
 
 [project.urls]

src/fundus/publishers/base_objects.py

Lines changed: 72 additions & 10 deletions
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from textwrap import indent
-from typing import Dict, Iterator, List, Optional, Set, Type, Union
+from typing import Dict, Iterable, Iterator, List, Optional, Set, Type, Union
 from urllib.robotparser import RobotFileParser
 from warnings import warn
 
@@ -11,7 +11,7 @@
 from fundus.logging import create_logger
 from fundus.parser.base_parser import ParserProxy
 from fundus.scraping.filter import URLFilter
-from fundus.scraping.session import session_handler
+from fundus.scraping.session import _default_header, session_handler
 from fundus.scraping.url import NewsMap, RSSFeed, Sitemap, URLSource
 from fundus.utils.iteration import iterate_all_subclasses
 
@@ -27,43 +27,90 @@ class CustomRobotFileParser(RobotFileParser):
     This is in order to avoid 403 errors when fetching the robots.txt file.
     """
 
+    _disallow_training_keywords: Set[str] = {
+        "machine",
+        "learning",
+        "training",
+        "train",
+        "model",
+        "models",
+        "artificial",
+        "intelligence",
+        "large",
+        "language",
+        "llm",
+        "llms",
+    }
+
+    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
+        self.headers = headers
+        self.disallows_training: bool = False
+        self.url = url
+        super().__init__(url)
+
     # noinspection PyAttributeOutsideInit
-    def read(self, headers: Optional[Dict[str, str]] = None) -> None:
+    def read(self) -> None:
         """Reads the robots.txt URL and feeds it to the parser."""
         try:
             # noinspection PyUnresolvedReferences
             session = session_handler.get_session()
-            response = session.get_with_interrupt(self.url, headers=headers)  # type: ignore[attr-defined]
+            response = session.get_with_interrupt(self.url, headers=self.headers)
         except HTTPError as err:
             if err.response.status_code in (401, 403):
+                logger.warning(
+                    f"Robots {self.url!r} disallowed access with status code {err.response.status_code}."
+                    " Defaulting to disallow all."
+                )
                 self.disallow_all = True
             elif 400 <= err.response.status_code < 500:
                 self.allow_all = True
         else:
             self.parse(response.text.splitlines())
 
+    def parse(self, lines: Iterable[str]) -> None:
+        for line in lines:
+            if line.strip().startswith("#") and set(line.split(" ")) & self._disallow_training_keywords:
+                self.disallows_training = True
+                break
+        super().parse(lines)
+
 
 class Robots:
-    def __init__(self, url: str):
+    def __init__(self, url: str, headers: Optional[Dict[str, str]] = None):
         self.url = url
-        self.robots_file_parser = CustomRobotFileParser(url)
+        self.robots_file_parser = CustomRobotFileParser(url, headers=headers)
         self.ready: bool = False
 
-    def read(self, headers: Optional[Dict[str, str]] = None) -> None:
+    def _read(self) -> None:
         try:
-            self.robots_file_parser.read(headers=headers)
+            self.robots_file_parser.read()
         except (ConnectionError, ReadTimeout):
             logger.warning(f"Could not load robots {self.url!r}. Ignoring robots and continuing.")
             self.robots_file_parser.allow_all = True
         self.ready = True
 
+    def ensure_ready(self) -> None:
+        """Ensure that the robots.txt file is read and parsed."""
+        if not self.ready:
+            self._read()
+
     def can_fetch(self, useragent: str, url: str) -> bool:
+        self.ensure_ready()
         return self.robots_file_parser.can_fetch(useragent, url)
 
     def crawl_delay(self, useragent: str) -> Optional[float]:
+        self.ensure_ready()
         delay = self.robots_file_parser.crawl_delay(useragent)
         return delay if delay is None else float(delay)
 
+    def disallows_training(self) -> bool:
+        self.ensure_ready()
+        return self.robots_file_parser.disallows_training
+
+    def disallow_all(self) -> bool:
+        self.ensure_ready()
+        return self.robots_file_parser.disallow_all
+
 
 class Publisher:
     __name__: str
@@ -83,8 +130,9 @@ def __init__(
         sources: List[URLSource],
         query_parameter: Optional[Dict[str, str]] = None,
         url_filter: Optional[URLFilter] = None,
-        request_header: Optional[Dict[str, str]] = None,
+        request_header: Optional[Dict[str, str]] = _default_header,
         deprecated: bool = False,
+        disallows_training: bool = False,
         suppress_robots: bool = False,
     ):
         """Initialization of a new Publisher object
@@ -98,6 +146,10 @@ def __init__(
                 appended to crawled URLs
             url_filter (Optional[URLFilter]): Regex filter to apply determining URLs to be skipped
             request_header (Optional[Dict[str, str]]): Request header to be used for the GET-request
+            deprecated (bool): If True, the publisher is deprecated and skipped by default
+            disallows_training (bool): If True, the publisher disallows training on its articles in its robots.txt file.
+                Note that this is only an indicator and users should verify the terms of use of the publisher before
+                using the articles for training purposes.
 
         """
         if not (name and domain and parser and sources):
@@ -109,14 +161,20 @@ def __init__(
         self.url_filter = url_filter
         self.request_header = request_header
         self.deprecated = deprecated
-        self.robots = Robots(self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt")
+        self.robots = Robots(
+            url=self.domain + "robots.txt" if self.domain.endswith("/") else self.domain + "/robots.txt",
+            headers=self.request_header,
+        )
+        self._disallows_training = disallows_training
 
         # Temporary fix to compensate for a bug in the RobotsFileParser treating rule lines
         # like /? as / disallowing the entire site. we could think about replacing the urllib
         # implementation with https://github.com/seomoz/reppy
         if suppress_robots:
             self.robots.robots_file_parser.allow_all = True
 
+        # we define the dict here manually instead of using default dict so that we can control
+        # the order in which sources are processed.
         source_mapping: Dict[Type[URLSource], List[URLSource]] = defaultdict(list)
 
         for url_source in sources:
@@ -129,6 +187,10 @@ def __init__(
 
         self._source_mapping = dict(sorted(source_mapping.items(), key=lambda item: self.__SOURCE_ORDER__[item[0]]))
 
+    @property
+    def disallows_training(self) -> bool:
+        return self._disallows_training or self.robots.disallows_training()
+
     @property
     def source_mapping(self) -> Dict[Type[URLSource], List[URLSource]]:
         return self._source_mapping
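To make the detection logic above concrete, here is a standalone sketch that mirrors the keyword check added in `CustomRobotFileParser.parse()` (illustration only, not the library code):

```python
# A robots.txt comment line counts as a training ban if it shares a word with the keyword set.
DISALLOW_TRAINING_KEYWORDS = {
    "machine", "learning", "training", "train", "model", "models",
    "artificial", "intelligence", "large", "language", "llm", "llms",
}


def mentions_training_ban(robots_txt: str) -> bool:
    for line in robots_txt.splitlines():
        # Only comment lines are inspected; the rule lines are left to the regular parser.
        if line.strip().startswith("#") and set(line.split(" ")) & DISALLOW_TRAINING_KEYWORDS:
            return True
    return False


sample = "# Use of this site for training language models is prohibited.\nUser-agent: *\nDisallow: /private/"
print(mentions_training_ban(sample))  # True: the comment contains "training", "language", and "models"
```

Because the check is keyword-based, case-sensitive, and limited to comment lines, it can miss differently phrased bans, which is why the documentation still asks users to verify each publisher's terms of use.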
