Commit b139777

Merge branch 'master' into add-swp-publisher
2 parents b3896bd + 716b827

554 files changed (+55213 / -1714 lines)

.github/workflows/publish-package.yml

Lines changed: 27 additions & 3 deletions
@@ -7,9 +7,28 @@ on:
   release:
     types:
       - released
+  workflow_dispatch:
 
 jobs:
 
+  permission-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Guard `workflow_dispatch`
+        if: github.event_name == 'workflow_dispatch'
+        id: check-admin
+        run: |
+          RESPONSE=$(curl -s -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \
+            https://api.github.com/repos/${{ github.repository }}/collaborators/${{ github.actor }}/permission)
+
+          PERMISSION=$(echo "$RESPONSE" | jq -r '.permission')
+
+          if [[ "$PERMISSION" != "admin" ]]; then
+            echo "User ${{ github.actor }} does not have admin rights."
+            exit 1
+          fi
+
   test:
     name: Test the latest release commit
     uses: ./.github/workflows/tests.yml
@@ -23,6 +42,7 @@ jobs:
     needs:
       - test
       - lint
+      - permission-check
     runs-on: ubuntu-latest
 
     steps:
@@ -43,7 +63,7 @@ jobs:
         run: python3 -m build
 
       - name: Store the distribution packages
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
           name: python-package-distributions
           path: dist/
@@ -63,7 +83,7 @@ jobs:
 
     steps:
       - name: Download all the dists
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
         with:
           name: python-package-distributions
           path: dist/
@@ -72,6 +92,8 @@ jobs:
         uses: pypa/gh-action-pypi-publish@release/v1
         with:
           repository-url: https://test.pypi.org/legacy/
+          verbose: true
+
 
       - name: Sleep for 2 minutes
         run: sleep 2m
@@ -113,11 +135,13 @@ jobs:
 
     steps:
      - name: Download all the dists
-        uses: actions/download-artifact@v3
+        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/
 
      - name: Publish distribution 📦 to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          verbose: true
 
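For reference, the guard step above can be reproduced outside of Actions against the same REST endpoint, `GET /repos/{owner}/{repo}/collaborators/{username}/permission`. A minimal Python sketch, assuming a token with read access to the repository in a `GITHUB_TOKEN` environment variable; the repository slug and username are placeholders:

```python
import os

import requests  # third-party; pip install requests

REPO = "flairNLP/fundus"  # placeholder repository slug
ACTOR = "octocat"  # placeholder username

# Same endpoint the workflow step curls; the JSON response carries a
# "permission" field with one of: admin, write, read, none.
response = requests.get(
    f"https://api.github.com/repos/{REPO}/collaborators/{ACTOR}/permission",
    headers={"Authorization": f"token {os.environ['GITHUB_TOKEN']}"},
    timeout=30,
)
response.raise_for_status()

if response.json().get("permission") != "admin":
    raise SystemExit(f"User {ACTOR} does not have admin rights.")
```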

.github/workflows/publisher_coverage.yaml

Lines changed: 13 additions & 7 deletions
@@ -2,13 +2,14 @@ name: Publisher Coverage
 
 on:
   schedule:
-    - cron: '0 1 * * *' # Runs at 01:00
+    - cron: '0 14 * * *' # Runs at 14:00
 
   workflow_dispatch:
 
 jobs:
   validate_crawlers:
     runs-on: ubuntu-latest
+    timeout-minutes: 30
 
     steps:
       - name: Set up Git repository
@@ -25,12 +26,16 @@ jobs:
         run: pip install -e .
 
       - name: Validate Crawlers
+        env:
+          PYTHONPATH: .
+        # Set up a timeout to avoid long-running tests
+        # We skip the Kicker APNews publishers, because they are IP-blocked
         run: |
           set -o pipefail
-          exec python scripts/publisher_coverage.py | tee publisher_coverage.txt
+          timeout 25m python -u scripts/publisher_coverage.py --skip Kicker APNews Tageblatt | tee publisher_coverage.txt
 
       - name: Upload Coverage Report
-        if: success() || failure()
+        if: always()
         uses: actions/upload-artifact@v4
         with:
           name: Publisher Coverage
@@ -61,12 +66,13 @@ jobs:
           echo "TOTAL_PUBLISHERS=$(echo ${{ env.SUCCESS_RATE }} | grep -P -o '\d+' | tail -1)" >> $GITHUB_ENV
           echo "PASSED_PUBLISHERS=$(echo ${{ env.SUCCESS_RATE }} | grep -P -o '\d+' | head -1)" >> $GITHUB_ENV
 
-      - name: Get Red Threshold
-        # We set the badge colour to red when at least one publisher failed the tests.
-        run: echo "RED_THRESHOLD=$(( ${{ env.TOTAL_PUBLISHERS }} - 1 ))" >> $GITHUB_ENV
+      - name: Get Thresholds
+        # We set the badge colour to red when at least half of the publishers failed the tests.
+        run: |
+          echo "RED_THRESHOLD=$(( ${{ env.TOTAL_PUBLISHERS }} / 2 ))" >> $GITHUB_ENV
 
       - name: Create Badge
-        uses: schneegans/dynamic-badges-action@v1.6.0
+        uses: schneegans/dynamic-badges-action@v1.7.0
         with:
           auth: ${{ secrets.DOBBERSC_GIST_SECRET }}
           gistID: ca0ae056b05cbfeaf30fa42f84ddf458
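The `Get Thresholds` step extracts the passed and total publisher counts from `env.SUCCESS_RATE` and turns the badge red once at least half of the publishers fail. A minimal sketch of the same arithmetic in Python, assuming a rate string such as `'182/200'` (the exact format written by `scripts/publisher_coverage.py` is not shown in this diff):

```python
import re


def parse_success_rate(success_rate: str) -> tuple:
    # Mirrors the two grep calls: the first number is passed, the last is total.
    numbers = re.findall(r"\d+", success_rate)
    return int(numbers[0]), int(numbers[-1])


passed, total = parse_success_rate("182/200")  # assumed input format
red_threshold = total // 2  # RED_THRESHOLD=$(( TOTAL_PUBLISHERS / 2 ))
print(passed, total, red_threshold)  # 182 200 100
```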

.github/workflows/tests.yml

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ jobs:
         if: steps.cache.outputs.cache-hit != 'true'
         run: |
           pip install -e .[dev]
-
       - name: Run pytest
         run: python -m pytest -vv
 

README.md

Lines changed: 120 additions & 33 deletions
@@ -18,7 +18,7 @@ Developed at <a href="https://www.informatik.hu-berlin.de/en/forschung-en/gebiet
 <div align="center">
 <hr>
 
-[Quick Start](#quick-start) | [Tutorials](#tutorials) | [News Sources](/docs/supported_publishers.md) | [Paper](https://arxiv.org/abs/2403.15279)
+[Quick Start](#quick-start) | [Tutorials](#tutorials) | [News Sources](/docs/supported_publishers.md) | [Paper](https://aclanthology.org/2024.acl-demos.29/)
 
 </div>
 
@@ -68,24 +68,25 @@ That's already it!
 If you run this code, it should print out something like this:
 
 ```console
-Fundus-Article:
+Fundus-Article including 1 image(s):
 - Title: "Feinstein's Return Not Enough for Confirmation of Controversial New [...]"
-- Text:  "Democrats jammed three of President Joe Biden's controversial court nominees
-  through committee votes on Thursday thanks to a last-minute [...]"
+- Text:  "89-year-old California senator arrived hour late to Judiciary Committee hearing
+  to advance President Biden's stalled nominations Democrats [...]"
 - URL: https://freebeacon.com/politics/feinsteins-return-not-enough-for-confirmation-of-controversial-new-hampshire-judicial-nominee/
-- From: FreeBeacon (2023-05-11 18:41)
+- From: The Washington Free Beacon (2023-05-11 18:41)
 
-Fundus-Article:
+Fundus-Article including 3 image(s):
 - Title: "Northwestern student government freezes College Republicans funding over [...]"
 - Text:  "Student government at Northwestern University in Illinois "indefinitely" froze
   the funds of the university's chapter of College Republicans [...]"
 - URL: https://www.foxnews.com/us/northwestern-student-government-freezes-college-republicans-funding-poster-critical-lgbtq-community
-- From: FoxNews (2023-05-09 14:37)
+- From: Fox News (2023-05-09 14:37)
 ```
 
 This printout tells you that you successfully crawled two articles!
 
 For each article, the printout details:
+- the number of images included in the article
 - the "Title" of the article, i.e. its headline
 - the "Text", i.e. the main article body text
 - the "URL" from which it was crawled
@@ -94,7 +95,7 @@ For each article, the printout details:
 
 ## Example 2: Crawl a specific news source
 
-Maybe you want to crawl a specific news source instead. Let's crawl news articles from Washington Times only:
+Maybe you want to crawl a specific news source instead. Let's crawl news articles from The New Yorker only:
 
 ```python
 from fundus import PublisherCollection, Crawler
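The hunk cuts off inside Example 2's code block, so the crawler initialization is not shown. For orientation, a complete sketch of the updated example; the publisher attribute name is an assumption, check [docs/supported_publishers.md](/docs/supported_publishers.md) for the exact identifier:

```python
from fundus import PublisherCollection, Crawler

# initialize the crawler for The New Yorker
# (attribute name assumed; see docs/supported_publishers.md)
crawler = Crawler(PublisherCollection.us.TheNewYorker)

# crawl 2 articles and print
for article in crawler.crawl(max_articles=2):
    print(article)
```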
@@ -107,21 +108,95 @@
 for article in crawler.crawl(max_articles=2):
     print(article)
 ```
 
-## Example 3: Crawl articles from CC-NEWS
+## Example 3: Crawl 1 Million articles
 
-If you're not familiar with CC-NEWS, check out their [paper](https://paperswithcode.com/dataset/cc-news).
+To crawl such a vast amount of data, Fundus relies on the `CommonCrawl` web archive, in particular the news crawl `CC-NEWS`.
+If you're not familiar with [`CommonCrawl`](https://commoncrawl.org/) or [`CC-NEWS`](https://commoncrawl.org/blog/news-dataset-available) check out their websites.
+Simply import our `CCNewsCrawler` and make sure to check out our [tutorial](docs/2_crawl_from_cc_news.md) beforehand.
 
 ````python
 from fundus import PublisherCollection, CCNewsCrawler
 
-# initialize the crawler for news publishers based in the US
-crawler = CCNewsCrawler(*PublisherCollection.us)
+# initialize the crawler using all publishers supported by fundus
+crawler = CCNewsCrawler(*PublisherCollection)
 
-# crawl 2 articles and print
-for article in crawler.crawl(max_articles=2):
+# crawl 1 million articles and print
+for article in crawler.crawl(max_articles=1000000):
     print(article)
 ````
 
+**_Note_**: By default, the crawler utilizes all available CPU cores on your system.
+For optimal performance, we recommend manually setting the number of processes using the `processes` parameter.
+A good rule of thumb is to allocate `one process per 200 Mbps of bandwidth`.
+This can vary depending on core speed.
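Applying that rule of thumb, a 1000 Mbps connection maps to roughly five worker processes. A short sketch using the `processes` parameter named in the note above; the keyword usage is an assumption, see the CC-NEWS tutorial for the exact signature:

```python
from fundus import PublisherCollection, CCNewsCrawler

# ~1000 Mbps of bandwidth / 200 Mbps per process -> 5 processes (assumed keyword)
crawler = CCNewsCrawler(*PublisherCollection, processes=5)

for article in crawler.crawl(max_articles=1000):
    print(article)
```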
+
+**_Note_**: The crawl above took ~7 hours using the entire `PublisherCollection` on a machine with 1000 Mbps connection, Core i9-13905H, 64GB Ram, Windows 11 and without printing the articles.
+The estimated time can vary substantially depending on the publisher used and the available bandwidth.
+Additionally, not all publishers are included in the `CC-NEWS` crawl (especially US based publishers).
+For large corpus creation, one can also use the regular crawler by utilizing only sitemaps, which requires significantly less bandwidth.
+
+````python
+from fundus import PublisherCollection, Crawler, Sitemap
+
+# initialize a crawler for us/uk based publishers and restrict to Sitemaps only
+crawler = Crawler(PublisherCollection.us, PublisherCollection.uk, restrict_sources_to=[Sitemap])
+
+# crawl 1 million articles and print
+for article in crawler.crawl(max_articles=1000000):
+    print(article)
+````
+
+
+## Example 4: Crawl some images
+
+By default, Fundus tries to parse the images included in every crawled article.
+Let's crawl an article and print out the images for some more details.
+
+```python
+from fundus import PublisherCollection, Crawler
+
+# initialize the crawler for The LA Times
+crawler = Crawler(PublisherCollection.us.LATimes)
+
+# crawl 1 article and print the images
+for article in crawler.crawl(max_articles=1):
+    for image in article.images:
+        print(image)
+```
+
+For [this article](https://www.latimes.com/sports/lakers/story/2024-12-13/lakers-lebron-james-away-from-team-timberwolves) you will get the following output:
+
+```console
+Fundus-Article Cover-Image:
+  -URL: 'https://ca-times.brightspotcdn.com/dims4/default/41c9bc4/2147483647/strip/true/crop/4598x3065+0+0/resize/1200x800!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2F77%2Feb%2F7fed2d3942fd97b0f7325e7060cf%2Flakers-timberwolves-basketball-33765.jpg'
+  -Description: 'Minnesota Timberwolves forward Julius Randle (30) works toward the basket.'
+  -Caption: 'Minnesota Timberwolves forward Julius Randle, left, controls the ball in front of Lakers forward Anthony Davis during the first half of the Lakers’ 97-87 loss Friday.'
+  -Authors: ['Abbie Parr / Associated Press']
+  -Versions: [320x213, 568x379, 768x512, 1024x683, 1200x800]
+
+Fundus-Article Image:
+  -URL: 'https://ca-times.brightspotcdn.com/dims4/default/9a22715/2147483647/strip/true/crop/4706x3137+0+0/resize/1200x800!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2Ff7%2F52%2Fdcd6b263480ab579ac583a4fdbbf%2Flakers-timberwolves-basketball-48004.jpg'
+  -Description: 'Lakers coach JJ Redick talks with forward Anthony Davis during a loss to the Timberwolves.'
+  -Caption: 'Lakers coach JJ Redick, right, talks with forward Anthony Davis during the first half of a 97-87 loss to the Timberwolves on Friday night.'
+  -Authors: ['Abbie Parr / Associated Press']
+  -Versions: [320x213, 568x379, 768x512, 1024x683, 1200x800]
+
+Fundus-Article Image:
+  -URL: 'https://ca-times.brightspotcdn.com/dims4/default/580bae4/2147483647/strip/true/crop/5093x3470+0+0/resize/1200x818!/format/webp/quality/75/?url=https%3A%2F%2Fcalifornia-times-brightspot.s3.amazonaws.com%2F3b%2Fdf%2F64c0198b4c2fb2b5824aaccb64b7%2F1486148-sp-nba-lakers-trailblazers-25-gmf.jpg'
+  -Description: 'Lakers star LeBron James sits in street clothes on the bench next to his son, Bronny James.'
+  -Caption: 'Lakers star LeBron James sits in street clothes on the bench next to his son, Bronny James, during a win over Portland at Crypto.com Arena on Dec. 8.'
+  -Authors: ['Gina Ferazzi / Los Angeles Times']
+  -Versions: [320x218, 568x387, 768x524, 1024x698, 1200x818]
+```
+
+For each image, the printout details:
+- The cover image designation (if applicable).
+- The URL for the highest-resolution version of the image.
+- A description of the image.
+- The image's caption.
+- The name of the copyright holder.
+- A list of all available versions of the image.
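The fields in this printout suggest per-image accessors next to `article.images`. A hedged sketch of reading them programmatically; the attribute names are assumptions inferred from the printed fields, not confirmed by this diff:

```python
from fundus import PublisherCollection, Crawler

crawler = Crawler(PublisherCollection.us.LATimes)

for article in crawler.crawl(max_articles=1):
    for image in article.images:
        # attribute names (url, caption, authors) are assumed
        # from the printed fields above
        print(image.url, image.caption, image.authors)
```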
+
 
 ## Tutorials
 
@@ -131,7 +206,8 @@ We provide **quick tutorials** to get you started with the library:
 2. [**Tutorial 2: How to crawl articles from CC-NEWS**](docs/2_crawl_from_cc_news.md)
 3. [**Tutorial 3: The Article Class**](docs/3_the_article_class.md)
 4. [**Tutorial 4: How to filter articles**](docs/4_how_to_filter_articles.md)
-5. [**Tutorial 5: How to search for publishers**](docs/5_how_to_search_for_publishers.md)
+5. [**Tutorial 5: Advanced topics**](docs/5_advanced_topics.md)
+6. [**Tutorial 6: Logging**](docs/6_logging.md)
 
 If you wish to contribute check out these tutorials:
 1. [**How to contribute**](docs/how_to_contribute.md)
@@ -143,32 +219,43 @@ You can find the publishers currently supported [**here**](/docs/supported_publi
 
 Also: **Adding a new publisher is easy - consider contributing to the project!**
 
-## Evaluation benchmark
+## Evaluation Benchmark
 
 Check out our evaluation [benchmark](https://github.com/dobbersc/fundus-evaluation).
 
-| **Scraper** | **Precision** | **Recall** | **F1-Score** |
-|-------------|---------------------------|---------------------------|---------------------------|
-| [Fundus](https://github.com/flairNLP/fundus) | **99.89**<sub>±0.57</sub> | 96.75<sub>±12.75</sub> | **97.69**<sub>±9.75</sub> |
-| [Trafilatura](https://github.com/adbar/trafilatura) | 90.54<sub>±18.86</sub> | 93.23<sub>±23.81</sub> | 89.81<sub>±23.69</sub> |
-| [BTE](https://github.com/dobbersc/fundus-evaluation/blob/master/src/fundus_evaluation/scrapers/bte.py) | 81.09<sub>±19.41</sub> | **98.23**<sub>±8.61</sub> | 87.14<sub>±15.48</sub> |
-| [jusText](https://github.com/miso-belica/jusText) | 86.51<sub>±18.92</sub> | 90.23<sub>±20.61</sub> | 86.96<sub>±19.76</sub> |
-| [news-please](https://github.com/fhamborg/news-please) | 92.26<sub>±12.40</sub> | 86.38<sub>±27.59</sub> | 85.81<sub>±23.29</sub> |
-| [BoilerNet](https://github.com/dobbersc/fundus-evaluation/tree/master/src/fundus_evaluation/scrapers/boilernet) | 84.73<sub>±20.82</sub> | 90.66<sub>±21.05</sub> | 85.77<sub>±20.28</sub> |
-| [Boilerpipe](https://github.com/kohlschutter/boilerpipe) | 82.89<sub>±20.65</sub> | 82.11<sub>±29.99</sub> | 79.90<sub>±25.86</sub> |
+The following table summarizes the overall performance of Fundus and evaluated scrapers in terms of averaged ROUGE-LSum precision, recall and F1-score and their standard deviation. The table is sorted in descending order over the F1-score:
+
+| **Scraper** | **Precision** | **Recall** | **F1-Score** | **Version** |
+|-------------|:--------------|------------|--------------|-------------|
+| [Fundus](https://github.com/flairNLP/fundus) | **99.89**<sub>±0.57</sub> | 96.75<sub>±12.75</sub> | **97.69**<sub>±9.75</sub> | 0.4.1 |
+| [Trafilatura](https://github.com/adbar/trafilatura) | 93.91<sub>±12.89</sub> | 96.85<sub>±15.69</sub> | 93.62<sub>±16.73</sub> | 1.12.0 |
+| [news-please](https://github.com/fhamborg/news-please) | 97.95<sub>±10.08</sub> | 91.89<sub>±16.15</sub> | 93.39<sub>±14.52</sub> | 1.6.13 |
+| [BTE](https://github.com/dobbersc/fundus-evaluation/blob/master/src/fundus_evaluation/scrapers/bte.py) | 81.09<sub>±19.41</sub> | **98.23**<sub>±8.61</sub> | 87.14<sub>±15.48</sub> | / |
+| [jusText](https://github.com/miso-belica/jusText) | 86.51<sub>±18.92</sub> | 90.23<sub>±20.61</sub> | 86.96<sub>±19.76</sub> | 3.0.1 |
+| [BoilerNet](https://github.com/dobbersc/fundus-evaluation/tree/master/src/fundus_evaluation/scrapers/boilernet) | 85.96<sub>±18.55</sub> | 91.21<sub>±19.15</sub> | 86.52<sub>±18.03</sub> | / |
+| [Boilerpipe](https://github.com/kohlschutter/boilerpipe) | 82.89<sub>±20.65</sub> | 82.11<sub>±29.99</sub> | 79.90<sub>±25.86</sub> | 1.3.0 |
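The scores above are ROUGE-LSum values from the linked benchmark. For orientation, a minimal sketch of computing such a score with the `rouge-score` package; whether the benchmark uses exactly these settings (e.g. stemming) is an assumption:

```python
from rouge_score import rouge_scorer  # pip install rouge-score

scorer = rouge_scorer.RougeScorer(["rougeLsum"], use_stemmer=True)

reference = "The gold-standard article body text."
extraction = "The article body text produced by a scraper."

# score() returns a dict mapping rouge type to (precision, recall, fmeasure)
score = scorer.score(reference, extraction)["rougeLsum"]
print(score.precision, score.recall, score.fmeasure)
```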
 
 ## Cite
 
-Please cite the following [paper](https://arxiv.org/abs/2403.15279) when using Fundus or building upon our work:
+Please cite the following [paper](https://aclanthology.org/2024.acl-demos.29/) when using Fundus or building upon our work:
 
 ```bibtex
-@misc{dallabetta2024fundus,
-    title={Fundus: A Simple-to-Use News Scraper Optimized for High Quality Extractions},
-    author={Max Dallabetta and Conrad Dobberstein and Adrian Breiding and Alan Akbik},
-    year={2024},
-    eprint={2403.15279},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
+@inproceedings{dallabetta-etal-2024-fundus,
+    title = "Fundus: A Simple-to-Use News Scraper Optimized for High Quality Extractions",
+    author = "Dallabetta, Max and
+      Dobberstein, Conrad and
+      Breiding, Adrian and
+      Akbik, Alan",
+    editor = "Cao, Yixin and
+      Feng, Yang and
+      Xiong, Deyi",
+    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)",
+    month = aug,
+    year = "2024",
+    address = "Bangkok, Thailand",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.acl-demos.29",
+    pages = "305--314",
 }
 ```
 

docs/1_getting_started.md

Lines changed: 15 additions & 0 deletions
@@ -4,6 +4,7 @@
 * [What is the `PublisherCollection`](#what-is-the-publishercollection)
 * [What is a `Crawler`](#what-is-a-crawler)
 * [How to crawl articles](#how-to-crawl-articles)
+* [Saving crawled articles](#saving-crawled-articles)
 
 # Basics
 
@@ -83,5 +84,19 @@
 for article in crawler.crawl():
     print(article)
 ````
 
+Additionally, you can set a timeout for the crawler in seconds.
+If the crawler does not receive a new article within the specified timeout period, it will terminate automatically.
+```` python
+for article in crawler.crawl(timeout=10):
+    print(article)
+````
+This is especially useful when working with date-related article filters.
+Refer to [this section](4_how_to_filter_articles.md) to learn more about how to filter articles.
+
+# Saving crawled articles
+
+To save all crawled articles to a file use the `save_to_file` parameter of the `crawl` method.
+When given a path, the crawled articles will be saved as a JSON list using the
+[default article serialization](3_the_article_class.md#saving-an-article) and `UTF-8` encoding.
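Combining the two additions documented above, a short sketch; the file name is illustrative, and whether `crawl` still yields articles while saving is an assumption:

```python
from fundus import PublisherCollection, Crawler

crawler = Crawler(PublisherCollection.us)

# stop after 10 idle seconds and write the crawled articles to a JSON list
for article in crawler.crawl(timeout=10, save_to_file="articles.json"):
    print(article)
```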
 
 In the [next](2_crawl_from_cc_news.md) section we will show you how to crawl articles from the CC-NEWS dataset.
