Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 103 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
name: CI Pipeline

on:
  push:
  pull_request:

env:
  REGISTRY: ghcr.io
  IMAGE_NAME: ${{ github.repository }}

jobs:
  test-and-build-stats:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for stats image
        id: meta-stats
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=sha
            type=raw,value=latest,enable={{is_default_branch}}

      # Build only (no push) so unit tests run against the fresh image first.
      # The full-SHA tag below is what the test step references.
      - name: Build stats Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./stats.Dockerfile
          push: false
          tags: |
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats:${{ github.sha }}
            ${{ steps.meta-stats.outputs.tags }}
          labels: ${{ steps.meta-stats.outputs.labels }}

      - name: Run unit tests
        run: |
          docker run --rm \
            -v ${{ github.workspace }}/tests:/app/tests \
            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats:${{ github.sha }} \
            python -m pytest -s tests/

      # Push on direct pushes to this repository, and on pull requests that
      # originate from this repository (fork PRs have a read-only token).
      # NOTE: github.event.pull_request is empty on push events, so that
      # comparison alone would never be true for pushes — event_name is
      # checked explicitly.
      - name: Push stats Docker image
        if: success() && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./stats.Dockerfile
          push: true
          tags: ${{ steps.meta-stats.outputs.tags }}
          labels: ${{ steps.meta-stats.outputs.labels }}

  build-site:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      packages: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Log in to Container Registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Extract metadata for site image
        id: meta-site
        uses: docker/metadata-action@v5
        with:
          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/site
          tags: |
            type=ref,event=branch
            type=ref,event=pr
            type=sha
            type=raw,value=latest,enable={{is_default_branch}}

      # Same push rule as the stats job: allow push events and same-repo PRs.
      - name: Build and push site Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./site.Dockerfile
          push: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
          tags: ${{ steps.meta-site.outputs.tags }}
          labels: ${{ steps.meta-site.outputs.labels }}
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,30 @@ To preview local changes, it's possible to serve the site locally:

... and then the site will be served on http://0.0.0.0:4000 instead. (You will of course need to rebuild the Docker image after updating the Dockerfile.)


Run via Container
-----------------

The whole workflow can be run as a container (docker or podman) including downloading stats files from Common Crawl's S3 bucket and generating new plots.

```bash
# clone the repository (to have the latest crawl IDs)
git clone https://github.com/commoncrawl/cc-crawl-statistics.git
cd cc-crawl-statistics

# download stats and generate plots
# the SSH and AWS credential directories and the stats/ and plots/ directories must be mounted into the container
podman run --rm -v ~/.ssh:/root/.ssh:ro -v ~/.aws:/root/.aws:ro -v $(pwd -P)/stats:/app/stats -v $(pwd -P)/plots:/app/plots ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest

# if needed you can manually build the container image
podman build -f stats.Dockerfile -t ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest

# for development it is recommended to mount the whole repository into the container
podman run -it -v ~/.ssh:/root/.ssh:ro -v ~/.aws:/root/.aws:ro -v $(pwd -P):/app ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest /bin/bash

```


Related Projects
----------------

Expand Down
14 changes: 11 additions & 3 deletions crawlplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,14 @@
from rpy2.robjects.lib import ggplot2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
GGPLOT2_THEME = ggplot2.theme_minimal()
# use minimal theme with white background set in plot constructor
# https://ggplot2.tidyverse.org/reference/ggtheme.html
GGPLOT2_THEME = ggplot2.theme_minimal(base_size=12, base_family="Helvetica")

GGPLOT2_THEME_KWARGS = {
'panel.background': ggplot2.element_rect(fill='white', color='white'),
'plot.background': ggplot2.element_rect(fill='white', color='white')
}
# GGPLOT2_THEME = ggplot2.theme_grey()


Expand Down Expand Up @@ -48,10 +55,11 @@ def line_plot(self, data, title, ylabel, img_file,
data['size'] = data['size'].astype(float)
p = ggplot2.ggplot(data) \
+ ggplot2.aes_string(x=x, y=y, color=c) \
+ ggplot2.geom_line(linewidth=.2) + ggplot2.geom_point() \
+ ggplot2.geom_line(linewidth=.5) + ggplot2.geom_point() \
+ GGPLOT2_THEME \
+ ggplot2.theme(**{'legend.position': 'bottom',
'aspect.ratio': ratio}) \
'aspect.ratio': ratio,
**GGPLOT2_THEME_KWARGS}) \
+ ggplot2.labs(title=title, x='', y=ylabel, color=clabel)
img_path = os.path.join(PLOTDIR, img_file)
p.save(img_path)
Expand Down
17 changes: 17 additions & 0 deletions get_stats_and_plot.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Download the latest crawl statistics and regenerate all plots.
# -e: abort on any command failure; -u: treat unset variables as errors;
# -o pipefail: a failure anywhere in a pipeline fails the pipeline.
set -euo pipefail

echo "Starting ..."

./get_stats.sh

# make sure all plot output directories exist before the plot scripts write into them
mkdir -p plots/crawler plots/crawloverlap plots/crawlsize plots/throughput plots/tld

./plot.sh

echo "Done."
8 changes: 5 additions & 3 deletions plot/crawl_size.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@

from rpy2.robjects.lib import ggplot2
from rpy2.robjects import pandas2ri
from rpy2 import robjects

from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME
from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS

from crawlstats import CST, CrawlStatsJSONDecoder, HYPERLOGLOG_ERROR,\
MonthlyCrawl
Expand Down Expand Up @@ -286,9 +287,10 @@ def plot(self):
color='black', size=2,
position=ggplot2.position_dodge(width=.5)) \
+ GGPLOT2_THEME \
+ ggplot2.scale_fill_hue() \
+ ggplot2.scale_fill_manual(values=robjects.r('c("duplicate"="#00BA38", "revisit"="#619CFF", "new"="#F8766D")')) \
+ ggplot2.theme(**{'legend.position': 'right',
'aspect.ratio': .7},
'aspect.ratio': .7,
**GGPLOT2_THEME_KWARGS},
**{'axis.text.x':
ggplot2.element_text(angle=45, size=10,
vjust=1, hjust=1)}) \
Expand Down
8 changes: 5 additions & 3 deletions plot/crawler_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from rpy2.robjects.lib import ggplot2
from rpy2.robjects import pandas2ri

from crawlplot import PLOTDIR, GGPLOT2_THEME
from crawlplot import PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS

from crawlstats import CST, MultiCount
from crawl_size import CrawlSizePlot
Expand Down Expand Up @@ -143,7 +143,8 @@ def plot_fetch_status(self, data, row_filter, img_file, ratio=1.0):
guide=ggplot2.guide_legend(reverse=True)) \
+ GGPLOT2_THEME \
+ ggplot2.theme(**{'legend.position': 'bottom',
'aspect.ratio': ratio}) \
'aspect.ratio': ratio,
**GGPLOT2_THEME_KWARGS}) \
+ ggplot2.labs(title='Percentage of Fetch Status',
x='', y='', fill='')
img_path = os.path.join(PLOTDIR, img_file)
Expand Down Expand Up @@ -172,7 +173,8 @@ def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0):
guide=ggplot2.guide_legend(reverse=False)) \
+ GGPLOT2_THEME \
+ ggplot2.theme(**{'legend.position': 'bottom',
'aspect.ratio': ratio}) \
'aspect.ratio': ratio,
**GGPLOT2_THEME_KWARGS}) \
+ ggplot2.labs(title='CrawlDb Size and Status Counts',
x='', y='', fill='')
img_path = os.path.join(PLOTDIR, img_file)
Expand Down
3 changes: 2 additions & 1 deletion plot/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from rpy2.robjects.lib import ggplot2
from rpy2.robjects import pandas2ri

from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME
from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS

pandas2ri.activate()

Expand Down Expand Up @@ -119,6 +119,7 @@ def plot_domain_cumul(self, crawl):
+ ggplot2.aes_string(x='cum_domains', y='cum_urls') \
+ ggplot2.geom_line() + ggplot2.geom_point() \
+ GGPLOT2_THEME \
+ ggplot2.theme(**GGPLOT2_THEME_KWARGS) \
+ ggplot2.labs(title=title, x='domains cumulative',
y='URLs cumulative') \
+ ggplot2.scale_y_log10() \
Expand Down
5 changes: 3 additions & 2 deletions plot/overlap.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

import pygraphviz

from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME
from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS

pandas2ri.activate()

Expand Down Expand Up @@ -135,7 +135,8 @@ def plot_similarity_matrix(self, item_type, image_file, title):
+ ggplot2.coord_fixed() \
+ ggplot2.theme(**{'axis.text.x':
ggplot2.element_text(angle=45,
vjust=1, hjust=1)}) \
vjust=1, hjust=1),
**GGPLOT2_THEME_KWARGS}) \
+ ggplot2.labs(title=title, x='', y='') \
+ ggplot2.geom_text(color='black', size=textsize)
img_path = os.path.join(PLOTDIR, image_file)
Expand Down
3 changes: 2 additions & 1 deletion plot/tld_by_continent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from rpy2.robjects.lib import ggplot2

from crawlplot import PLOTDIR, GGPLOT2_THEME
from crawlplot import PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS
from crawlstats import MonthlyCrawl, MultiCount
from top_level_domain import TopLevelDomain

Expand Down Expand Up @@ -226,6 +226,7 @@ def tld2continent(tld):
x='', y='Percentage', fill='TLD / Continent') \
+ ggplot2.theme(**{'legend.position': 'right',
'aspect.ratio': .7,
**GGPLOT2_THEME_KWARGS,
'axis.text.x':
ggplot2.element_text(angle=45,
vjust=1, hjust=1)})
Expand Down
Binary file modified plots/crawler/crawldb_status.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawler/fetch_status_percentage.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawler/metrics.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawler/url_protocols.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawler/url_protocols_percentage.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawloverlap/crawlsimilarity_matrix_digest.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawloverlap/crawlsimilarity_matrix_url.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/cumulative.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/digest_last_n_crawls.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/domain.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/monthly.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/monthly_new.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/registered-domains.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/url_last_n_crawls.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/url_page_ratio_last_n_crawls.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/crawlsize/url_status_by_year.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/tld/groups.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified plots/tld/tlds-by-year-and-continent.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
13 changes: 9 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
hyperloglog==0.0.14
isoweek
mrjob
tldextract
ujson
isoweek==1.3.3
mrjob==0.7.4
tldextract==5.1.2
ujson==5.10.0

# tests
pytest
jsonpickle
setuptools
11 changes: 6 additions & 5 deletions requirements_plot.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
ggplot
idna
pandas
pygraphviz
rpy2
ggplot==0.11.5
idna==3.7
#pandas==2.1.4+dfsg
pandas==2.1.4
pygraphviz==1.13
rpy2==3.5.15
40 changes: 40 additions & 0 deletions stats.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Replicating pjox/cc-crawl-statistics
FROM python:3.12

# Install system dependencies.
# --no-install-recommends keeps the image lean; the apt list cleanup in the
# same layer prevents the package index from being baked into the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        python3-rpy2 \
        r-cran-ggplot2 \
        graphviz-dev \
        r-base \
        jq \
        awscli \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Copy dependency config files first so this layer is cached until the
# requirements files actually change.
COPY requirements.txt .
COPY requirements_plot.txt .

# Install Python dependencies (--no-cache-dir avoids storing pip's wheel
# cache in the image layer)
RUN pip3 install --no-cache-dir -r requirements.txt
RUN pip3 install --no-cache-dir -r requirements_plot.txt

# Copy the remaining repository files
COPY stats/crawler ./stats/crawler
COPY plots/ ./plots/
COPY plot/ ./plot/
COPY tests/ ./tests/

COPY *.sh ./
COPY *.py ./
COPY _config.yml ./

# Set PYTHONPATH environment variable
ENV PYTHONPATH=/app

# ggplot2 is already installed via r-cran-ggplot2 system package above

# Default command
CMD ["./get_stats_and_plot.sh"]