diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..e94d8ce
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,110 @@
+# Builds the stats and site container images, runs the unit tests inside
+# the stats image, and publishes the images to the GitHub Container Registry.
+name: CI Pipeline
+
+on:
+  push:
+  pull_request:
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  test-and-build-stats:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for stats image
+        id: meta-stats
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=sha
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      # Build without pushing; the extra <sha> tag is the one the test step runs.
+      - name: Build stats Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./stats.Dockerfile
+          push: false
+          tags: |
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats:${{ github.sha }}
+            ${{ steps.meta-stats.outputs.tags }}
+          labels: ${{ steps.meta-stats.outputs.labels }}
+
+      - name: Run unit tests
+        run: |
+          docker run --rm \
+            -v ${{ github.workspace }}/tests:/app/tests \
+            ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/stats:${{ github.sha }} \
+            python -m pytest -s tests/
+
+      # Publish on push events and on pull requests from this repository.
+      # github.event.pull_request is empty on push events, so the head-repo
+      # comparison alone would never publish branch or latest tags.
+      - name: Push stats Docker image
+        if: success() && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./stats.Dockerfile
+          push: true
+          tags: ${{ steps.meta-stats.outputs.tags }}
+          labels: ${{ steps.meta-stats.outputs.labels }}
+
+  build-site:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Container Registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata for site image
+        id: meta-site
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/site
+          tags: |
+            type=ref,event=branch
+            type=ref,event=pr
+            type=sha
+            type=raw,value=latest,enable={{is_default_branch}}
+
+      - name: Build and push site Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .
+          file: ./site.Dockerfile
+          # Same publish rule as the stats image: push events and same-repo PRs.
+          push: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }}
+          tags: ${{ steps.meta-site.outputs.tags }}
+          labels: ${{ steps.meta-site.outputs.labels }}
diff --git a/README.md b/README.md index 01fe894..7cd2810 100644 --- a/README.md +++ b/README.md @@ -129,6 +129,30 @@ To preview local changes, it's possible to serve the site locally: ... and then the site will be served on http://0.0.0.0:4000 instead. (You will of course need to rebuild the Docker image after updating the Dockerfile.) + +Run via Container +----------------- + +The whole workflow can be run as a container (docker or podman) including downloading stats files from Common Crawl's S3 bucket and generating new plots.
+ +```bash +# clone the repository (to have the latest crawl IDs) +git clone https://github.com/commoncrawl/cc-crawl-statistics.git +cd cc-crawl-statistics + +# download stats and generate plots +# SSH, AWS keys, and stats and plots directories must be mounted into the container +podman run --rm -v ~/.ssh:/root/.ssh:ro -v ~/.aws:/root/.aws:ro -v $(pwd -P)/stats:/app/stats -v $(pwd -P)/plots:/app/plots ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest + +# if needed you can manually build the container image +podman build -f stats.Dockerfile -t ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest + +# for development it is recommended to mount the whole repository into the container +podman run -it -v ~/.ssh:/root/.ssh:ro -v ~/.aws:/root/.aws:ro -v $(pwd -P):/app ghcr.io/commoncrawl/cc-crawl-statistics/stats:latest /bin/bash + +``` + + Related Projects ---------------- diff --git a/crawlplot.py b/crawlplot.py index 465231c..429054b 100644 --- a/crawlplot.py +++ b/crawlplot.py @@ -11,7 +11,14 @@ from rpy2.robjects.lib import ggplot2 from rpy2.robjects import pandas2ri pandas2ri.activate() - GGPLOT2_THEME = ggplot2.theme_minimal() + # use minimal theme with white background set in plot constructor + # https://ggplot2.tidyverse.org/reference/ggtheme.html + GGPLOT2_THEME = ggplot2.theme_minimal(base_size=12, base_family="Helvetica") + + GGPLOT2_THEME_KWARGS = { + 'panel.background': ggplot2.element_rect(fill='white', color='white'), + 'plot.background': ggplot2.element_rect(fill='white', color='white') + } # GGPLOT2_THEME = ggplot2.theme_grey() @@ -48,10 +55,11 @@ def line_plot(self, data, title, ylabel, img_file, data['size'] = data['size'].astype(float) p = ggplot2.ggplot(data) \ + ggplot2.aes_string(x=x, y=y, color=c) \ - + ggplot2.geom_line(linewidth=.2) + ggplot2.geom_point() \ + + ggplot2.geom_line(linewidth=.5) + ggplot2.geom_point() \ + GGPLOT2_THEME \ + ggplot2.theme(**{'legend.position': 'bottom', - 'aspect.ratio': ratio}) \ + 'aspect.ratio': ratio, 
**GGPLOT2_THEME_KWARGS}) \ + ggplot2.labs(title=title, x='', y=ylabel, color=clabel) img_path = os.path.join(PLOTDIR, img_file) p.save(img_path)
diff --git a/get_stats_and_plot.sh b/get_stats_and_plot.sh
new file mode 100755
index 0000000..7bcf544
--- /dev/null
+++ b/get_stats_and_plot.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Runs get_stats.sh followed by plot.sh; aborts on the first failing command.
+set -e
+
+echo "Starting ..."
+
+./get_stats.sh
+
+# create the plot output directories if they are missing
+for plot_dir in crawler crawloverlap crawlsize throughput tld; do
+    mkdir -p "plots/${plot_dir}"
+done
+
+./plot.sh
+
+echo "Done."
\ No newline at end of file
diff --git a/plot/crawl_size.py b/plot/crawl_size.py index 7cab6b1..e858a87 100644 --- a/plot/crawl_size.py +++ b/plot/crawl_size.py @@ -9,8 +9,9 @@ from rpy2.robjects.lib import ggplot2 from rpy2.robjects import pandas2ri +from rpy2 import robjects -from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME +from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS from crawlstats import CST, CrawlStatsJSONDecoder, HYPERLOGLOG_ERROR,\ MonthlyCrawl @@ -286,9 +287,10 @@ def plot(self): color='black', size=2, position=ggplot2.position_dodge(width=.5)) \ + GGPLOT2_THEME \ - + ggplot2.scale_fill_hue() \ + + ggplot2.scale_fill_manual(values=robjects.r('c("duplicate"="#00BA38", "revisit"="#619CFF", "new"="#F8766D")')) \ + ggplot2.theme(**{'legend.position': 'right', - 'aspect.ratio': .7}, + 'aspect.ratio': .7, + **GGPLOT2_THEME_KWARGS}, **{'axis.text.x': ggplot2.element_text(angle=45, size=10, vjust=1, hjust=1)}) \ diff --git a/plot/crawler_metrics.py b/plot/crawler_metrics.py index 062c6ce..c9b1588 100644 --- a/plot/crawler_metrics.py +++ b/plot/crawler_metrics.py @@ -7,7 +7,7 @@ from rpy2.robjects.lib import ggplot2 from rpy2.robjects import pandas2ri -from crawlplot import PLOTDIR, GGPLOT2_THEME +from crawlplot import PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS from crawlstats import CST, MultiCount from crawl_size import CrawlSizePlot @@ -143,7 +143,8 @@ def
plot_fetch_status(self, data, row_filter, img_file, ratio=1.0): guide=ggplot2.guide_legend(reverse=True)) \ + GGPLOT2_THEME \ + ggplot2.theme(**{'legend.position': 'bottom', - 'aspect.ratio': ratio}) \ + 'aspect.ratio': ratio, + **GGPLOT2_THEME_KWARGS}) \ + ggplot2.labs(title='Percentage of Fetch Status', x='', y='', fill='') img_path = os.path.join(PLOTDIR, img_file) @@ -172,7 +173,8 @@ def plot_crawldb_status(self, data, row_filter, img_file, ratio=1.0): guide=ggplot2.guide_legend(reverse=False)) \ + GGPLOT2_THEME \ + ggplot2.theme(**{'legend.position': 'bottom', - 'aspect.ratio': ratio}) \ + 'aspect.ratio': ratio, + **GGPLOT2_THEME_KWARGS}) \ + ggplot2.labs(title='CrawlDb Size and Status Counts', x='', y='', fill='') img_path = os.path.join(PLOTDIR, img_file) diff --git a/plot/histogram.py b/plot/histogram.py index a5d7b55..8a77820 100644 --- a/plot/histogram.py +++ b/plot/histogram.py @@ -9,7 +9,7 @@ from rpy2.robjects.lib import ggplot2 from rpy2.robjects import pandas2ri -from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME +from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS pandas2ri.activate() @@ -119,6 +119,7 @@ def plot_domain_cumul(self, crawl): + ggplot2.aes_string(x='cum_domains', y='cum_urls') \ + ggplot2.geom_line() + ggplot2.geom_point() \ + GGPLOT2_THEME \ + + ggplot2.theme(**GGPLOT2_THEME_KWARGS) \ + ggplot2.labs(title=title, x='domains cumulative', y='URLs cumulative') \ + ggplot2.scale_y_log10() \ diff --git a/plot/overlap.py b/plot/overlap.py index 64b34ef..944f228 100644 --- a/plot/overlap.py +++ b/plot/overlap.py @@ -12,7 +12,7 @@ import pygraphviz -from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME +from crawlplot import CrawlPlot, PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS pandas2ri.activate() @@ -135,7 +135,8 @@ def plot_similarity_matrix(self, item_type, image_file, title): + ggplot2.coord_fixed() \ + ggplot2.theme(**{'axis.text.x': ggplot2.element_text(angle=45, - vjust=1, hjust=1)}) \ + vjust=1, 
hjust=1), + **GGPLOT2_THEME_KWARGS}) \ + ggplot2.labs(title=title, x='', y='') \ + ggplot2.geom_text(color='black', size=textsize) img_path = os.path.join(PLOTDIR, image_file) diff --git a/plot/tld_by_continent.py b/plot/tld_by_continent.py index 3c0ab2e..aae09a3 100644 --- a/plot/tld_by_continent.py +++ b/plot/tld_by_continent.py @@ -8,7 +8,7 @@ from rpy2.robjects.lib import ggplot2 -from crawlplot import PLOTDIR, GGPLOT2_THEME +from crawlplot import PLOTDIR, GGPLOT2_THEME, GGPLOT2_THEME_KWARGS from crawlstats import MonthlyCrawl, MultiCount from top_level_domain import TopLevelDomain @@ -226,6 +226,7 @@ def tld2continent(tld): x='', y='Percentage', fill='TLD / Continent') \ + ggplot2.theme(**{'legend.position': 'right', 'aspect.ratio': .7, + **GGPLOT2_THEME_KWARGS, 'axis.text.x': ggplot2.element_text(angle=45, vjust=1, hjust=1)}) diff --git a/plots/crawler/crawldb_status.png b/plots/crawler/crawldb_status.png index 93d1ec9..847a605 100644 Binary files a/plots/crawler/crawldb_status.png and b/plots/crawler/crawldb_status.png differ diff --git a/plots/crawler/fetch_status_percentage.png b/plots/crawler/fetch_status_percentage.png index fde8707..567922c 100644 Binary files a/plots/crawler/fetch_status_percentage.png and b/plots/crawler/fetch_status_percentage.png differ diff --git a/plots/crawler/metrics.png b/plots/crawler/metrics.png index 593aa93..067919b 100644 Binary files a/plots/crawler/metrics.png and b/plots/crawler/metrics.png differ diff --git a/plots/crawler/url_protocols.png b/plots/crawler/url_protocols.png index 20ddde7..b209b01 100644 Binary files a/plots/crawler/url_protocols.png and b/plots/crawler/url_protocols.png differ diff --git a/plots/crawler/url_protocols_percentage.png b/plots/crawler/url_protocols_percentage.png index 050d5f7..33bd88c 100644 Binary files a/plots/crawler/url_protocols_percentage.png and b/plots/crawler/url_protocols_percentage.png differ diff --git a/plots/crawloverlap/crawlsimilarity_matrix_digest.png 
b/plots/crawloverlap/crawlsimilarity_matrix_digest.png index 12e0f3d..7e6c740 100644 Binary files a/plots/crawloverlap/crawlsimilarity_matrix_digest.png and b/plots/crawloverlap/crawlsimilarity_matrix_digest.png differ diff --git a/plots/crawloverlap/crawlsimilarity_matrix_url.png b/plots/crawloverlap/crawlsimilarity_matrix_url.png index 4921576..1acfa79 100644 Binary files a/plots/crawloverlap/crawlsimilarity_matrix_url.png and b/plots/crawloverlap/crawlsimilarity_matrix_url.png differ diff --git a/plots/crawlsize/cumulative.png b/plots/crawlsize/cumulative.png index 570f931..174fc2e 100644 Binary files a/plots/crawlsize/cumulative.png and b/plots/crawlsize/cumulative.png differ diff --git a/plots/crawlsize/digest_last_n_crawls.png b/plots/crawlsize/digest_last_n_crawls.png index 28a85e7..3b4edab 100644 Binary files a/plots/crawlsize/digest_last_n_crawls.png and b/plots/crawlsize/digest_last_n_crawls.png differ diff --git a/plots/crawlsize/domain.png b/plots/crawlsize/domain.png index 732d09f..644c2e7 100644 Binary files a/plots/crawlsize/domain.png and b/plots/crawlsize/domain.png differ diff --git a/plots/crawlsize/monthly.png b/plots/crawlsize/monthly.png index 8b06c29..06ce8e4 100644 Binary files a/plots/crawlsize/monthly.png and b/plots/crawlsize/monthly.png differ diff --git a/plots/crawlsize/monthly_new.png b/plots/crawlsize/monthly_new.png index e2162cd..dfac577 100644 Binary files a/plots/crawlsize/monthly_new.png and b/plots/crawlsize/monthly_new.png differ diff --git a/plots/crawlsize/registered-domains.png b/plots/crawlsize/registered-domains.png index dc4f594..6da5428 100644 Binary files a/plots/crawlsize/registered-domains.png and b/plots/crawlsize/registered-domains.png differ diff --git a/plots/crawlsize/url_last_n_crawls.png b/plots/crawlsize/url_last_n_crawls.png index 0a97fb5..3030b87 100644 Binary files a/plots/crawlsize/url_last_n_crawls.png and b/plots/crawlsize/url_last_n_crawls.png differ diff --git 
a/plots/crawlsize/url_page_ratio_last_n_crawls.png b/plots/crawlsize/url_page_ratio_last_n_crawls.png
index 9c74ec8..35984d3 100644
Binary files a/plots/crawlsize/url_page_ratio_last_n_crawls.png and b/plots/crawlsize/url_page_ratio_last_n_crawls.png differ
diff --git a/plots/crawlsize/url_status_by_year.png b/plots/crawlsize/url_status_by_year.png
index c7b776d..042c1a9 100644
Binary files a/plots/crawlsize/url_status_by_year.png and b/plots/crawlsize/url_status_by_year.png differ
diff --git a/plots/tld/groups.png b/plots/tld/groups.png
index a1658ba..8aa6163 100644
Binary files a/plots/tld/groups.png and b/plots/tld/groups.png differ
diff --git a/plots/tld/tlds-by-year-and-continent.png b/plots/tld/tlds-by-year-and-continent.png
index fca3fd2..4e9eaf4 100644
Binary files a/plots/tld/tlds-by-year-and-continent.png and b/plots/tld/tlds-by-year-and-continent.png differ
diff --git a/requirements.txt b/requirements.txt
index 8829ee0..93e0293 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,10 @@
 hyperloglog==0.0.14
-isoweek
-mrjob
-tldextract
-ujson
\ No newline at end of file
+isoweek==1.3.3
+mrjob==0.7.4
+tldextract==5.1.2
+ujson==5.10.0
+
+# tests
+pytest
+jsonpickle
+setuptools
diff --git a/requirements_plot.txt b/requirements_plot.txt
index 672a5f5..5f6e9e9 100644
--- a/requirements_plot.txt
+++ b/requirements_plot.txt
@@ -1,5 +1,6 @@
-ggplot
-idna
-pandas
-pygraphviz
-rpy2
\ No newline at end of file
+ggplot==0.11.5
+idna==3.7
+#pandas==2.1.4+dfsg
+pandas==2.1.4
+pygraphviz==1.13
+rpy2==3.5.15
diff --git a/stats.Dockerfile b/stats.Dockerfile
new file mode 100644
index 0000000..d683e70
--- /dev/null
+++ b/stats.Dockerfile
@@ -0,0 +1,42 @@
+# Replicating pjox/cc-crawl-statistics
+FROM python:3.12
+
+# Install system dependencies; drop the apt package lists in the same layer
+# so they do not end up in the image
+RUN apt-get update && apt-get install -y \
+    git \
+    python3-rpy2 \
+    r-cran-ggplot2 \
+    graphviz-dev \
+    r-base jq \
+    awscli \
+    && rm -rf /var/lib/apt/lists/*
+
+# Set working directory
+WORKDIR /app
+
+# Copy dependency config files (first for cache)
+COPY requirements.txt .
+COPY requirements_plot.txt .
+
+# Install Python dependencies (no pip download cache in the image)
+RUN pip3 install --no-cache-dir -r requirements.txt
+RUN pip3 install --no-cache-dir -r requirements_plot.txt
+
+# Copy the remaining repository files
+COPY stats/crawler ./stats/crawler
+COPY plots/ ./plots/
+COPY plot/ ./plot/
+COPY tests/ ./tests/
+
+COPY *.sh ./
+COPY *.py ./
+COPY _config.yml ./
+
+# Set PYTHONPATH environment variable
+ENV PYTHONPATH=/app
+
+# ggplot2 is already installed via r-cran-ggplot2 system package above
+
+# Default command
+CMD ["./get_stats_and_plot.sh"]