acl-org · mjpost · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025 · Sep 18, 2025
diff --git a/README.md b/README.md
@@ -122,6 +122,35 @@ You will need this software on the server
 Note that generating the anthology takes quite a bit of RAM, so make
 sure it is available on your machine.
 
+## PDF Watermark / Footer Tool
+
+The Anthology includes a lightweight tool to add an ACL‑style footer (first page) and optional page numbers to arbitrary PDFs.
+
+Components:
+
+* `hugo/static/watermark.html` – Client interface (drag/drop PDF, footer text, starting page number).
+* `hugo/static/cgi-bin/watermark.cgi` – CGI endpoint invoking `bin/add_footer.py`.
+* `bin/add_footer.py` – Core logic (already part of the repository) supporting inline italics with `<i>…</i>` and multi‑line centered layout.
+
+Setup:
+1. Create a Python 3.10 virtual environment at `/opt/venv/watermark`.
+2. Activate it and install the dependencies: `pip install pypdf reportlab`.
+
+Usage:
+1. Serve the site (or just open the HTML file if CGI is reachable at `/cgi-bin/watermark.cgi`).
+2. Browse to `https://aclanthology.org/watermark.html`.
+3. Provide an optional starting page number and a multi‑line footer block (press Enter for new lines). Use `<i>…</i>` for inline italics.
+4. Click “Generate PDF” to download the processed file (`*.watermarked.pdf`).
+
+Server / security notes:
+* Upload limit: 25 MB (client) / 30 MB (server hard cap).
+* Basic validation checks for the `%PDF-` header.
+* Uses a temporary directory per request; nothing is persisted.
+* Requires a Python environment that satisfies the dependencies of `add_footer.py` (pypdf, reportlab).
+* Errors are returned as plain text with HTTP status codes.
+
+To disable, simply remove the HTML or CGI script; no other components are affected.
+
 ## Contributing
 
 If you'd like to contribute to the ACL Anthology, please take a look at:

diff --git a/bin/add_footer.py b/bin/add_footer.py
@@ -0,0 +1,215 @@
+#!/usr/bin/env python3
+r"""
+Add ACL-like footer (first page) and optional page numbers (all pages).
+Inline italics with <i>…</i>.
+
+Examples:
+    python add_footer.py in.pdf out.pdf \
+        "<i>Proceedings … pages 8697–8727</i>\nJuly 27 - August 1, 2025 ©2025 ACL"
+    python add_footer.py -p 199 in.pdf out.pdf "…"
+    python add_footer.py -p 199 --footer-size 9 --pagenum-size 10 --bottom-margin 14 in.pdf out.pdf "…"
+
+Copyright 2025, Matt Post
+"""
+
+
+import io
+import re
+import argparse
+from pathlib import Path
+from pypdf import PdfReader, PdfWriter
+from reportlab.pdfgen import canvas
+
+# Defaults tuned for ACL footer look
+DEFAULT_BOTTOM_MARGIN_PT = 14  # baseline distance from the page bottom (pt)
+DEFAULT_LINE_SPACING = 1.2  # footer line height as a multiple of the font size
+DEFAULT_FOOTER_SIZE = 9  # footer text size
+DEFAULT_PAGENUM_SIZE = 11  # page number size
+
+FONT_REG = "Times-Roman"  # used for non-italic spans and page numbers
+FONT_ITAL = "Times-Italic"  # used for spans inside <i>…</i>
+
+TAG_RE = re.compile(r"(</?i>)")  # capture group keeps the tags in split() output
+
+
+def parse_inline_italics(s):
+    """Yield (text, is_italic) spans from a string with <i>…</i> regions."""
+    parts = TAG_RE.split(s)
+    italic = False
+    for tok in parts:
+        if tok == "<i>":
+            italic = True
+        elif tok == "</i>":
+            italic = False
+        elif tok:
+            yield tok, italic
+
+
+def measure_line(c, line, size):
+    """Total width of a mixed-style line."""
+    w = 0.0
+    for txt, it in parse_inline_italics(line):
+        font = FONT_ITAL if it else FONT_REG
+        w += c.stringWidth(txt, font, size)
+    return w
+
+
+def draw_rich_centered(c, page_w, y, line, size):
+    """Draw a mixed-style line centered at y."""
+    total_w = measure_line(c, line, size)
+    x = (page_w - total_w) / 2.0
+    for txt, it in parse_inline_italics(line):
+        font = FONT_ITAL if it else FONT_REG
+        c.setFont(font, size)
+        c.drawString(x, y, txt)
+        x += c.stringWidth(txt, font, size)
+
+
+def mk_footer_overlay(w, h, text_block, bottom_margin, size, line_spacing):
+    """Footer block near bottom: render lines in given order, stacking downward."""
+    buf = io.BytesIO()
+    c = canvas.Canvas(buf, pagesize=(w, h))
+    lines = text_block.split("\n") if text_block else []
+    if not lines:
+        c.showPage()
+        c.save()
+        buf.seek(0)
+        return buf
+
+    line_h = size * line_spacing
+    # Start y so that the FIRST line appears above subsequent lines,
+    # with the LAST line's baseline at bottom_margin.
+    y = bottom_margin + (len(lines) - 1) * line_h
+    for line in lines:
+        draw_rich_centered(c, w, y, line, size)
+        y -= line_h  # next line goes BELOW
+    c.showPage()
+    c.save()
+    buf.seek(0)
+    return buf
+
+
+def mk_pagenum_overlay(w, h, page_num, bottom_margin, size):
+    buf = io.BytesIO()
+    c = canvas.Canvas(buf, pagesize=(w, h))
+    c.setFont(FONT_REG, size)
+    text = str(page_num)
+    tw = c.stringWidth(text, FONT_REG, size)
+    x = (w - tw) / 2.0
+    y = bottom_margin
+    c.drawString(x, y, text)
+    c.showPage()
+    c.save()
+    buf.seek(0)
+    return buf
+
+
+def process(
+    input_pdf,
+    output_pdf,
+    text_block,
+    page_start,
+    bottom_margin,
+    footer_size,
+    pagenum_size,
+    line_spacing,
+):
+    reader = PdfReader(str(input_pdf))
+    writer = PdfWriter()
+
+    footer_cache, pnum_cache = {}, {}
+
+    for idx, page in enumerate(reader.pages, start=1):
+        w = float(page.mediabox.width)
+        h = float(page.mediabox.height)
+
+        disp_num = None if page_start is None else page_start + idx - 1
+
+        # Page number: SAME bottom margin on every page
+        if disp_num is not None:
+            nkey = (w, h, disp_num, pagenum_size, bottom_margin)
+            if nkey not in pnum_cache:
+                pnum_cache[nkey] = PdfReader(
+                    mk_pagenum_overlay(w, h, disp_num, bottom_margin, pagenum_size)
+                ).pages[0]
+            page.merge_page(pnum_cache[nkey])
+
+        # Footer only on first page; place it ABOVE the fixed page number
+        if idx == 1 and text_block:
+            # raise footer so its LAST line sits above the page number by a small gap
+            gap = 0.6 * footer_size
+            footer_bottom = bottom_margin + pagenum_size + gap
+            fkey = (w, h, "footer", footer_size, footer_bottom, line_spacing, text_block)
+            if fkey not in footer_cache:
+                footer_cache[fkey] = PdfReader(
+                    mk_footer_overlay(
+                        w, h, text_block, footer_bottom, footer_size, line_spacing
+                    )
+                ).pages[0]
+            page.merge_page(footer_cache[fkey])
+
+        writer.add_page(page)
+
+    with open(output_pdf, "wb") as f:
+        writer.write(f)
+
+
+def main():
+    ap = argparse.ArgumentParser(
+        description="Add ACL-like footer (first page) and optional page numbers (all pages)."
+    )
+    ap.add_argument(
+        "--page-number",
+        "-p",
+        type=int,
+        metavar="N",
+        help="Enable page numbers starting at N (e.g., -p 5).",
+    )
+    ap.add_argument(
+        "--bottom-margin",
+        type=float,
+        default=14,
+        help="Baseline distance from bottom (pt).",
+    )
+    ap.add_argument(
+        "--footer-size",
+        type=float,
+        default=DEFAULT_FOOTER_SIZE,
+        help="Footer font size (pt).",
+    )
+    ap.add_argument(
+        "--pagenum-size",
+        type=float,
+        default=DEFAULT_PAGENUM_SIZE,
+        help="Page number font size (pt).",
+    )
+    ap.add_argument(
+        "--line-spacing", type=float, default=1.2, help="Footer line spacing multiplier."
+    )
+    ap.add_argument("input_pdf", type=Path)
+    ap.add_argument("output_pdf", type=Path)
+    ap.add_argument(
+        "text_block",
+        nargs="?",
+        default="",
+        help="Footer text for FIRST page only. Use \\n for newlines. Use <i>…</i> for inline italics.",
+    )
+    args = ap.parse_args()
+
+    # normalize literal "\n"
+    args.text_block = args.text_block.replace("\\n", "\n")
+
+    process(
+        args.input_pdf,
+        args.output_pdf,
+        args.text_block,
+        args.page_number,
+        args.bottom_margin,
+        args.footer_size,
+        args.pagenum_size,
+        args.line_spacing,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/add_revision.py b/bin/add_revision.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python3
 # -*- coding: utf-8 -*-
 #
-# Copyright 2019 Matt Post <[email protected]>
+# Copyright 2019–2025 Matt Post <[email protected]>
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,6 +41,7 @@
 import shutil
 import sys
 import tempfile
+import io
 
 from git.repo.base import Repo
 
@@ -58,6 +59,66 @@
 import lxml.etree as ET
 
 from datetime import datetime
+from pypdf import PdfReader, PdfWriter
+from reportlab.pdfgen import canvas
+
+WATERMARK_FONT = "Times-Roman"  # font name passed to setFont for the watermark
+WATERMARK_SIZE = 16  # font size passed to setFont for the watermark
+WATERMARK_LEFT_OFFSET_PT = (
+    27  # distance from left edge in points (50% increase for margin)
+)
+WATERMARK_GRAY = 0.55  # medium gray like arXiv
+
+
+def _make_vertical_watermark_page(w, h, text):
+    """Return a single-page PDF with vertical (rotated 90° CCW) watermark at left."""
+    buf = io.BytesIO()
+    c = canvas.Canvas(buf, pagesize=(w, h))
+    c.saveState()
+    c.setFont(WATERMARK_FONT, WATERMARK_SIZE)
+    c.setFillGray(WATERMARK_GRAY)
+    # Translate slightly from left then rotate so text reads bottom-to-top along left side.
+    c.translate(WATERMARK_LEFT_OFFSET_PT, 0)
+    c.rotate(90)
+    text_w = c.stringWidth(text, WATERMARK_FONT, WATERMARK_SIZE)
+    # Center along original page height (which becomes horizontal span after rotation)
+    x_draw = (h - text_w) / 2.0
+    y_draw = 0
+    c.drawString(x_draw, y_draw, text)
+    c.restoreState()
+    c.showPage()
+    c.save()
+    buf.seek(0)
+    return buf
+
+
+def add_revision_watermark(pdf_path, anth_id, revno, date):
+    """Return path to temp PDF with watermark added to first page (revisions only)."""
+    reader = PdfReader(pdf_path)
+    if not reader.pages:
+        return pdf_path
+    writer = PdfWriter()
+    first = reader.pages[0]
+    w = float(first.mediabox.width)
+    h = float(first.mediabox.height)
+    # Format date as DD-Mon-YYYY (e.g., 17-Sep-2025) for watermark display only.
+    try:
+        dt = datetime.strptime(date, "%Y-%m-%d")
+        display_date = dt.strftime("%d %b %Y")
+    except ValueError:
+        # If already in some unexpected format, just use original string.
+        display_date = date
+    text = f"ACL Anthology ID {anth_id} / revision {revno} / {display_date}"
+    overlay = PdfReader(_make_vertical_watermark_page(w, h, text)).pages[0]
+    first.merge_page(overlay)
+    writer.add_page(first)
+    for p in reader.pages[1:]:
+        writer.add_page(p)
+    fd, tmp_path = tempfile.mkstemp(suffix=".pdf")
+    os.close(fd)
+    with open(tmp_path, "wb") as out_f:
+        writer.write(out_f)
+    return tmp_path
 
 
 def validate_file_type(path):
@@ -101,7 +162,7 @@ def maybe_copy(file_from, file_to):
 
     change_letter = "e" if change_type == "erratum" else "v"
 
-    checksum = compute_hash_from_file(pdf_path)
+    # checksum will be computed after potential watermark insertion
 
     # Files for old-style IDs are stored under anthology-files/pdf/P/P19/*
     # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/*
@@ -130,6 +191,14 @@ def maybe_copy(file_from, file_to):
         for revision in revisions:
             revno = int(revision.attrib["id"]) + 1
 
+        # Insert watermark for revisions before computing checksum / updating XML
+        watermarked_temp_path = None
+        if change_type == "revision":
+            watermarked_temp_path = add_revision_watermark(pdf_path, anth_id, revno, date)
+            pdf_path = watermarked_temp_path
+
+        checksum = compute_hash_from_file(pdf_path)
+
         if not dry_run:
             # Update the URL hash on the <url> tag
             if change_type != "erratum":
@@ -201,6 +270,17 @@ def maybe_copy(file_from, file_to):
     if change_type == "revision":
         maybe_copy(pdf_path, canonical_path)
 
+    # Cleanup temp watermarked file if created
+    if (
+        'watermarked_temp_path' in locals()
+        and watermarked_temp_path
+        and os.path.exists(watermarked_temp_path)
+    ):
+        try:
+            os.remove(watermarked_temp_path)
+        except OSError:
+            pass
+
 
 def main(args):
     change_type = "erratum" if args.erratum else "revision"
@@ -222,6 +302,7 @@ def main(args):
         args.explanation,
         change_type=change_type,
         dry_run=args.dry_run,
+        date=args.date,
     )
 
     if args.path.startswith("http"):

diff --git a/bin/requirements.txt b/bin/requirements.txt
@@ -20,6 +20,7 @@ pytest-cov
 python-slugify>=2.0
 pytz
 PyYAML>=3.0
+reportlab
 requests
 ruff~=0.3.4
 setuptools