diff --git a/README.md b/README.md index abf4b3a69b..1d6e22ce1a 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,35 @@ You will need this software on the server Note that generating the anthology takes quite a bit of RAM, so make sure it is available on your machine. +## PDF Watermark / Footer Tool + +The Anthology includes a lightweight tool to add an ACL‑style footer (first page) and optional page numbers to arbitrary PDFs. + +Components: + +* `hugo/static/watermark.html` – Client interface (drag/drop PDF, footer text, starting page number). +* `hugo/static/cgi-bin/watermark.cgi` – CGI endpoint invoking `bin/add_footer.py`. +* `bin/add_footer.py` – Core logic (already part of the repository) supporting inline italics with `<i>...</i>` and multi‑line centered layout. + +Setup: +1. Create a python3.10 venv at /opt/venv/watermark +2. Activate and install dependencies: `pip install pypdf reportlab` + +Usage: +1. Serve the site (or just open the HTML file if CGI is reachable at `/cgi-bin/watermark.cgi`). +2. Browse to `https://aclanthology.org/watermark.html`. +3. Provide an optional starting page number and multi‑line footer block (press Enter for new lines). Inline italics via `<i>...</i>`. +4. Click “Generate PDF” to download the processed file (`*.watermarked.pdf`). + +Server / security notes: +* Upload limit: 25MB (client) / 30MB (server hard cap). +* Basic validation checks `%PDF-` header. +* Uses temporary directory per request; no persistence. +* Requires Python environment satisfying dependencies for `add_footer.py` (pypdf, reportlab). +* Errors returned as plain text with HTTP status codes. + +To disable, simply remove the HTML or CGI script; no other components are affected. 
+ ## Contributing If you'd like to contribute to the ACL Anthology, please take a look at: diff --git a/bin/add_footer.py b/bin/add_footer.py new file mode 100755 index 0000000000..6b262e1e56 --- /dev/null +++ b/bin/add_footer.py @@ -0,0 +1,215 @@ +#!/usr/bin/env python3 +""" +Add ACL-like footer (first page) and optional page numbers (all pages). +Inline italics with . + +Examples: + python add_footer.py in.pdf out.pdf \ + "Proceedings … pages 8697–8727\nJuly 27 - August 1, 2025 ©2025 ACL" + python add_footer.py -p 199 in.pdf out.pdf "…" + python add_footer.py -p 199 --footer-size 9 --pagenum-size 10 --bottom-margin 14 in.pdf out.pdf "…" + +Copyright 2025, Matt Post +""" + + +import io +import re +import argparse +from pathlib import Path +from pypdf import PdfReader, PdfWriter +from reportlab.pdfgen import canvas + +# Defaults tuned for ACL footer look +DEFAULT_BOTTOM_MARGIN_PT = 14 +DEFAULT_LINE_SPACING = 1.2 +DEFAULT_FOOTER_SIZE = 9 # footer text size +DEFAULT_PAGENUM_SIZE = 11 # page number size + +FONT_REG = "Times-Roman" +FONT_ITAL = "Times-Italic" + +TAG_RE = re.compile(r"()") + + +def parse_inline_italics(s): + """Yield (text, is_italic) spans from a string with regions.""" + parts = TAG_RE.split(s) + italic = False + for tok in parts: + if tok == "": + italic = True + elif tok == "": + italic = False + elif tok: + yield tok, italic + + +def measure_line(c, line, size): + """Total width of a mixed-style line.""" + w = 0.0 + for txt, it in parse_inline_italics(line): + font = FONT_ITAL if it else FONT_REG + w += c.stringWidth(txt, font, size) + return w + + +def draw_rich_centered(c, page_w, y, line, size): + """Draw a mixed-style line centered at y.""" + total_w = measure_line(c, line, size) + x = (page_w - total_w) / 2.0 + for txt, it in parse_inline_italics(line): + font = FONT_ITAL if it else FONT_REG + c.setFont(font, size) + c.drawString(x, y, txt) + x += c.stringWidth(txt, font, size) + + +def mk_footer_overlay(w, h, text_block, bottom_margin, size, 
line_spacing): + """Footer block near bottom: render lines in given order, stacking downward.""" + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + lines = text_block.split("\n") if text_block else [] + if not lines: + c.showPage() + c.save() + buf.seek(0) + return buf + + line_h = size * line_spacing + # Start y so that the FIRST line appears above subsequent lines, + # with the LAST line's baseline at bottom_margin. + y = bottom_margin + (len(lines) - 1) * line_h + for line in lines: + draw_rich_centered(c, w, y, line, size) + y -= line_h # next line goes BELOW + c.showPage() + c.save() + buf.seek(0) + return buf + + +def mk_pagenum_overlay(w, h, page_num, bottom_margin, size): + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + c.setFont(FONT_REG, size) + text = str(page_num) + tw = c.stringWidth(text, FONT_REG, size) + x = (w - tw) / 2.0 + y = bottom_margin + c.drawString(x, y, text) + c.showPage() + c.save() + buf.seek(0) + return buf + + +def process( + input_pdf, + output_pdf, + text_block, + page_start, + bottom_margin, + footer_size, + pagenum_size, + line_spacing, +): + reader = PdfReader(str(input_pdf)) + writer = PdfWriter() + + footer_cache, pnum_cache = {}, {} + + for idx, page in enumerate(reader.pages, start=1): + w = float(page.mediabox.width) + h = float(page.mediabox.height) + + disp_num = None if page_start is None else page_start + idx - 1 + + # Page number: SAME bottom margin on every page + if disp_num is not None: + nkey = (w, h, disp_num, pagenum_size, bottom_margin) + if nkey not in pnum_cache: + pnum_cache[nkey] = PdfReader( + mk_pagenum_overlay(w, h, disp_num, bottom_margin, pagenum_size) + ).pages[0] + page.merge_page(pnum_cache[nkey]) + + # Footer only on first page; place it ABOVE the fixed page number + if idx == 1 and text_block: + # raise footer so its LAST line sits above the page number by a small gap + gap = 0.6 * footer_size + footer_bottom = bottom_margin + pagenum_size + gap + fkey = (w, h, 
"footer", footer_size, footer_bottom, line_spacing, text_block) + if fkey not in footer_cache: + footer_cache[fkey] = PdfReader( + mk_footer_overlay( + w, h, text_block, footer_bottom, footer_size, line_spacing + ) + ).pages[0] + page.merge_page(footer_cache[fkey]) + + writer.add_page(page) + + with open(output_pdf, "wb") as f: + writer.write(f) + + +def main(): + ap = argparse.ArgumentParser( + description="Add ACL-like footer (first page) and optional page numbers (all pages)." + ) + ap.add_argument( + "--page-number", + "-p", + type=int, + metavar="N", + help="Enable page numbers starting at N (e.g., -p 5).", + ) + ap.add_argument( + "--bottom-margin", + type=float, + default=14, + help="Baseline distance from bottom (pt).", + ) + ap.add_argument( + "--footer-size", + type=float, + default=DEFAULT_FOOTER_SIZE, + help="Footer font size (pt).", + ) + ap.add_argument( + "--pagenum-size", + type=float, + default=DEFAULT_PAGENUM_SIZE, + help="Page number font size (pt).", + ) + ap.add_argument( + "--line-spacing", type=float, default=1.2, help="Footer line spacing multiplier." + ) + ap.add_argument("input_pdf", type=Path) + ap.add_argument("output_pdf", type=Path) + ap.add_argument( + "text_block", + nargs="?", + default="", + help="Footer text for FIRST page only. Use \\n for newlines. Use for inline italics.", + ) + args = ap.parse_args() + + # normalize literal "\n" + args.text_block = args.text_block.replace("\\n", "\n") + + process( + args.input_pdf, + args.output_pdf, + args.text_block, + args.page_number, + args.bottom_margin, + args.footer_size, + args.pagenum_size, + args.line_spacing, + ) + + +if __name__ == "__main__": + main() diff --git a/bin/add_revision.py b/bin/add_revision.py index 8d15cf554e..c707f79d1f 100755 --- a/bin/add_revision.py +++ b/bin/add_revision.py @@ -1,7 +1,7 @@ #! 
/usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright 2019 Matt Post +# Copyright 2019–2025 Matt Post # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -41,6 +41,7 @@ import shutil import sys import tempfile +import io from git.repo.base import Repo @@ -58,6 +59,66 @@ import lxml.etree as ET from datetime import datetime +from pypdf import PdfReader, PdfWriter +from reportlab.pdfgen import canvas + +WATERMARK_FONT = "Times-Roman" +WATERMARK_SIZE = 16 +WATERMARK_LEFT_OFFSET_PT = ( + 27 # distance from left edge in points (50% increase for margin) +) +WATERMARK_GRAY = 0.55 # medium gray like arXiv + + +def _make_vertical_watermark_page(w, h, text): + """Return a single-page PDF with vertical (rotated 90° CCW) watermark at left.""" + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + c.saveState() + c.setFont(WATERMARK_FONT, WATERMARK_SIZE) + c.setFillGray(WATERMARK_GRAY) + # Translate slightly from left then rotate so text reads bottom-to-top along left side. + c.translate(WATERMARK_LEFT_OFFSET_PT, 0) + c.rotate(90) + text_w = c.stringWidth(text, WATERMARK_FONT, WATERMARK_SIZE) + # Center along original page height (which becomes horizontal span after rotation) + x_draw = (h - text_w) / 2.0 + y_draw = 0 + c.drawString(x_draw, y_draw, text) + c.restoreState() + c.showPage() + c.save() + buf.seek(0) + return buf + + +def add_revision_watermark(pdf_path, anth_id, revno, date): + """Return path to temp PDF with watermark added to first page (revisions only).""" + reader = PdfReader(pdf_path) + if not reader.pages: + return pdf_path + writer = PdfWriter() + first = reader.pages[0] + w = float(first.mediabox.width) + h = float(first.mediabox.height) + # Format date as DD-Mon-YYYY (e.g., 17-Sep-2025) for watermark display only. 
+ try: + dt = datetime.strptime(date, "%Y-%m-%d") + display_date = dt.strftime("%d %b %Y") + except ValueError: + # If already in some unexpected format, just use original string. + display_date = date + text = f"ACL Anthology ID {anth_id} / revision {revno} / {display_date}" + overlay = PdfReader(_make_vertical_watermark_page(w, h, text)).pages[0] + first.merge_page(overlay) + writer.add_page(first) + for p in reader.pages[1:]: + writer.add_page(p) + fd, tmp_path = tempfile.mkstemp(suffix=".pdf") + os.close(fd) + with open(tmp_path, "wb") as out_f: + writer.write(out_f) + return tmp_path def validate_file_type(path): @@ -101,7 +162,7 @@ def maybe_copy(file_from, file_to): change_letter = "e" if change_type == "erratum" else "v" - checksum = compute_hash_from_file(pdf_path) + # checksum will be computed after potential watermark insertion # Files for old-style IDs are stored under anthology-files/pdf/P/P19/* # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/* @@ -130,6 +191,14 @@ def maybe_copy(file_from, file_to): for revision in revisions: revno = int(revision.attrib["id"]) + 1 + # Insert watermark for revisions before computing checksum / updating XML + watermarked_temp_path = None + if change_type == "revision": + watermarked_temp_path = add_revision_watermark(pdf_path, anth_id, revno, date) + pdf_path = watermarked_temp_path + + checksum = compute_hash_from_file(pdf_path) + if not dry_run: # Update the URL hash on the tag if change_type != "erratum": @@ -201,6 +270,17 @@ def maybe_copy(file_from, file_to): if change_type == "revision": maybe_copy(pdf_path, canonical_path) + # Cleanup temp watermarked file if created + if ( + 'watermarked_temp_path' in locals() + and watermarked_temp_path + and os.path.exists(watermarked_temp_path) + ): + try: + os.remove(watermarked_temp_path) + except OSError: + pass + def main(args): change_type = "erratum" if args.erratum else "revision" @@ -222,6 +302,7 @@ def main(args): args.explanation, 
change_type=change_type, dry_run=args.dry_run, + date=args.date, ) if args.path.startswith("http"): diff --git a/bin/requirements.txt b/bin/requirements.txt index 52d8c07fbd..f92f685098 100644 --- a/bin/requirements.txt +++ b/bin/requirements.txt @@ -20,6 +20,7 @@ pytest-cov python-slugify>=2.0 pytz PyYAML>=3.0 +reportlab requests ruff~=0.3.4 setuptools diff --git a/hugo/content/info/corrections.md b/hugo/content/info/corrections.md index 043fc5b850..7521209988 100644 --- a/hugo/content/info/corrections.md +++ b/hugo/content/info/corrections.md @@ -96,8 +96,8 @@ For requests to change paper *content* (either a revision or an erratum), again, This summary will be included in the Anthology. Its intended audience is users of the Anthology, and should therefore be written from a neutral, scientific perspective. - If the metadata also needs to change, please also follow the instructions in the previous section. -- If possible, when generating your revision, it would be good to add the proper proceedings header and footer stamps, as well as the correct page numbering. - Unfortunately, we cannot provide any assistance with this task, but [this template from the ACLPUB2 repo](https://github.com/rycolab/aclpub2/blob/main/aclpub2/templates/watermarked_pdf.tex) may be helpful for this. +- If possible, when generating your revision, it would be good to add the proper proceedings header and footer stamps to match the original. + A web service providing this feature can be found at [https://aclanthology.org/watermark.html](https://aclanthology.org/watermark.html) ([source code](https://github.com/acl-org/acl-anthology/tree/main/bin/add_footer.py)). For revisions, the brief summary should allow readers to find the changed parts, but need not be too detailed. Here are some good examples: @@ -112,6 +112,8 @@ Submissions not meeting these standards will be rejected, potentially without no A revision that changes the author list needs permission (see below). 
+Revisions will be marked with a front-page watermark, similar to the arXiv watermark. + #### Retractions To initiate a retraction, please communicate directly with the Anthology director. diff --git a/hugo/static/cgi-bin/watermark.cgi b/hugo/static/cgi-bin/watermark.cgi new file mode 100755 index 0000000000..7bb047bf57 --- /dev/null +++ b/hugo/static/cgi-bin/watermark.cgi @@ -0,0 +1,166 @@ +#!/opt/venv/watermark/bin/python3 +# -*- coding: utf-8 -*- +""" +watermark.cgi - On-the-fly PDF footer + page number service for the ACL Anthology. + +POST a multipart/form-data request with fields: + pdf (file, required) The input PDF + footer_text (text, optional) First-page footer block; use for italics; newlines allowed + page_start (int, optional) Starting page number (>=1) + +Returns: Modified PDF (application/pdf) with Content-Disposition: attachment. + +Security / Resource considerations: + * Rejects files > 30MB (quick limit; adjust as needed). + * Basic PDF validation (magic header %PDF-). + * Uses temporary files; they are deleted at end of request. + * No persistent storage. 
+ +Depends on: reportlab, pypdf (already required by bin/add_footer.py) +""" + +import cgi +import os +import sys +import tempfile +import subprocess +from pathlib import Path +import shutil +import urllib.parse +MAX_BYTES = 30 * 1024 * 1024 + +def http_error(status_code: int, message: str): + print(f"Status: {status_code} Bad Request" if status_code == 400 else f"Status: {status_code}") + print("Content-Type: text/plain; charset=utf-8") + print("X-Content-Type-Options: nosniff") + print() + print(message) + sys.exit(0) + +def find_add_footer() -> Path: + """Resolve location of add_footer.py relative to this CGI script.""" + here = Path(__file__).resolve() + homedir = Path(os.environ.get('HOME', '/home/anthologizer')) + # Walk up to locate bin/add_footer.py + for parent in [here.parent, *here.parents, homedir / "acl-anthology"]: + candidate = parent.parent / "bin" / "add_footer.py" if parent.name == 'cgi-bin' else parent / "bin" / "add_footer.py" + if candidate.exists(): + return candidate + # Fallback relative guess (3 levels up) + guess = here.parents[2] / 'bin' / 'add_footer.py' + return guess + +def main(): + # Debug flag: append ?debug=1 to request URL to get full stderr on failure. 
+ qs = os.environ.get('QUERY_STRING', '') + qparams = dict(urllib.parse.parse_qsl(qs, keep_blank_values=True)) if qs else {} + debug_mode = qparams.get('debug') in {'1','true','yes','on'} + method = os.environ.get('REQUEST_METHOD', 'GET').upper() + if method != 'POST': + http_error(405, 'Use POST with multipart/form-data.') + + try: + length = int(os.environ.get('CONTENT_LENGTH', '0')) + except ValueError: + length = 0 + if length <= 0: + http_error(400, 'Empty request body.') + if length > MAX_BYTES: + http_error(400, f'File too large (> {MAX_BYTES//1024//1024}MB).') + + form = cgi.FieldStorage() + + if 'pdf' not in form or not getattr(form['pdf'], 'file', None): + http_error(400, 'Missing PDF file.') + pdf_item = form['pdf'] + footer_text = form.getfirst('footer_text', '')[:10000] # cap length + page_start_raw = form.getfirst('page_start') + page_start = None + if page_start_raw: + try: + page_start = int(page_start_raw) + if page_start < 1: + raise ValueError + except ValueError: + http_error(400, 'Invalid page_start (must be positive integer).') + + # Write uploaded PDF to temp file + tmp_dir = tempfile.mkdtemp(prefix='wmk_') + input_pdf = Path(tmp_dir) / 'input.pdf' + output_pdf = Path(tmp_dir) / 'output.pdf' + + with open(input_pdf, 'wb') as f: + # stream copy to avoid loading entire file in memory + chunked = 0 + while True: + buf = pdf_item.file.read(64 * 1024) + if not buf: + break + chunked += len(buf) + if chunked > MAX_BYTES: + f.close() + http_error(400, 'File exceeded size limit during upload.') + f.write(buf) + + # Validate PDF magic + try: + with open(input_pdf, 'rb') as f: + head = f.read(8) + if b'%PDF-' not in head: + http_error(400, 'Uploaded file is not a PDF.') + except Exception: + http_error(400, 'Could not read uploaded PDF.') + + add_footer = find_add_footer() + if not add_footer.exists(): + http_error(500, 'Server configuration error: add_footer.py not found.') + + cmd = [sys.executable, str(add_footer)] + if page_start is not None: + cmd 
+= ['-p', str(page_start)] + # Convert embedded newlines are preserved; add_footer.py will handle them + cmd += [str(input_pdf), str(output_pdf), footer_text] + + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=40) + except subprocess.TimeoutExpired: + http_error(500, 'Processing timed out.') + + if proc.returncode != 0 or not output_pdf.exists(): + stderr = (proc.stderr or '').strip() + stdout = (proc.stdout or '').strip() + if debug_mode: + # Return full diagnostic (no truncation) for troubleshooting. + diag = [ + 'Status: processing failed', + 'Command: ' + ' '.join(cmd), + f'Return code: {proc.returncode}', + '--- STDERR ---', stderr or '', + '--- STDOUT ---', stdout or '' + ] + http_error(500, '\n'.join(diag)) + else: + merged = stderr or stdout or 'Unknown error' + http_error(500, f'Failed to process PDF. (Add ?debug=1 for details)\n{merged[:400]}') + + # Success: stream file + size = output_pdf.stat().st_size + # Some Apache configurations are picky about the "Status" header in CGI output. + # Emit only standard headers followed by a blank line, then raw PDF bytes. + sys.stdout.write('Content-Type: application/pdf\r\n') + sys.stdout.write('X-Content-Type-Options: nosniff\r\n') + sys.stdout.write(f'Content-Disposition: attachment; filename="watermarked.pdf"\r\n') + sys.stdout.write(f'Content-Length: {size}\r\n') + sys.stdout.write('\r\n') + sys.stdout.flush() + with open(output_pdf, 'rb') as f: + shutil.copyfileobj(f, sys.stdout.buffer) + + # Cleanup temp dir + try: + shutil.rmtree(tmp_dir) + except Exception: + pass + +if __name__ == '__main__': + main() diff --git a/hugo/static/watermark.html b/hugo/static/watermark.html new file mode 100644 index 0000000000..138aad0bd8 --- /dev/null +++ b/hugo/static/watermark.html @@ -0,0 +1,231 @@ + + + + + + ACL Anthology PDF Watermark / Footer Tool + + + +
ACL Anthology • PDF Footer/Watermark Tool
+
+

Add ACL-style footer & page numbers

+

Upload a PDF and optionally add page numbers plus a first-page footer block. Inline italics: wrap spans in <i>...</i>. Press Enter for a new footer line; lines are sent as literal \n sequences to the server.

+
+
+
+ +
+ Drop PDF here
+ (or click to browse) + +
No file selected
+
+
+
+ + +
+
+
+
+ + +
+
+ +
+ Use <i>italics</i>. Multi‑line footers are centered. Page numbers appear on every page. Footer sits above the number.
Maximum upload size: 25MB. +
+
+
+
+ + +
+
+ +

Nothing is permanently stored; processing uses in-memory buffers and temporary files, and the result is streamed back. Avoid uploading sensitive material.
Source script: add_footer.py.

+
+
+
© 2025 ACL Anthology. Tool generated dynamically.
+ + +