From 6648848a1fa3da9bac7d3479ffbbe146d8d6d8e8 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 09:08:46 -0400 Subject: [PATCH 1/7] Add script to add footer --- bin/add_footer.py | 157 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100755 bin/add_footer.py diff --git a/bin/add_footer.py b/bin/add_footer.py new file mode 100755 index 0000000000..106e3d9831 --- /dev/null +++ b/bin/add_footer.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Add ACL-like footer (first page) and optional page numbers (all pages). +Inline italics with . + +Examples: + python add_footer.py in.pdf out.pdf \ + "Proceedings … pages 8697–8727\nJuly 27 - August 1, 2025 ©2025 ACL" + python add_footer.py -p 199 in.pdf out.pdf "…" + python add_footer.py -p 199 --footer-size 9 --pagenum-size 10 --bottom-margin 14 in.pdf out.pdf "…" + +Copyright 2025, Matt Post +""" + + +import io, re, argparse +from pathlib import Path +from pypdf import PdfReader, PdfWriter +from reportlab.pdfgen import canvas + +# Defaults tuned for ACL footer look +DEFAULT_BOTTOM_MARGIN_PT = 14 +DEFAULT_LINE_SPACING = 1.2 +DEFAULT_FOOTER_SIZE = 9 # footer text size +DEFAULT_PAGENUM_SIZE = 11 # page number size + +FONT_REG = "Times-Roman" +FONT_ITAL = "Times-Italic" + +TAG_RE = re.compile(r"()") + +def parse_inline_italics(s): + """Yield (text, is_italic) spans from a string with regions.""" + parts = TAG_RE.split(s) + italic = False + for tok in parts: + if tok == "": + italic = True + elif tok == "": + italic = False + elif tok: + yield tok, italic + +def measure_line(c, line, size): + """Total width of a mixed-style line.""" + w = 0.0 + for txt, it in parse_inline_italics(line): + font = FONT_ITAL if it else FONT_REG + w += c.stringWidth(txt, font, size) + return w + +def draw_rich_centered(c, page_w, y, line, size): + """Draw a mixed-style line centered at y.""" + total_w = measure_line(c, line, size) + x = (page_w - total_w) / 2.0 + for txt, it in 
parse_inline_italics(line): + font = FONT_ITAL if it else FONT_REG + c.setFont(font, size) + c.drawString(x, y, txt) + x += c.stringWidth(txt, font, size) + +def mk_footer_overlay(w, h, text_block, bottom_margin, size, line_spacing): + """Footer block near bottom: render lines in given order, stacking downward.""" + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + lines = text_block.split("\n") if text_block else [] + if not lines: + c.showPage(); c.save(); buf.seek(0); return buf + + line_h = size * line_spacing + # Start y so that the FIRST line appears above subsequent lines, + # with the LAST line's baseline at bottom_margin. + y = bottom_margin + (len(lines) - 1) * line_h + for line in lines: + draw_rich_centered(c, w, y, line, size) + y -= line_h # next line goes BELOW + c.showPage(); c.save(); buf.seek(0) + return buf + +def mk_pagenum_overlay(w, h, page_num, bottom_margin, size): + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + c.setFont(FONT_REG, size) + text = str(page_num) + tw = c.stringWidth(text, FONT_REG, size) + x = (w - tw) / 2.0 + y = bottom_margin + c.drawString(x, y, text) + c.showPage(); c.save(); buf.seek(0) + return buf + + +def process(input_pdf, output_pdf, text_block, page_start, + bottom_margin, footer_size, pagenum_size, line_spacing): + reader = PdfReader(str(input_pdf)) + writer = PdfWriter() + + footer_cache, pnum_cache = {}, {} + + for idx, page in enumerate(reader.pages, start=1): + w = float(page.mediabox.width) + h = float(page.mediabox.height) + + disp_num = None if page_start is None else page_start + idx - 1 + + # Page number: SAME bottom margin on every page + if disp_num is not None: + nkey = (w, h, disp_num, pagenum_size, bottom_margin) + if nkey not in pnum_cache: + pnum_cache[nkey] = PdfReader( + mk_pagenum_overlay(w, h, disp_num, bottom_margin, pagenum_size) + ).pages[0] + page.merge_page(pnum_cache[nkey]) + + # Footer only on first page; place it ABOVE the fixed page number + if idx == 1 and 
text_block: + # raise footer so its LAST line sits above the page number by a small gap + gap = 0.6 * footer_size + footer_bottom = bottom_margin + pagenum_size + gap + fkey = (w, h, "footer", footer_size, footer_bottom, line_spacing, text_block) + if fkey not in footer_cache: + footer_cache[fkey] = PdfReader( + mk_footer_overlay(w, h, text_block, footer_bottom, footer_size, line_spacing) + ).pages[0] + page.merge_page(footer_cache[fkey]) + + writer.add_page(page) + + with open(output_pdf, "wb") as f: + writer.write(f) + + + +def main(): + ap = argparse.ArgumentParser(description="Add ACL-like footer (first page) and optional page numbers (all pages).") + ap.add_argument("--page-number", "-p", type=int, metavar="N", + help="Enable page numbers starting at N (e.g., -p 5).") + ap.add_argument("--bottom-margin", type=float, default=14, help="Baseline distance from bottom (pt).") + ap.add_argument("--footer-size", type=float, default=DEFAULT_FOOTER_SIZE, help="Footer font size (pt).") + ap.add_argument("--pagenum-size", type=float, default=DEFAULT_PAGENUM_SIZE, help="Page number font size (pt).") + ap.add_argument("--line-spacing", type=float, default=1.2, help="Footer line spacing multiplier.") + ap.add_argument("input_pdf", type=Path) + ap.add_argument("output_pdf", type=Path) + ap.add_argument("text_block", nargs="?", default="", help="Footer text for FIRST page only. Use \\n for newlines. 
Use for inline italics.") + args = ap.parse_args() + + # normalize literal "\n" + args.text_block = args.text_block.replace("\\n","\n") + + process( + args.input_pdf, args.output_pdf, args.text_block, args.page_number, + args.bottom_margin, args.footer_size, args.pagenum_size, args.line_spacing, + + ) + +if __name__ == "__main__": + main() \ No newline at end of file From 8ea1e4ffecea14ead021879edad9307d94677dd8 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 09:41:07 -0400 Subject: [PATCH 2/7] black --- bin/add_footer.py | 104 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 81 insertions(+), 23 deletions(-) diff --git a/bin/add_footer.py b/bin/add_footer.py index 106e3d9831..6b262e1e56 100755 --- a/bin/add_footer.py +++ b/bin/add_footer.py @@ -13,7 +13,9 @@ """ -import io, re, argparse +import io +import re +import argparse from pathlib import Path from pypdf import PdfReader, PdfWriter from reportlab.pdfgen import canvas @@ -21,14 +23,15 @@ # Defaults tuned for ACL footer look DEFAULT_BOTTOM_MARGIN_PT = 14 DEFAULT_LINE_SPACING = 1.2 -DEFAULT_FOOTER_SIZE = 9 # footer text size -DEFAULT_PAGENUM_SIZE = 11 # page number size +DEFAULT_FOOTER_SIZE = 9 # footer text size +DEFAULT_PAGENUM_SIZE = 11 # page number size FONT_REG = "Times-Roman" FONT_ITAL = "Times-Italic" TAG_RE = re.compile(r"()") + def parse_inline_italics(s): """Yield (text, is_italic) spans from a string with regions.""" parts = TAG_RE.split(s) @@ -41,6 +44,7 @@ def parse_inline_italics(s): elif tok: yield tok, italic + def measure_line(c, line, size): """Total width of a mixed-style line.""" w = 0.0 @@ -49,6 +53,7 @@ def measure_line(c, line, size): w += c.stringWidth(txt, font, size) return w + def draw_rich_centered(c, page_w, y, line, size): """Draw a mixed-style line centered at y.""" total_w = measure_line(c, line, size) @@ -59,13 +64,17 @@ def draw_rich_centered(c, page_w, y, line, size): c.drawString(x, y, txt) x += c.stringWidth(txt, font, size) + def 
mk_footer_overlay(w, h, text_block, bottom_margin, size, line_spacing): """Footer block near bottom: render lines in given order, stacking downward.""" buf = io.BytesIO() c = canvas.Canvas(buf, pagesize=(w, h)) lines = text_block.split("\n") if text_block else [] if not lines: - c.showPage(); c.save(); buf.seek(0); return buf + c.showPage() + c.save() + buf.seek(0) + return buf line_h = size * line_spacing # Start y so that the FIRST line appears above subsequent lines, @@ -74,9 +83,12 @@ def mk_footer_overlay(w, h, text_block, bottom_margin, size, line_spacing): for line in lines: draw_rich_centered(c, w, y, line, size) y -= line_h # next line goes BELOW - c.showPage(); c.save(); buf.seek(0) + c.showPage() + c.save() + buf.seek(0) return buf + def mk_pagenum_overlay(w, h, page_num, bottom_margin, size): buf = io.BytesIO() c = canvas.Canvas(buf, pagesize=(w, h)) @@ -86,12 +98,22 @@ def mk_pagenum_overlay(w, h, page_num, bottom_margin, size): x = (w - tw) / 2.0 y = bottom_margin c.drawString(x, y, text) - c.showPage(); c.save(); buf.seek(0) + c.showPage() + c.save() + buf.seek(0) return buf -def process(input_pdf, output_pdf, text_block, page_start, - bottom_margin, footer_size, pagenum_size, line_spacing): +def process( + input_pdf, + output_pdf, + text_block, + page_start, + bottom_margin, + footer_size, + pagenum_size, + line_spacing, +): reader = PdfReader(str(input_pdf)) writer = PdfWriter() @@ -120,7 +142,9 @@ def process(input_pdf, output_pdf, text_block, page_start, fkey = (w, h, "footer", footer_size, footer_bottom, line_spacing, text_block) if fkey not in footer_cache: footer_cache[fkey] = PdfReader( - mk_footer_overlay(w, h, text_block, footer_bottom, footer_size, line_spacing) + mk_footer_overlay( + w, h, text_block, footer_bottom, footer_size, line_spacing + ) ).pages[0] page.merge_page(footer_cache[fkey]) @@ -130,28 +154,62 @@ def process(input_pdf, output_pdf, text_block, page_start, writer.write(f) - def main(): - ap = 
argparse.ArgumentParser(description="Add ACL-like footer (first page) and optional page numbers (all pages).") - ap.add_argument("--page-number", "-p", type=int, metavar="N", - help="Enable page numbers starting at N (e.g., -p 5).") - ap.add_argument("--bottom-margin", type=float, default=14, help="Baseline distance from bottom (pt).") - ap.add_argument("--footer-size", type=float, default=DEFAULT_FOOTER_SIZE, help="Footer font size (pt).") - ap.add_argument("--pagenum-size", type=float, default=DEFAULT_PAGENUM_SIZE, help="Page number font size (pt).") - ap.add_argument("--line-spacing", type=float, default=1.2, help="Footer line spacing multiplier.") + ap = argparse.ArgumentParser( + description="Add ACL-like footer (first page) and optional page numbers (all pages)." + ) + ap.add_argument( + "--page-number", + "-p", + type=int, + metavar="N", + help="Enable page numbers starting at N (e.g., -p 5).", + ) + ap.add_argument( + "--bottom-margin", + type=float, + default=14, + help="Baseline distance from bottom (pt).", + ) + ap.add_argument( + "--footer-size", + type=float, + default=DEFAULT_FOOTER_SIZE, + help="Footer font size (pt).", + ) + ap.add_argument( + "--pagenum-size", + type=float, + default=DEFAULT_PAGENUM_SIZE, + help="Page number font size (pt).", + ) + ap.add_argument( + "--line-spacing", type=float, default=1.2, help="Footer line spacing multiplier." + ) ap.add_argument("input_pdf", type=Path) ap.add_argument("output_pdf", type=Path) - ap.add_argument("text_block", nargs="?", default="", help="Footer text for FIRST page only. Use \\n for newlines. Use for inline italics.") + ap.add_argument( + "text_block", + nargs="?", + default="", + help="Footer text for FIRST page only. Use \\n for newlines. 
Use for inline italics.", + ) args = ap.parse_args() # normalize literal "\n" - args.text_block = args.text_block.replace("\\n","\n") + args.text_block = args.text_block.replace("\\n", "\n") process( - args.input_pdf, args.output_pdf, args.text_block, args.page_number, - args.bottom_margin, args.footer_size, args.pagenum_size, args.line_spacing, - + args.input_pdf, + args.output_pdf, + args.text_block, + args.page_number, + args.bottom_margin, + args.footer_size, + args.pagenum_size, + args.line_spacing, ) + if __name__ == "__main__": - main() \ No newline at end of file + main() From 7d131fe1f22eaf0fa81ddde245d2f04d47819f48 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 09:51:29 -0400 Subject: [PATCH 3/7] Add watermark to revised papers --- bin/add_revision.py | 79 ++++++++++++++++++++++++++++++++++++++++++-- bin/requirements.txt | 1 + 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/bin/add_revision.py b/bin/add_revision.py index 8d15cf554e..9c07d116a2 100755 --- a/bin/add_revision.py +++ b/bin/add_revision.py @@ -1,7 +1,7 @@ #! /usr/bin/env python3 # -*- coding: utf-8 -*- # -# Copyright 2019 Matt Post +# Copyright 2019–2025 Matt Post # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -41,6 +41,7 @@ import shutil import sys import tempfile +import io from git.repo.base import Repo @@ -58,6 +59,64 @@ import lxml.etree as ET from datetime import datetime +from pypdf import PdfReader, PdfWriter +from reportlab.pdfgen import canvas + +WATERMARK_FONT = "Times-Roman" +WATERMARK_SIZE = 16 +WATERMARK_LEFT_OFFSET_PT = 27 # distance from left edge in points (50% increase for margin) +WATERMARK_GRAY = 0.55 # medium gray like arXiv + + +def _make_vertical_watermark_page(w, h, text): + """Return a single-page PDF with vertical (rotated 90° CCW) watermark at left.""" + buf = io.BytesIO() + c = canvas.Canvas(buf, pagesize=(w, h)) + c.saveState() + c.setFont(WATERMARK_FONT, WATERMARK_SIZE) + c.setFillGray(WATERMARK_GRAY) + # Translate slightly from left then rotate so text reads bottom-to-top along left side. + c.translate(WATERMARK_LEFT_OFFSET_PT, 0) + c.rotate(90) + text_w = c.stringWidth(text, WATERMARK_FONT, WATERMARK_SIZE) + # Center along original page height (which becomes horizontal span after rotation) + x_draw = (h - text_w) / 2.0 + y_draw = 0 + c.drawString(x_draw, y_draw, text) + c.restoreState() + c.showPage() + c.save() + buf.seek(0) + return buf + + +def add_revision_watermark(pdf_path, anth_id, revno, date): + """Return path to temp PDF with watermark added to first page (revisions only).""" + reader = PdfReader(pdf_path) + if not reader.pages: + return pdf_path + writer = PdfWriter() + first = reader.pages[0] + w = float(first.mediabox.width) + h = float(first.mediabox.height) + # Format date as "DD Mon YYYY" (e.g., 17 Sep 2025) for watermark display only. + try: + dt = datetime.strptime(date, "%Y-%m-%d") + display_date = dt.strftime("%d %b %Y") + except ValueError: + # If already in some unexpected format, just use original string. 
+ display_date = date + text = f"ACL Anthology ID {anth_id} / revision {revno} / {display_date}" + overlay = PdfReader(_make_vertical_watermark_page(w, h, text)).pages[0] + first.merge_page(overlay) + writer.add_page(first) + for p in reader.pages[1:]: + writer.add_page(p) + fd, tmp_path = tempfile.mkstemp(suffix=".pdf") + os.close(fd) + with open(tmp_path, "wb") as out_f: + writer.write(out_f) + return tmp_path def validate_file_type(path): @@ -101,7 +160,7 @@ def maybe_copy(file_from, file_to): change_letter = "e" if change_type == "erratum" else "v" - checksum = compute_hash_from_file(pdf_path) + # checksum will be computed after potential watermark insertion # Files for old-style IDs are stored under anthology-files/pdf/P/P19/* # Files for new-style IDs are stored under anthology-files/pdf/2020.acl/* @@ -130,6 +189,14 @@ def maybe_copy(file_from, file_to): for revision in revisions: revno = int(revision.attrib["id"]) + 1 + # Insert watermark for revisions before computing checksum / updating XML + watermarked_temp_path = None + if change_type == "revision": + watermarked_temp_path = add_revision_watermark(pdf_path, anth_id, revno, date) + pdf_path = watermarked_temp_path + + checksum = compute_hash_from_file(pdf_path) + if not dry_run: # Update the URL hash on the tag if change_type != "erratum": @@ -201,6 +268,13 @@ def maybe_copy(file_from, file_to): if change_type == "revision": maybe_copy(pdf_path, canonical_path) + # Cleanup temp watermarked file if created + if 'watermarked_temp_path' in locals() and watermarked_temp_path and os.path.exists(watermarked_temp_path): + try: + os.remove(watermarked_temp_path) + except OSError: + pass + def main(args): change_type = "erratum" if args.erratum else "revision" @@ -222,6 +296,7 @@ def main(args): args.explanation, change_type=change_type, dry_run=args.dry_run, + date=args.date, ) if args.path.startswith("http"): diff --git a/bin/requirements.txt b/bin/requirements.txt index 52d8c07fbd..f92f685098 100644 --- 
a/bin/requirements.txt +++ b/bin/requirements.txt @@ -20,6 +20,7 @@ pytest-cov python-slugify>=2.0 pytz PyYAML>=3.0 +reportlab requests ruff~=0.3.4 setuptools From ac597b628fac637b53344f174c961b34385756bf Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 09:52:05 -0400 Subject: [PATCH 4/7] black --- bin/add_revision.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bin/add_revision.py b/bin/add_revision.py index 9c07d116a2..c707f79d1f 100755 --- a/bin/add_revision.py +++ b/bin/add_revision.py @@ -64,7 +64,9 @@ WATERMARK_FONT = "Times-Roman" WATERMARK_SIZE = 16 -WATERMARK_LEFT_OFFSET_PT = 27 # distance from left edge in points (50% increase for margin) +WATERMARK_LEFT_OFFSET_PT = ( + 27 # distance from left edge in points (50% increase for margin) +) WATERMARK_GRAY = 0.55 # medium gray like arXiv @@ -269,7 +271,11 @@ def maybe_copy(file_from, file_to): maybe_copy(pdf_path, canonical_path) # Cleanup temp watermarked file if created - if 'watermarked_temp_path' in locals() and watermarked_temp_path and os.path.exists(watermarked_temp_path): + if ( + 'watermarked_temp_path' in locals() + and watermarked_temp_path + and os.path.exists(watermarked_temp_path) + ): try: os.remove(watermarked_temp_path) except OSError: From 9026a5165249ce4767e339cf360b46a0c902c348 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 12:45:00 -0400 Subject: [PATCH 5/7] Add and test watermark service --- README.md | 29 +++++ hugo/static/cgi-bin/watermark.cgi | 166 +++++++++++++++++++++++++++ hugo/static/watermark.html | 180 ++++++++++++++++++++++++++++++ 3 files changed, 375 insertions(+) create mode 100755 hugo/static/cgi-bin/watermark.cgi create mode 100644 hugo/static/watermark.html diff --git a/README.md b/README.md index abf4b3a69b..1d6e22ce1a 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,35 @@ You will need this software on the server Note that generating the anthology takes quite a bit of RAM, so make sure it is available 
on your machine. +## PDF Watermark / Footer Tool + +The Anthology includes a lightweight tool to add an ACL‑style footer (first page) and optional page numbers to arbitrary PDFs. + +Components: + +* `hugo/static/watermark.html` – Client interface (drag/drop PDF, footer text, starting page number). +* `hugo/static/cgi-bin/watermark.cgi` – CGI endpoint invoking `bin/add_footer.py`. +* `bin/add_footer.py` – Core logic (already part of the repository) supporting inline italics with `<i>...</i>` and multi‑line centered layout. + +Setup: +1. Create a Python 3.10 venv at `/opt/venv/watermark`. +2. Activate it and install dependencies: `pip install pypdf reportlab` + +Usage: +1. Serve the site (or just open the HTML file if CGI is reachable at `/cgi-bin/watermark.cgi`). +2. Browse to `https://aclanthology.org/watermark.html`. +3. Provide an optional starting page number and multi‑line footer block (press Enter for new lines). Inline italics via `<i>...</i>`. +4. Click “Generate PDF” to download the processed file (`*.watermarked.pdf`). + +Server / security notes: +* Upload limit: 25MB (client) / 30MB (server hard cap). +* Basic validation checks `%PDF-` header. +* Uses temporary directory per request; no persistence. +* Requires Python environment satisfying dependencies for `add_footer.py` (pypdf, reportlab). +* Errors returned as plain text with HTTP status codes. + +To disable, simply remove the HTML or CGI script; no other components are affected. + ## Contributing If you'd like to contribute to the ACL Anthology, please take a look at: diff --git a/hugo/static/cgi-bin/watermark.cgi b/hugo/static/cgi-bin/watermark.cgi new file mode 100755 index 0000000000..7bb047bf57 --- /dev/null +++ b/hugo/static/cgi-bin/watermark.cgi @@ -0,0 +1,166 @@ +#!/opt/venv/watermark/bin/python3 +# -*- coding: utf-8 -*- +""" +watermark.cgi - On-the-fly PDF footer + page number service for the ACL Anthology. 
+ +POST a multipart/form-data request with fields: + pdf (file, required) The input PDF + footer_text (text, optional) First-page footer block; use for italics; newlines allowed + page_start (int, optional) Starting page number (>=1) + +Returns: Modified PDF (application/pdf) with Content-Disposition: attachment. + +Security / Resource considerations: + * Rejects files > 30MB (quick limit; adjust as needed). + * Basic PDF validation (magic header %PDF-). + * Uses temporary files; they are deleted at end of request. + * No persistent storage. + +Depends on: reportlab, pypdf (already required by bin/add_footer.py) +""" + +import cgi +import os +import sys +import tempfile +import subprocess +from pathlib import Path +import shutil +import urllib.parse +MAX_BYTES = 30 * 1024 * 1024 + +def http_error(status_code: int, message: str): + print(f"Status: {status_code} Bad Request" if status_code == 400 else f"Status: {status_code}") + print("Content-Type: text/plain; charset=utf-8") + print("X-Content-Type-Options: nosniff") + print() + print(message) + sys.exit(0) + +def find_add_footer() -> Path: + """Resolve location of add_footer.py relative to this CGI script.""" + here = Path(__file__).resolve() + homedir = Path(os.environ.get('HOME', '/home/anthologizer')) + # Walk up to locate bin/add_footer.py + for parent in [here.parent, *here.parents, homedir / "acl-anthology"]: + candidate = parent.parent / "bin" / "add_footer.py" if parent.name == 'cgi-bin' else parent / "bin" / "add_footer.py" + if candidate.exists(): + return candidate + # Fallback relative guess (3 levels up) + guess = here.parents[2] / 'bin' / 'add_footer.py' + return guess + +def main(): + # Debug flag: append ?debug=1 to request URL to get full stderr on failure. 
+ qs = os.environ.get('QUERY_STRING', '') + qparams = dict(urllib.parse.parse_qsl(qs, keep_blank_values=True)) if qs else {} + debug_mode = qparams.get('debug') in {'1','true','yes','on'} + method = os.environ.get('REQUEST_METHOD', 'GET').upper() + if method != 'POST': + http_error(405, 'Use POST with multipart/form-data.') + + try: + length = int(os.environ.get('CONTENT_LENGTH', '0')) + except ValueError: + length = 0 + if length <= 0: + http_error(400, 'Empty request body.') + if length > MAX_BYTES: + http_error(400, f'File too large (> {MAX_BYTES//1024//1024}MB).') + + form = cgi.FieldStorage() + + if 'pdf' not in form or not getattr(form['pdf'], 'file', None): + http_error(400, 'Missing PDF file.') + pdf_item = form['pdf'] + footer_text = form.getfirst('footer_text', '')[:10000] # cap length + page_start_raw = form.getfirst('page_start') + page_start = None + if page_start_raw: + try: + page_start = int(page_start_raw) + if page_start < 1: + raise ValueError + except ValueError: + http_error(400, 'Invalid page_start (must be positive integer).') + + # Write uploaded PDF to temp file + tmp_dir = tempfile.mkdtemp(prefix='wmk_') + input_pdf = Path(tmp_dir) / 'input.pdf' + output_pdf = Path(tmp_dir) / 'output.pdf' + + with open(input_pdf, 'wb') as f: + # stream copy to avoid loading entire file in memory + chunked = 0 + while True: + buf = pdf_item.file.read(64 * 1024) + if not buf: + break + chunked += len(buf) + if chunked > MAX_BYTES: + f.close() + http_error(400, 'File exceeded size limit during upload.') + f.write(buf) + + # Validate PDF magic + try: + with open(input_pdf, 'rb') as f: + head = f.read(8) + if b'%PDF-' not in head: + http_error(400, 'Uploaded file is not a PDF.') + except Exception: + http_error(400, 'Could not read uploaded PDF.') + + add_footer = find_add_footer() + if not add_footer.exists(): + http_error(500, 'Server configuration error: add_footer.py not found.') + + cmd = [sys.executable, str(add_footer)] + if page_start is not None: + cmd 
+= ['-p', str(page_start)] + # Convert embedded newlines are preserved; add_footer.py will handle them + cmd += [str(input_pdf), str(output_pdf), footer_text] + + try: + proc = subprocess.run(cmd, capture_output=True, text=True, timeout=40) + except subprocess.TimeoutExpired: + http_error(500, 'Processing timed out.') + + if proc.returncode != 0 or not output_pdf.exists(): + stderr = (proc.stderr or '').strip() + stdout = (proc.stdout or '').strip() + if debug_mode: + # Return full diagnostic (no truncation) for troubleshooting. + diag = [ + 'Status: processing failed', + 'Command: ' + ' '.join(cmd), + f'Return code: {proc.returncode}', + '--- STDERR ---', stderr or '', + '--- STDOUT ---', stdout or '' + ] + http_error(500, '\n'.join(diag)) + else: + merged = stderr or stdout or 'Unknown error' + http_error(500, f'Failed to process PDF. (Add ?debug=1 for details)\n{merged[:400]}') + + # Success: stream file + size = output_pdf.stat().st_size + # Some Apache configurations are picky about the "Status" header in CGI output. + # Emit only standard headers followed by a blank line, then raw PDF bytes. + sys.stdout.write('Content-Type: application/pdf\r\n') + sys.stdout.write('X-Content-Type-Options: nosniff\r\n') + sys.stdout.write(f'Content-Disposition: attachment; filename="watermarked.pdf"\r\n') + sys.stdout.write(f'Content-Length: {size}\r\n') + sys.stdout.write('\r\n') + sys.stdout.flush() + with open(output_pdf, 'rb') as f: + shutil.copyfileobj(f, sys.stdout.buffer) + + # Cleanup temp dir + try: + shutil.rmtree(tmp_dir) + except Exception: + pass + +if __name__ == '__main__': + main() diff --git a/hugo/static/watermark.html b/hugo/static/watermark.html new file mode 100644 index 0000000000..a2fa55b04a --- /dev/null +++ b/hugo/static/watermark.html @@ -0,0 +1,180 @@ + + + + + + ACL Anthology PDF Watermark / Footer Tool + + + +
ACL Anthology • PDF Footer/Watermark Tool
+
+

Add ACL-style footer & page numbers

+

Upload a PDF and optionally add page numbers plus a first-page footer block. Inline italics: wrap spans in <i>...</i>. Separate footer lines with the Enter key.

+
+
+
+ +
+ Drop PDF here
+ (or click to browse) + +
No file selected
+
+
+
+ + +
+
+
+
+ + +
+
+ +
+ Use <i>italics</i>. Multi‑line footers are centered. Page numbers appear on every page. Footer sits above the number.
Maximum upload size: 25MB. +
+
+
+
+ + +
+
+

Nothing is permanently stored; processing happens in-memory / temporary files and the result is streamed back. Avoid uploading sensitive material.
Source script: add_footer.py.

+
+
+
© 2025 ACL Anthology. Tool generated dynamically.
+ + + From c58a0bbf6c2da6fdd6b4d45498f7de4f2dc90b7e Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 12:48:17 -0400 Subject: [PATCH 6/7] Update documentation --- hugo/content/info/corrections.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/hugo/content/info/corrections.md b/hugo/content/info/corrections.md index 043fc5b850..7521209988 100644 --- a/hugo/content/info/corrections.md +++ b/hugo/content/info/corrections.md @@ -96,8 +96,8 @@ For requests to change paper *content* (either a revision or an erratum), again, This summary will be included in the Anthology. Its intended audience is users of the Anthology, and should therefore be written from a neutral, scientific perspective. - If the metadata also needs to change, please also follow the instructions in the previous section. -- If possible, when generating your revision, it would be good to add the proper proceedings header and footer stamps, as well as the correct page numbering. - Unfortunately, we cannot provide any assistance with this task, but [this template from the ACLPUB2 repo](https://github.com/rycolab/aclpub2/blob/main/aclpub2/templates/watermarked_pdf.tex) may be helpful for this. +- If possible, when generating your revision, it would be good to add the proper proceedings header and footer stamps to match the original. + A web service providing this feature can be found at [https://aclanthology.org/watermark.html](https://aclanthology.org/watermark.html) ([source code](https://github.com/acl-org/acl-anthology/tree/main/bin/add_footer.py)). For revisions, the brief summary should allow readers to find the changed parts, but need not be too detailed. Here are some good examples: @@ -112,6 +112,8 @@ Submissions not meeting these standards will be rejected, potentially without no A revision that changes the author list needs permission (see below). +Revisions will be marked with a front-page watermark, similar to the arXiv watermark. 
+ #### Retractions To initiate a retraction, please communicate directly with the Anthology director. From 5083e2bfa954bce036e29f1328b631304fce6106 Mon Sep 17 00:00:00 2001 From: Matt Post Date: Thu, 18 Sep 2025 15:35:31 -0400 Subject: [PATCH 7/7] Fix newline issue --- hugo/static/watermark.html | 61 ++++++++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 5 deletions(-) diff --git a/hugo/static/watermark.html b/hugo/static/watermark.html index a2fa55b04a..138aad0bd8 100644 --- a/hugo/static/watermark.html +++ b/hugo/static/watermark.html @@ -56,7 +56,7 @@
ACL Anthology • PDF Footer/Watermark Tool

Add ACL-style footer & page numbers

-

Upload a PDF and optionally add page numbers plus a first-page footer block. Inline italics: wrap spans in <i>...</i>. Separate footer lines with the Enter key.

+

Upload a PDF and optionally add page numbers plus a first-page footer block. Inline italics: wrap spans in <i>...</i>. Press Enter for a new footer line; lines are sent as literal \n sequences to the server.

@@ -69,8 +69,8 @@

Add ACL-style footer & page numbers

- - + +
@@ -90,6 +90,11 @@

Add ACL-style footer & page numbers

+

Nothing is permanently stored; processing happens in-memory / temporary files and the result is streamed back. Avoid uploading sensitive material.
Source script: add_footer.py.

@@ -101,8 +106,12 @@

Add ACL-style footer & page numbers

const fileInput = document.getElementById('pdf'); const fileInfo = document.getElementById('fileInfo'); const status = document.getElementById('status'); - const submitBtn = document.getElementById('submitBtn'); + const submitBtn = document.getElementById('submitBtn'); const resetBtn = document.getElementById('resetBtn'); + const footerInput = document.getElementById('footerText'); + const preview = document.getElementById('footerPreview'); + const previewWrap = document.getElementById('previewWrap'); + const removedInfo = document.getElementById('removedInfo'); const MAX_SIZE = 25 * 1024 * 1024; // 25MB @@ -143,6 +152,42 @@

Add ACL-style footer & page numbers

fileInput.addEventListener('change', e => handleFiles(e.target.files)); resetBtn.addEventListener('click', () => { form.reset(); fileInfo.textContent='No file selected'; setStatus(''); }); + function sanitizeFooter(raw){ + if(!raw) return {clean:'', removed:0}; + let beforeLen = raw.length; + // Normalize line breaks + let s = raw.replace(/\r\n?/g,'\n'); + // Replace various Unicode space-ish chars with normal space + s = s.replace(/[\u00A0\u2000-\u200B\u202F\u205F\u3000]/g,' '); + // Remove zero-width joiners & directional marks + s = s.replace(/[\u200C\u200D\uFEFF\u202A-\u202E\u2066-\u2069]/g,''); + // Remove other control chars except tab/newline + s = s.replace(/[\x00-\x08\x0B-\x1F\x7F]/g,''); + // Collapse runs of spaces + s = s.replace(/ {2,}/g,' '); + // Trim each line's edges + s = s.split('\n').map(l=>l.trimEnd()).join('\n'); + // Remove leading/trailing blank lines + s = s.replace(/^\n+|\n+$/g,''); + let removed = beforeLen - s.length; + return {clean:s, removed}; + } + + function updatePreview(){ + const {clean, removed} = sanitizeFooter(footerInput.value); + if(clean || removed){ + previewWrap.style.display='block'; + preview.textContent = clean || '(empty)'; + removedInfo.textContent = removed ? `(${removed} hidden chars removed)` : ''; + } else { + previewWrap.style.display='none'; + removedInfo.textContent=''; + } + } + + footerInput.addEventListener('input', updatePreview); + footerInput.addEventListener('blur', updatePreview); + form.addEventListener('submit', async (e) => { e.preventDefault(); if(!fileInput.files.length){ setStatus('Choose a PDF first','error'); return; } @@ -151,7 +196,10 @@

Add ACL-style footer & page numbers

try { const fd = new FormData(); fd.append('pdf', fileInput.files[0]); - fd.append('footer_text', document.getElementById('footerText').value.trim()); + const {clean} = sanitizeFooter(footerInput.value); + // Encode actual newlines as literal \n so backend add_footer.py (already converts literal \n to real newline) can re-expand safely. + const encoded = clean.replace(/\n/g, '\\n'); + fd.append('footer_text', encoded); const pstart = document.getElementById('pageStart').value.trim(); if(pstart) fd.append('page_start', pstart); const resp = await fetch('/cgi-bin/watermark.cgi', { method:'POST', body: fd }); @@ -174,6 +222,9 @@

Add ACL-style footer & page numbers

submitBtn.disabled = false; submitBtn.textContent='Generate PDF'; } }); + + // Initialize preview if prefilled (unlikely) + updatePreview(); })();