diff --git a/.gitignore b/.gitignore index 7cf5fb6c8..154aeed44 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,12 @@ docs/_site docs/.quarto docs/reference docs/objects.json +docs/user-guide-pdf-clean.qmd +docs/toc.html +docs/toc.pdf +docs/user-guide-with-toc.pdf +docs/basic_validation.yaml +docs/validation_config.yaml datasets/ /*.parquet diff --git a/Makefile b/Makefile index c27a4f257..86089f6b3 100644 --- a/Makefile +++ b/Makefile @@ -63,6 +63,17 @@ docs-build: && quartodoc build --verbose \ && quarto render +docs-pdf: ## Build PDF version of User Guide (HTML to PDF preserving graphics) + @echo "Preparing PDF document (stripping YAML from includes)..." + uv run python scripts/create_pdf_doc.py + @echo "Rendering User Guide to self-contained HTML..." + cd docs && uv run quarto render user-guide-pdf-clean.qmd --to html --output user-guide-pdf.html + @echo "Converting HTML to PDF with Chrome (preserves validation reports)..." + uv run python scripts/html_to_pdf.py docs/_site/user-guide-pdf.html docs/user-guide.pdf + @echo "Creating Table of Contents page with actual page numbers..." 
+ uv run python scripts/create_toc_pdf.py docs/user-guide.pdf + @echo "PDF available at docs/user-guide.pdf" + docs-llms: ## Generate llms.txt and llms-full.txt files for LLM consumption @uv run python scripts/generate_llms_txt.py diff --git a/docs/_quarto.yml b/docs/_quarto.yml index b088edc09..d771b9d45 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -3,6 +3,7 @@ project: post-render: scripts/post-render.py resources: - "assets/**" + - "user-guide.pdf" format: html: @@ -55,6 +56,9 @@ website: text: API Reference - href: blog/index.qmd text: Pointblog + - text: "User Guide (PDF)" + icon: file-pdf + href: user-guide.pdf right: - icon: discord href: https://discord.com/invite/YH7CybCNCQ diff --git a/docs/assets/pointblank_logo.png b/docs/assets/pointblank_logo.png new file mode 100644 index 000000000..120dfbdd7 Binary files /dev/null and b/docs/assets/pointblank_logo.png differ diff --git a/docs/print-styles.css b/docs/print-styles.css new file mode 100644 index 000000000..16561db69 --- /dev/null +++ b/docs/print-styles.css @@ -0,0 +1,217 @@ +/* Print-specific styles for PDF generation */ + +@media print { + /* Page settings - page numbers added via post-processing */ + @page { + size: letter landscape; + margin: 0.5in; + } + + /* Hide navigation and other UI elements for PDF (but keep TOC) */ + nav.navbar:not(#TOC), + .nav-footer, + #quarto-header, + .quarto-title-banner, + .quarto-title, + header.quarto-title-block, + .sidebar, + #quarto-sidebar, + .page-navigation { + display: none !important; + } + + /* Show and style the TOC for PDF */ + #TOC, + nav#TOC { + display: block !important; + page-break-after: always; + page-break-before: auto; + margin: 1in auto; + max-width: 8in; + padding: 1em; + } + + #TOC::before, + nav#TOC::before { + content: "Table of Contents"; + display: block; + font-size: 24pt; + font-weight: bold; + margin-bottom: 0.75em; + } + + #TOC ul, + nav#TOC ul { + list-style: none; + padding-left: 0; + } + + #TOC li, + nav#TOC li { + 
margin: 0.5em 0; + line-height: 1.4; + } + + #TOC ul ul, + nav#TOC ul ul { + padding-left: 1.5em; + font-size: 0.9em; + } + + #TOC a, + nav#TOC a { + text-decoration: none; + color: #333; + } + + #TOC a::after, + nav#TOC a::after { + content: leader('.') target-counter(attr(href), page); + } + + /* Manual Table of Contents */ + .manual-toc { + page-break-after: always; + page-break-inside: avoid; + padding: 2em; + max-width: 8in; + margin: 0 auto; + } + + .manual-toc h1 { + font-size: 24pt; + margin-bottom: 1.5em; + text-align: center; + } + + .manual-toc ol { + list-style: none; + padding: 0; + font-size: 14pt; + line-height: 2.5; + } + + .manual-toc li { + margin: 0.75em 0; + position: relative; + } + + .manual-toc a { + text-decoration: none; + color: #333; + } + + /* Style the title page */ + .title-page { + page-break-after: always; + page-break-inside: avoid; + text-align: center; + margin: 0; + padding: 0; + min-height: 8in; + } + + /* Page break utility */ + .page-break { + page-break-after: always; + height: 0; + margin: 0; + padding: 0; + } + + .title-page * { + page-break-before: avoid !important; + page-break-after: avoid !important; + } + + /* Page break helper */ + .page-break { + page-break-after: always; + height: 0; + margin: 0; + padding: 0; + } + + /* Avoid page breaks inside important elements */ + .validation-report, + pre, + code, + img { + page-break-inside: avoid; + } + + /* Ensure links are visible */ + a[href]:after { + content: none !important; + } + + /* Optimize table rendering - reduce font size for wide tables */ + table { + width: 100%; + border-collapse: collapse; + font-size: 9pt; + page-break-inside: avoid; + page-break-before: auto; + page-break-after: auto; + } + + table th, table td { + padding: 4px 6px; + font-size: 8pt; + page-break-inside: avoid; + } + + /* Make validation tables more compact */ + .validation-report table { + font-size: 7pt; + } + + .validation-report table th, + .validation-report table td { + padding: 2px 
4px; + } + + /* Ensure table containers don't break */ + .cell-output, + .cell-output-display, + div:has(> table) { + page-break-inside: avoid; + } + + /* Ensure code blocks fit */ + pre code { + font-size: 8pt; + white-space: pre-wrap; + word-wrap: break-word; + } + + /* Better header spacing and page breaks */ + h1 { + page-break-before: always; + page-break-after: avoid; + page-break-inside: avoid; + margin-top: 0; + } + + /* Don't break page before the first h1 */ + body > h1:first-of-type, + main > h1:first-of-type, + #quarto-content > h1:first-of-type { + page-break-before: avoid; + } + + h2, h3, h4, h5, h6 { + page-break-after: avoid; + page-break-inside: avoid; + } + + /* Ensure images fit on page */ + img { + max-width: 100%; + height: auto; + } +} + +@media screen { + /* Screen-only: add print preview button styles if needed */ +} diff --git a/docs/user-guide-pdf.qmd b/docs/user-guide-pdf.qmd new file mode 100644 index 000000000..87fe3db84 --- /dev/null +++ b/docs/user-guide-pdf.qmd @@ -0,0 +1,97 @@ +--- +format: + html: + toc: true + toc-depth: 3 + number-sections: true + embed-resources: true + theme: flatly + css: + - styles.css + - print-styles.css + page-layout: full + self-contained: true +jupyter: python3 +--- + +```{python} +#| echo: false +#| output: false +import pointblank as pb +pb.config(report_incl_footer=False) +``` + +::: {.title-page} + +![](assets/pointblank_logo.png){width=400px style="display: block; margin: 0 auto; margin-top: 2in; margin-bottom: 0.75in;"} + +

+Data validation toolkit for assessing and monitoring data quality. +

+ +

+© 2024–2025 Posit Software, PBC +

+ +::: + +# Validation Plan + +{{< include user-guide/validation-overview.qmd >}} + +{{< include user-guide/validation-methods.qmd >}} + +{{< include user-guide/column-selection-patterns.qmd >}} + +{{< include user-guide/preprocessing.qmd >}} + +{{< include user-guide/segmentation.qmd >}} + +{{< include user-guide/thresholds.qmd >}} + +{{< include user-guide/actions.qmd >}} + +{{< include user-guide/briefs.qmd >}} + +# Advanced Validation + +{{< include user-guide/expressions.qmd >}} + +{{< include user-guide/schema-validation.qmd >}} + +{{< include user-guide/assertions.qmd >}} + +{{< include user-guide/draft-validation.qmd >}} + +# YAML + +{{< include user-guide/yaml-validation-workflows.qmd >}} + +{{< include user-guide/yaml-reference.qmd >}} + +# Post Interrogation + +{{< include user-guide/validation-reports.qmd >}} + +{{< include user-guide/step-reports.qmd >}} + +{{< include user-guide/extracts.qmd >}} + +{{< include user-guide/sundering.qmd >}} + +# Data Inspection + +{{< include user-guide/preview.qmd >}} + +{{< include user-guide/col-summary-tbl.qmd >}} + +{{< include user-guide/missing-vals-tbl.qmd >}} + +# The Pointblank CLI + +{{< include user-guide/cli-data-inspection.qmd >}} + +{{< include user-guide/cli-data-validation.qmd >}} + +{{< include user-guide/cli-reference.qmd >}} + diff --git a/docs/user-guide.pdf b/docs/user-guide.pdf new file mode 100644 index 000000000..b201b32b7 Binary files /dev/null and b/docs/user-guide.pdf differ diff --git a/scripts/create_pdf_doc.py b/scripts/create_pdf_doc.py new file mode 100644 index 000000000..5e324d466 --- /dev/null +++ b/scripts/create_pdf_doc.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 +""" +Create a version of user-guide-pdf.qmd that includes content without YAML front matter. +""" + +import re +from pathlib import Path + + +def strip_yaml_frontmatter(content: str) -> str: + """Remove YAML front matter from content.""" + # Match YAML front matter (--- at start, content, --- or ... 
at end) + pattern = r"\A---\s*\n.*?\n(?:---|\.\.\.)[ \t]*\n" + return re.sub(pattern, "", content, count=1, flags=re.DOTALL) + + +def process_include(include_line: str, base_path: Path) -> str: + """Return the included file's text without its YAML front matter (or the line unchanged if it cannot be resolved).""" + # Extract file path from {{< include path >}} + match = re.search(r"\{\{<\s*include\s+([^\s>]+)\s*>\}\}", include_line) + if not match: + return include_line + + file_path = base_path / match.group(1) + + if not file_path.exists(): + print(f"Warning: {file_path} not found") + return include_line + + # Read as UTF-8 (not the platform default) and strip the include's own YAML header + content = file_path.read_text(encoding="utf-8") + content = strip_yaml_frontmatter(content) + + return content + + +def create_pdf_version(): + """Create user-guide-pdf-clean.qmd with YAML stripped from includes; exits with status 1 if the source is missing.""" + docs_dir = Path(__file__).parent.parent / "docs" + source_file = docs_dir / "user-guide-pdf.qmd" + output_file = docs_dir / "user-guide-pdf-clean.qmd" + + if not source_file.exists(): + print(f"Error: {source_file} not found") + raise SystemExit(1) + + content = source_file.read_text(encoding="utf-8") + lines = content.split("\n") + + output_lines = [] + in_frontmatter = False + frontmatter_done = False + + for i, line in enumerate(lines): + # Keep the main document's YAML front matter + if i == 0 and line.strip() == "---": + in_frontmatter = True + output_lines.append(line) + continue + + if in_frontmatter: + output_lines.append(line) + if line.strip() in ["---", "..."]: + in_frontmatter = False + frontmatter_done = True + continue + + # Process include directives + if "{{< include" in line: + processed = process_include(line, docs_dir) + output_lines.append(processed) + else: + output_lines.append(line) + + output_file.write_text("\n".join(output_lines), encoding="utf-8") + print(f"Created {output_file}") + print(f"Size: {output_file.stat().st_size / 1024:.1f} KB") + + +if __name__ == "__main__": + create_pdf_version() diff --git a/scripts/create_toc_pdf.py b/scripts/create_toc_pdf.py new file mode 100644 index 
000000000..6cf395702 --- /dev/null +++ b/scripts/create_toc_pdf.py @@ -0,0 +1,503 @@ +#!/usr/bin/env python3 +""" +Extract page numbers from PDF and create a Table of Contents page. +This script analyzes the generated PDF to find actual page numbers for each section. +""" + +import subprocess +import sys +from pathlib import Path + + +def find_section_pages_from_pdf(pdf_path: Path) -> dict[str, int]: + """ + Extract actual page numbers by analyzing the PDF content using PyPDF2. + Looks for the section headings in the PDF text and returns {section_id: 1-indexed page}. + """ + try: + from PyPDF2 import PdfReader + except ImportError: + print("Installing PyPDF2...") + subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"], check=True) + from PyPDF2 import PdfReader + + reader = PdfReader(str(pdf_path)) + total_pages = len(reader.pages) + + section_pages = {} + # Search patterns - these match the actual H1 sections in the user guide + sections_to_find = [ + ("1 Validation Plan", "validation-plan"), + ("2 Advanced Validation", "advanced-validation"), + ("3 YAML", "yaml"), + ("4 Post Interrogation", "post-interrogation"), + ("5 Data Inspection", "data-inspection"), + ("6 The Pointblank CLI", "pointblank-cli"), + ] + + print(f"Scanning {total_pages} pages for section headings...") + + for page_num in range(total_pages): + try: + page = reader.pages[page_num] + page_text = page.extract_text() + + # Check if any section heading appears on this page + for section_heading, section_id in sections_to_find: + # Plain substring test on the page text; the first page that matches wins + if section_heading in page_text and section_id not in section_pages: + # Page numbers are 1-indexed for display + section_pages[section_id] = page_num + 1 + print(f"Found '{section_heading}' on page {page_num + 1}") + except Exception as e: + print(f"Warning: Could not extract text from page {page_num + 1}: {e}") + continue + + return section_pages + + +def create_toc_html(section_pages: dict[str, int], output_path: Path) -> None: + 
"""Create a standalone TOC HTML page.""" + + sections = [ + ("validation-plan", "1", "Validation Plan"), + ("advanced-validation", "2", "Advanced Validation"), + ("yaml", "3", "YAML"), + ("post-interrogation", "4", "Post Interrogation"), + ("data-inspection", "5", "Data Inspection"), + ("pointblank-cli", "6", "The Pointblank CLI"), + ] + + toc_entries = [] + for section_id, num, title in sections: + # Get the page number from the original PDF + original_page = section_pages.get(section_id, "...") + + # Adjust for display: original page N becomes display page N-1 + # (subtract 1 for title page; TOC is inserted separately and blank page exists) + if original_page != "...": + display_page = original_page - 1 + # Store the original page for link destination (will be offset by 1 after TOC insertion) + link_page = original_page + else: + display_page = "..." + link_page = None + + # Create clickable link if we have a valid page number + if link_page: + toc_entries.append(f""" + +
+
{num}. {title}
+
+
{display_page}
+
+
+ """) + else: + toc_entries.append(f""" +
+
{num}. {title}
+
+
{display_page}
+
+ """) + + html = f""" + + + + + + +

Table of Contents

+ {"".join(toc_entries)} + +""" + + output_path.write_text(html) + print(f"Created TOC HTML at {output_path}") + + +def html_to_pdf(html_path: Path, pdf_path: Path) -> None: + """Convert HTML to PDF using Chrome headless.""" + chrome_paths = [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Chromium.app/Contents/MacOS/Chromium", + "google-chrome", + "chromium", + "chromium-browser", + ] + + chrome = None + for path in chrome_paths: + try: + result = subprocess.run([path, "--version"], capture_output=True, check=True) + chrome = path + break + except (subprocess.CalledProcessError, FileNotFoundError): + continue + + if not chrome: + print("Error: Chrome/Chromium not found") + sys.exit(1) + + subprocess.run( + [ + chrome, + "--headless=new", + "--disable-gpu", + "--no-pdf-header-footer", + f"--print-to-pdf={pdf_path}", + f"file://{html_path.absolute()}", + ], + check=True, + ) + + print(f"Created TOC PDF at {pdf_path}") + + +def add_links_to_toc_pdf(toc_pdf_path: Path, section_pages: dict[str, int]) -> None: + """Add clickable link annotations to the TOC PDF that point to section pages.""" + try: + from PyPDF2 import PdfReader, PdfWriter + from PyPDF2.generic import ( + ArrayObject, + DictionaryObject, + FloatObject, + NameObject, + NumberObject, + ) + except ImportError: + print("Installing PyPDF2...") + subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"], check=True) + from PyPDF2 import PdfReader, PdfWriter + from PyPDF2.generic import ( + ArrayObject, + DictionaryObject, + FloatObject, + NameObject, + NumberObject, + ) + + reader = PdfReader(str(toc_pdf_path)) + writer = PdfWriter() + + # TOC has only one page + page = reader.pages[0] + page_width = float(page.mediabox.width) + page_height = float(page.mediabox.height) + + # Define sections with their target pages and vertical positions + # PDF coordinates are from BOTTOM-left corner + # Landscape letter: width=792, height=612 points + # Each entry is roughly 25 
points tall (14pt * 1.8 line-height) + # Listed in visual order (top to bottom) with Y coordinates from bottom + sections = [ + ("validation-plan", 325), # Top entry in TOC (highest Y from bottom) + ("advanced-validation", 300), + ("yaml", 275), + ("post-interrogation", 250), + ("data-inspection", 225), + ("pointblank-cli", 200), # Bottom entry in TOC (lowest Y from bottom) + ] + + # Add link annotations for each TOC entry + annotations = [] + for section_id, y_from_bottom in sections: + if section_id in section_pages: + # Target page in the final merged PDF (accounting for title + TOC pages) + target_page = section_pages[section_id] # This is the page index in final PDF + + # Create link annotation + # The rectangle defines the clickable area (left, bottom, right, top) + # Make the entire TOC line clickable (height of ~25 points) + link_rect = ArrayObject( + [ + FloatObject(180), # Left - start after left margin/padding + FloatObject(y_from_bottom), # Bottom + FloatObject(page_width - 180), # Right - end before right margin/padding + FloatObject(y_from_bottom + 25), # Top (25 points tall) + ] + ) + + # Create the link annotation dictionary + link_dict = DictionaryObject() + link_dict.update( + { + NameObject("/Type"): NameObject("/Annot"), + NameObject("/Subtype"): NameObject("/Link"), + NameObject("/Rect"): link_rect, + NameObject("/Border"): ArrayObject( + [NumberObject(0), NumberObject(0), NumberObject(0)] + ), + NameObject("/A"): DictionaryObject( + { + NameObject("/S"): NameObject("/GoTo"), + NameObject("/D"): ArrayObject( + [ + NumberObject(target_page), # Page index + NameObject("/XYZ"), # Keep current zoom + NumberObject(0), # X position (left) + NumberObject(page_height), # Y position (top) + NumberObject(0), # Zoom (0 = keep current) + ] + ), + } + ), + } + ) + + annotations.append(link_dict) + + # Add all annotations to the page + if "/Annots" in page: + # Extend existing annotations + for annot in annotations: + 
page["/Annots"].append(writer._add_object(annot)) + else: + # Create new annotations array + page[NameObject("/Annots")] = ArrayObject( + [writer._add_object(annot) for annot in annotations] + ) + + writer.add_page(page) + + # Write back to file + with open(toc_pdf_path, "wb") as f: + writer.write(f) + + print(f"Added {len(annotations)} clickable links to TOC") + + +def add_page_numbers_to_pdf(pdf_path: Path) -> None: + """Add page numbers to the bottom center of each page (except title page).""" + try: + import io + + from PyPDF2 import PdfReader, PdfWriter + from reportlab.lib.pagesizes import landscape, letter + from reportlab.pdfgen import canvas + except ImportError: + print("Installing required packages...") + subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2", "reportlab"], check=True) + import io + + from PyPDF2 import PdfReader, PdfWriter + from reportlab.lib.pagesizes import landscape, letter + from reportlab.pdfgen import canvas + + reader = PdfReader(str(pdf_path)) + writer = PdfWriter() + + page_width, page_height = landscape(letter) + + for page_num, page in enumerate(reader.pages): + # Skip title page (page 0) and TOC (page 1) + if page_num < 2: + writer.add_page(page) + continue + + # Create a new PDF with just the page number + packet = io.BytesIO() + can = canvas.Canvas(packet, pagesize=landscape(letter)) + + # Add page number at bottom center + # Page numbers start from 1 for the first content page (after title + TOC) + display_page_num = page_num - 1 # Subtract 1 because TOC is page 2 + can.setFont("Helvetica", 10) + can.setFillColorRGB(0.4, 0.4, 0.4) # Gray color + text_width = can.stringWidth(str(display_page_num), "Helvetica", 10) + can.drawString((page_width - text_width) / 2, 0.25 * 72, str(display_page_num)) + + can.save() + + # Move to the beginning of the BytesIO buffer + packet.seek(0) + overlay_pdf = PdfReader(packet) + + # Merge the overlay with the page + page.merge_page(overlay_pdf.pages[0]) + writer.add_page(page) + + 
# Write to temporary file then replace original + temp_path = pdf_path.parent / f"{pdf_path.stem}_temp.pdf" + with open(temp_path, "wb") as f: + writer.write(f) + + import shutil + + shutil.move(str(temp_path), str(pdf_path)) + print(f"Added page numbers to {len(reader.pages) - 2} pages") + + +def merge_pdfs( + title_pdf: Path, toc_pdf: Path, main_pdf: Path, output_pdf: Path, section_pages: dict[str, int] +) -> None: + """Merge title page, TOC, and main content PDFs using PyPDF2 and add bookmarks.""" + try: + from PyPDF2 import PdfReader, PdfWriter + except ImportError: + print("Installing PyPDF2...") + subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"], check=True) + from PyPDF2 import PdfReader, PdfWriter + + writer = PdfWriter() + main_reader = PdfReader(str(main_pdf)) + toc_reader = PdfReader(str(toc_pdf)) + + # Add title page (first page only) + writer.add_page(main_reader.pages[0]) + + # Add TOC page + toc_page_idx = 1 + for page in toc_reader.pages: + writer.add_page(page) + + # Add rest of main content (skip title page) + # Pages are now offset by 1 (because we inserted TOC) + for i in range(1, len(main_reader.pages)): + writer.add_page(main_reader.pages[i]) + + # Add bookmarks for each section + sections = [ + ("validation-plan", "Validation Plan"), + ("advanced-validation", "Advanced Validation"), + ("yaml", "YAML"), + ("post-interrogation", "Post Interrogation"), + ("data-inspection", "Data Inspection"), + ("pointblank-cli", "The Pointblank CLI"), + ] + + for section_id, title in sections: + if section_id in section_pages: + # Page indices need to account for title + TOC pages + # Original page N is now at index N+1 (because TOC was inserted) + page_idx = section_pages[section_id] # This is the page in the final merged PDF + writer.add_outline_item(title, page_idx, parent=None) + + with open(output_pdf, "wb") as f: + writer.write(f) + + print(f"Merged PDF created at {output_pdf}") + print(f"Added {len(section_pages)} bookmarks") + + +def 
main(): + if len(sys.argv) < 2: + print("Usage: python create_toc_pdf.py PDF_FILE") + sys.exit(1) + + main_pdf = Path(sys.argv[1]) + + if not main_pdf.exists(): + print(f"Error: {main_pdf} not found") + sys.exit(1) + + print(f"Analyzing {main_pdf} for section page numbers...") + section_pages = find_section_pages_from_pdf(main_pdf) + + if not section_pages: + # Leave the mapping empty instead of filling it with "..." strings: + # create_toc_html() already renders "..." for any missing section id, + # while add_links_to_toc_pdf() and merge_pdfs() treat every value they + # find as an integer page index and simply skip ids that are absent. + # A string placeholder would therefore end up as a link destination + # and bookmark target; an empty dict yields a TOC with "..." entries, + # no link annotations and no bookmarks. + print("Warning: No sections found. Using placeholder page numbers.") + section_pages = {} + + # Create TOC HTML + toc_html = main_pdf.parent / "toc.html" + create_toc_html(section_pages, toc_html) + + # Convert TOC to PDF + toc_pdf = main_pdf.parent / "toc.pdf" + html_to_pdf(toc_html, toc_pdf) + + # Add clickable links to TOC PDF (before merging) + print("Adding clickable links to TOC...") + add_links_to_toc_pdf(toc_pdf, section_pages) + + # Create final merged PDF with bookmarks + output_pdf = main_pdf.parent / "user-guide-with-toc.pdf" + merge_pdfs(main_pdf, toc_pdf, main_pdf, output_pdf, section_pages) + + # Replace original + import shutil + + shutil.move(str(output_pdf), str(main_pdf)) + + # Add page numbers + print("Adding page numbers to PDF...") + add_page_numbers_to_pdf(main_pdf) + + # Clean up + toc_html.unlink(missing_ok=True) + toc_pdf.unlink(missing_ok=True) + + print(f"\nFinal PDF with TOC and page numbers: {main_pdf}") + print(f"Size: {main_pdf.stat().st_size / (1024 * 1024):.1f} MB") + + +if __name__ == "__main__": + main() diff --git a/scripts/html_to_pdf.py b/scripts/html_to_pdf.py new file mode 100644 index 000000000..5376ebb20 --- /dev/null +++ b/scripts/html_to_pdf.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +""" +Convert HTML file to PDF using headless Chrome/Chromium with custom print CSS. +This preserves all HTML content including validation reports with proper text selection. 
+""" + +import subprocess +import sys +from pathlib import Path + + +def html_to_pdf_chrome(html_path: str, pdf_path: str): + """Convert HTML to PDF using Chrome/Chromium headless; exits with status 1 on any failure.""" + html_path = Path(html_path).resolve() + pdf_path = Path(pdf_path).resolve() + + if not html_path.exists(): + print(f"Error: HTML file not found: {html_path}") + sys.exit(1) + + print(f"Converting {html_path} to {pdf_path}...") + + # Try to find Chrome executable + chrome_paths = [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/usr/bin/google-chrome", + "/usr/bin/chromium", + "/usr/bin/chromium-browser", + ] + + chrome_cmd = None + for path in chrome_paths: + if Path(path).exists(): + chrome_cmd = path + break + + if not chrome_cmd: + print("Error: Chrome/Chromium not found.") + print("Please install Google Chrome or Chromium.") + sys.exit(1) + + # Run Chrome headless to print to PDF + result = subprocess.run( + [ + chrome_cmd, + "--headless=new", + "--disable-gpu", + "--no-pdf-header-footer", + f"--print-to-pdf={pdf_path}", + f"file://{html_path}", + ], + capture_output=True, + text=True, + ) + + if result.returncode != 0: + print(f"Error converting to PDF: {result.stderr}") + sys.exit(1) + + if pdf_path.exists(): + print(f"PDF generated successfully: {pdf_path}") + print(f"Size: {pdf_path.stat().st_size / (1024 * 1024):.1f} MB") + else: + print("Error: PDF was not created") + sys.exit(1) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: html_to_pdf.py INPUT_HTML OUTPUT_PDF") + sys.exit(1) + + html_to_pdf_chrome(sys.argv[1], sys.argv[2])