diff --git a/.gitignore b/.gitignore index 7cf5fb6c8..154aeed44 100644 --- a/.gitignore +++ b/.gitignore @@ -119,6 +119,12 @@ docs/_site docs/.quarto docs/reference docs/objects.json +docs/user-guide-pdf-clean.qmd +docs/toc.html +docs/toc.pdf +docs/user-guilde-with-toc.pdf +docs/basic_validation.yaml +docs/validation_config.yaml datasets/ /*.parquet diff --git a/Makefile b/Makefile index c27a4f257..86089f6b3 100644 --- a/Makefile +++ b/Makefile @@ -63,6 +63,17 @@ docs-build: && quartodoc build --verbose \ && quarto render +docs-pdf: ## Build PDF version of User Guide (HTML to PDF preserving graphics) + @echo "Preparing PDF document (stripping YAML from includes)..." + uv run python scripts/create_pdf_doc.py + @echo "Rendering User Guide to self-contained HTML..." + cd docs && uv run quarto render user-guide-pdf-clean.qmd --to html --output user-guide-pdf.html + @echo "Converting HTML to PDF with Chrome (preserves validation reports)..." + uv run python scripts/html_to_pdf.py docs/_site/user-guide-pdf.html docs/user-guide.pdf + @echo "Creating Table of Contents page with actual page numbers..." 
+ uv run python scripts/create_toc_pdf.py docs/user-guide.pdf + @echo "PDF available at docs/user-guide.pdf" + docs-llms: ## Generate llms.txt and llms-full.txt files for LLM consumption @uv run python scripts/generate_llms_txt.py diff --git a/docs/_quarto.yml b/docs/_quarto.yml index b088edc09..d771b9d45 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -3,6 +3,7 @@ project: post-render: scripts/post-render.py resources: - "assets/**" + - "user-guide.pdf" format: html: @@ -55,6 +56,9 @@ website: text: API Reference - href: blog/index.qmd text: Pointblog + - text: "User Guide (PDF)" + icon: file-pdf + href: user-guide.pdf right: - icon: discord href: https://discord.com/invite/YH7CybCNCQ diff --git a/docs/assets/pointblank_logo.png b/docs/assets/pointblank_logo.png new file mode 100644 index 000000000..120dfbdd7 Binary files /dev/null and b/docs/assets/pointblank_logo.png differ diff --git a/docs/print-styles.css b/docs/print-styles.css new file mode 100644 index 000000000..16561db69 --- /dev/null +++ b/docs/print-styles.css @@ -0,0 +1,217 @@ +/* Print-specific styles for PDF generation */ + +@media print { + /* Page settings - page numbers added via post-processing */ + @page { + size: letter landscape; + margin: 0.5in; + } + + /* Hide navigation and other UI elements for PDF (but keep TOC) */ + nav.navbar:not(#TOC), + .nav-footer, + #quarto-header, + .quarto-title-banner, + .quarto-title, + header.quarto-title-block, + .sidebar, + #quarto-sidebar, + .page-navigation { + display: none !important; + } + + /* Show and style the TOC for PDF */ + #TOC, + nav#TOC { + display: block !important; + page-break-after: always; + page-break-before: auto; + margin: 1in auto; + max-width: 8in; + padding: 1em; + } + + #TOC::before, + nav#TOC::before { + content: "Table of Contents"; + display: block; + font-size: 24pt; + font-weight: bold; + margin-bottom: 0.75em; + } + + #TOC ul, + nav#TOC ul { + list-style: none; + padding-left: 0; + } + + #TOC li, + nav#TOC li { + 
margin: 0.5em 0; + line-height: 1.4; + } + + #TOC ul ul, + nav#TOC ul ul { + padding-left: 1.5em; + font-size: 0.9em; + } + + #TOC a, + nav#TOC a { + text-decoration: none; + color: #333; + } + + #TOC a::after, + nav#TOC a::after { + content: leader('.') target-counter(attr(href), page); + } + + /* Manual Table of Contents */ + .manual-toc { + page-break-after: always; + page-break-inside: avoid; + padding: 2em; + max-width: 8in; + margin: 0 auto; + } + + .manual-toc h1 { + font-size: 24pt; + margin-bottom: 1.5em; + text-align: center; + } + + .manual-toc ol { + list-style: none; + padding: 0; + font-size: 14pt; + line-height: 2.5; + } + + .manual-toc li { + margin: 0.75em 0; + position: relative; + } + + .manual-toc a { + text-decoration: none; + color: #333; + } + + /* Style the title page */ + .title-page { + page-break-after: always; + page-break-inside: avoid; + text-align: center; + margin: 0; + padding: 0; + min-height: 8in; + } + + /* Page break utility */ + .page-break { + page-break-after: always; + height: 0; + margin: 0; + padding: 0; + } + + .title-page * { + page-break-before: avoid !important; + page-break-after: avoid !important; + } + + /* Page break helper */ + .page-break { + page-break-after: always; + height: 0; + margin: 0; + padding: 0; + } + + /* Avoid page breaks inside important elements */ + .validation-report, + pre, + code, + img { + page-break-inside: avoid; + } + + /* Ensure links are visible */ + a[href]:after { + content: none !important; + } + + /* Optimize table rendering - reduce font size for wide tables */ + table { + width: 100%; + border-collapse: collapse; + font-size: 9pt; + page-break-inside: avoid; + page-break-before: auto; + page-break-after: auto; + } + + table th, table td { + padding: 4px 6px; + font-size: 8pt; + page-break-inside: avoid; + } + + /* Make validation tables more compact */ + .validation-report table { + font-size: 7pt; + } + + .validation-report table th, + .validation-report table td { + padding: 2px 
4px; + } + + /* Ensure table containers don't break */ + .cell-output, + .cell-output-display, + div:has(> table) { + page-break-inside: avoid; + } + + /* Ensure code blocks fit */ + pre code { + font-size: 8pt; + white-space: pre-wrap; + word-wrap: break-word; + } + + /* Better header spacing and page breaks */ + h1 { + page-break-before: always; + page-break-after: avoid; + page-break-inside: avoid; + margin-top: 0; + } + + /* Don't break page before the first h1 */ + body > h1:first-of-type, + main > h1:first-of-type, + #quarto-content > h1:first-of-type { + page-break-before: avoid; + } + + h2, h3, h4, h5, h6 { + page-break-after: avoid; + page-break-inside: avoid; + } + + /* Ensure images fit on page */ + img { + max-width: 100%; + height: auto; + } +} + +@media screen { + /* Screen-only: add print preview button styles if needed */ +} diff --git a/docs/user-guide-pdf.qmd b/docs/user-guide-pdf.qmd new file mode 100644 index 000000000..87fe3db84 --- /dev/null +++ b/docs/user-guide-pdf.qmd @@ -0,0 +1,97 @@ +--- +format: + html: + toc: true + toc-depth: 3 + number-sections: true + embed-resources: true + theme: flatly + css: + - styles.css + - print-styles.css + page-layout: full + self-contained: true +jupyter: python3 +--- + +```{python} +#| echo: false +#| output: false +import pointblank as pb +pb.config(report_incl_footer=False) +``` + +::: {.title-page} + +{width=400px style="display: block; margin: 0 auto; margin-top: 2in; margin-bottom: 0.75in;"} + +
+Data validation toolkit for assessing and monitoring data quality. +
+ ++© 2024–2025 Posit Software, PBC +
#!/usr/bin/env python3
"""
Create a version of user-guide-pdf.qmd that includes content without YAML front matter.
"""

import re
from pathlib import Path

# Front matter is recognised only at the very top of a document (blank lines
# before it are tolerated). Anchoring with \A instead of a MULTILINE ``^``
# matters: with ``^`` the pattern could start at any ``---`` line, so a file
# without front matter but with two ``---`` thematic breaks lost everything
# between them. The body is optional so an empty block (``---`` directly
# followed by ``---``) is handled, and the closing delimiter may be the last
# line of the file without a trailing newline.
_FRONTMATTER_RE = re.compile(
    r"\A(?:[ \t]*\n)*---[ \t]*\n(?:.*?\n)?(?:---|\.\.\.)[ \t]*(?:\n|\Z)",
    re.DOTALL,
)

# Matches a Quarto include shortcode: {{< include path >}}
_INCLUDE_RE = re.compile(r"\{\{<\s*include\s+([^\s>]+)\s*>\}\}")


def strip_yaml_frontmatter(content: str) -> str:
    """Remove a leading YAML front matter block from ``content``.

    The block must open with a ``---`` line at the top of the text and close
    with a ``---`` or ``...`` line. Text that does not start with such a block
    is returned unchanged.

    Parameters
    ----------
    content
        Full text of a document.

    Returns
    -------
    str
        The text with the leading front matter block (and any blank lines
        preceding it) removed.
    """
    return _FRONTMATTER_RE.sub("", content, count=1)


def process_include(include_line: str, base_path: Path) -> str:
    """Expand one ``{{< include path >}}`` line into the included file's text.

    Parameters
    ----------
    include_line
        A line expected to contain an include shortcode.
    base_path
        Directory against which the shortcode's path is resolved.

    Returns
    -------
    str
        The included file's content with its YAML front matter stripped. The
        original line is returned untouched when it holds no include shortcode
        or when the target is not an existing file (a warning is printed in
        the latter case).
    """
    match = _INCLUDE_RE.search(include_line)
    if not match:
        return include_line

    file_path = base_path / match.group(1)

    # is_file() rather than exists(): a path that resolves to a directory is
    # reported like a missing file instead of crashing in read_text().
    if not file_path.is_file():
        print(f"Warning: {file_path} not found")
        return include_line

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    return strip_yaml_frontmatter(file_path.read_text(encoding="utf-8"))


def create_pdf_version():
    """Create user-guide-pdf-clean.qmd with YAML stripped from includes.

    Reads ``docs/user-guide-pdf.qmd`` (located relative to this script's parent
    directory), copies the document's own front matter verbatim, replaces every
    later include line with the included file's content minus its front
    matter, and writes the result to ``docs/user-guide-pdf-clean.qmd``.

    Prints an error and returns without writing anything when the source file
    does not exist.
    """
    docs_dir = Path(__file__).parent.parent / "docs"
    source_file = docs_dir / "user-guide-pdf.qmd"
    output_file = docs_dir / "user-guide-pdf-clean.qmd"

    if not source_file.exists():
        print(f"Error: {source_file} not found")
        return

    # The guide contains non-ASCII characters (e.g. "©" and "–"), so both the
    # read and the write pin the encoding to UTF-8.
    lines = source_file.read_text(encoding="utf-8").split("\n")

    output_lines = []
    in_frontmatter = False

    for i, line in enumerate(lines):
        # Keep the main document's YAML front matter
        if i == 0 and line.strip() == "---":
            in_frontmatter = True
            output_lines.append(line)
            continue

        if in_frontmatter:
            output_lines.append(line)
            # The closing delimiter ends the pass-through region.
            if line.strip() in ("---", "..."):
                in_frontmatter = False
            continue

        # Process include directives
        if "{{< include" in line:
            output_lines.append(process_include(line, docs_dir))
        else:
            output_lines.append(line)

    output_file.write_text("\n".join(output_lines), encoding="utf-8")
    print(f"Created {output_file}")
    print(f"Size: {output_file.stat().st_size / 1024:.1f} KB")


if __name__ == "__main__":
    create_pdf_version()
#!/usr/bin/env python3
"""
Extract page numbers from PDF and create a Table of Contents page.
This script analyzes the generated PDF to find actual page numbers for each section.
"""

import subprocess
import sys
from pathlib import Path


def find_section_pages_from_pdf(pdf_path: Path) -> dict[str, int]:
    """
    Find the first page on which each top-level section heading appears.

    Each page's extracted text is searched for the numbered H1 headings of the
    user guide (e.g. "1 Validation Plan"). The test is a plain substring match
    and the first page containing a heading wins.

    Parameters
    ----------
    pdf_path
        Path to the PDF to scan.

    Returns
    -------
    dict[str, int]
        Mapping of section id (e.g. ``"validation-plan"``) to its 1-indexed
        page number. Sections whose heading was not found are absent.

    Raises
    ------
    subprocess.CalledProcessError
        If PyPDF2 is not importable and installing it with pip fails.
    """
    try:
        from PyPDF2 import PdfReader
    except ImportError:
        # NOTE(review): installs a third-party package into the running
        # interpreter's environment at run time; kept for compatibility.
        print("Installing PyPDF2...")
        subprocess.run([sys.executable, "-m", "pip", "install", "PyPDF2"], check=True)
        from PyPDF2 import PdfReader

    reader = PdfReader(str(pdf_path))
    total_pages = len(reader.pages)

    section_pages: dict[str, int] = {}
    # Search patterns - these match the actual H1 sections in the user guide
    sections_to_find = [
        ("1 Validation Plan", "validation-plan"),
        ("2 Advanced Validation", "advanced-validation"),
        ("3 YAML", "yaml"),
        ("4 Post Interrogation", "post-interrogation"),
        ("5 Data Inspection", "data-inspection"),
        ("6 The Pointblank CLI", "pointblank-cli"),
    ]

    print(f"Scanning {total_pages} pages for section headings...")

    for page_num in range(total_pages):
        # Every heading already located: extracting text from the remaining
        # pages cannot change the result, so stop early.
        if len(section_pages) == len(sections_to_find):
            break

        # Only the page access/extraction is guarded; a page that cannot be
        # read is reported and skipped (best effort).
        try:
            page_text = reader.pages[page_num].extract_text() or ""
        except Exception as e:
            print(f"Warning: Could not extract text from page {page_num + 1}: {e}")
            continue

        for section_heading, section_id in sections_to_find:
            # Substring match anywhere in the page text; first hit is kept.
            if section_id not in section_pages and section_heading in page_text:
                # Page numbers are 1-indexed for display
                section_pages[section_id] = page_num + 1
                print(f"Found '{section_heading}' on page {page_num + 1}")

    return section_pages
"""Create a standalone TOC HTML page.""" + + sections = [ + ("validation-plan", "1", "Validation Plan"), + ("advanced-validation", "2", "Advanced Validation"), + ("yaml", "3", "YAML"), + ("post-interrogation", "4", "Post Interrogation"), + ("data-inspection", "5", "Data Inspection"), + ("pointblank-cli", "6", "The Pointblank CLI"), + ] + + toc_entries = [] + for section_id, num, title in sections: + # Get the page number from the original PDF + original_page = section_pages.get(section_id, "...") + + # Adjust for display: original page N becomes display page N-1 + # (subtract 1 for title page; TOC is inserted separately and blank page exists) + if original_page != "...": + display_page = original_page - 1 + # Store the original page for link destination (will be offset by 1 after TOC insertion) + link_page = original_page + else: + display_page = "..." + link_page = None + + # Create clickable link if we have a valid page number + if link_page: + toc_entries.append(f""" + +