diff --git a/.github/workflows/jekyll-build.yml b/.github/workflows/jekyll-build.yml index a4dceb65d68..6fd49434b5c 100644 --- a/.github/workflows/jekyll-build.yml +++ b/.github/workflows/jekyll-build.yml @@ -13,4 +13,4 @@ jobs: ruby-version: '3.4.5' bundler-cache: true - run: | - JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future + JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future \ No newline at end of file diff --git a/Gemfile b/Gemfile index b1d64c15d84..0d0c362fcaa 100644 --- a/Gemfile +++ b/Gemfile @@ -47,6 +47,12 @@ gem 'typhoeus' gem 'activesupport', '~> 7' gem 'mustache', '~> 1' +# PDF Generator (optional - requires Node.js) +# Install with: ENABLE_PDF_GENERATION=true bundle install +if ENV['ENABLE_PDF_GENERATION'] == 'true' + gem 'grover', '~> 1.3' +end + group :development, :test do gem 'rspec' gem 'rubocop', '~> 1.44', require: false diff --git a/_config.yml b/_config.yml index 122b2ce6763..f2b08466d92 100644 --- a/_config.yml +++ b/_config.yml @@ -329,11 +329,36 @@ plugins: - jekyll-redirect-from - jekyll-sitemap - jekyll-spec-insert + - pdf_generator_loader # This format has to conform to RFC822 last-modified-at: date-format: '%a, %d %b %Y %H:%M:%S %z' +# PDF Generator Configuration +pdf_generator: + enabled: true + # Generate PDFs for entire collections + collections: + - getting-started + - install-and-configure + - api-reference + - query-dsl + - aggregations + - mappings + - analyzers + # Generate PDFs for specific guides (more granular control) + guides: + - name: "Getting Started Guide" + collection: getting-started + filename: "getting-started-guide.pdf" + - name: "Installation Guide" + collection: install-and-configure + filename: "installation-guide.pdf" + - name: "API Reference" + collection: api-reference + filename: "api-reference.pdf" + # Exclude from processing. # The following items will not be processed, by default. Create a custom list # to override the default setting. 
diff --git a/_pdf_generator/README.md b/_pdf_generator/README.md new file mode 100644 index 00000000000..f413f285957 --- /dev/null +++ b/_pdf_generator/README.md @@ -0,0 +1,88 @@ +# PDF Generator for OpenSearch Documentation + +This plugin generates PDF versions of documentation collections during the Jekyll build process. + +## File Structure + +All PDF generator code is contained in the `_pdf_generator/` directory: +- `pdf_generator.rb` - Main plugin implementation +- `README.md` - This documentation file + +A minimal loader file exists in `_plugins/pdf_generator_loader.rb` to ensure Jekyll loads the plugin (Jekyll requires plugins to be in `_plugins` or be gems). + +## Overview + +The PDF generator creates downloadable PDF files for documentation collections and guides. PDFs are generated automatically during the Jekyll build and are saved to the `pdfs/` directory in the site destination. + +## Configuration + +PDF generation is configured in `_config.yml` under the `pdf_generator` section: + +```yaml +pdf_generator: + enabled: true + # Generate PDFs for entire collections + collections: + - getting-started + - install-and-configure + - api-reference + # Generate PDFs for specific guides (more granular control) + guides: + - name: "Getting Started Guide" + collection: getting-started + filename: "getting-started-guide.pdf" + - name: "Installation Guide" + collection: install-and-configure + filename: "installation-guide.pdf" +``` + +### Configuration Options + +- `enabled`: Set to `true` to enable PDF generation, `false` to disable +- `collections`: Array of collection names to generate PDFs for (PDF filename will be `{collection-name}.pdf`) +- `guides`: Array of guide configurations with: + - `name`: Display name for the guide (optional, defaults to the collection name) + - `collection`: Collection name to generate PDF from + - `filename`: Output PDF filename (optional, defaults to the guide name lowercased with whitespace replaced by hyphens, plus `.pdf`; for example, "Getting Started Guide" becomes `getting-started-guide.pdf`) + - `start_page`: Optional URL or path to start from (for partial guides) + +## How It Works + +1. 
During Jekyll build, the PDF generator plugin identifies configured collections/guides +2. After all pages are rendered, the plugin collects the rendered HTML content +3. HTML is cleaned and formatted for PDF output +4. PDFs are generated using Grover (Puppeteer-based PDF generation) +5. PDFs are saved to `_site/pdfs/` directory + +## Dependencies + +- `grover` gem: Ruby wrapper for Puppeteer (requires Node.js and Chrome/Chromium) +- `puppeteer`: Node.js package (not installed by the gem; install it separately, for example with `npm install puppeteer`) + +## Accessing Generated PDFs + +Generated PDFs are available at: +- Local build: `http://localhost:4000/pdfs/{filename}.pdf` +- Production: `https://docs.opensearch.org/pdfs/{filename}.pdf` + +## Troubleshooting + +### PDF Generation Fails + +1. Ensure the `grover` gem is installed: `ENABLE_PDF_GENERATION=true bundle install` (the Gemfile only includes `grover` when this variable is set to `true`; without the gem, PDF generation is skipped) +2. Ensure Node.js is installed (required for Puppeteer) +3. Check Jekyll build logs for error messages +4. Verify collection names in configuration match actual collection names + +### PDF Content Issues + +- The plugin automatically extracts main content and removes navigation elements +- If content is missing, check that documents have `title` and are not excluded with `nav_exclude: true` +- Documents are sorted by `nav_order` if available + +## Customization + +PDF styling can be customized by modifying the `pdf_styles` method in `pdf_generator.rb`. + +PDF options (page size, margins, headers/footers) can be customized in the `pdf_options` method. 
# frozen_string_literal: true

# ==========================================================================
# File: _pdf_generator/pdf_generator.rb
# ==========================================================================

require "jekyll"
require "fileutils"
require "uri"

# Conditionally require grover - only needed when PDF generation is enabled.
# Nothing is logged here because the Jekyll logger may not be available yet.
begin
  require "grover"
  GROVER_AVAILABLE = true
rescue LoadError
  GROVER_AVAILABLE = false
end

##
# Jekyll Generator Plugin for PDF Generation
# Generates PDF versions of documentation collections during Jekyll build
module Jekyll
  # Queues one "PDF job" per configured collection/guide during the generate
  # phase; the jobs are rendered by the +:post_write+ hook registered at the
  # bottom of this file, once the rendered HTML files exist on disk.
  class PdfGenerator < Generator
    safe true
    priority :lowest

    # Sort position used for documents without a numeric +nav_order+.
    DEFAULT_NAV_ORDER = 9999

    # Characters replaced by #escape_html.
    # NOTE(review): the entity strings were decoded in the reviewed copy of
    # this file; these are the conventional replacements - confirm.
    HTML_ESCAPES = {
      "&" => "&amp;",
      "<" => "&lt;",
      ">" => "&gt;",
      '"' => "&quot;",
      "'" => "&#39;"
    }.freeze

    # Whole elements dropped before the main content is located.
    STRIPPED_ELEMENTS = %w[head header nav aside footer].freeze

    # <div> elements dropped when their class attribute contains one of these.
    STRIPPED_DIV_CLASSES = %w[side-bar site-nav site-header toc-wrap].freeze

    # Tried in order; the first match's capture becomes the page content.
    # The regexes deliberately do NOT carry the `s` flag: in Ruby `s` selects
    # the Windows-31J encoding, and matching such a regex against UTF-8 text
    # containing non-ASCII characters raises Encoding::CompatibilityError.
    MAIN_CONTENT_PATTERNS = [
      /<div[^>]*class=["'][^"']*main-content[^"']*["'][^>]*>(.*?)<\/div>/mi,
      /<div[^>]*id=["'][^"']*main-content[^"']*["'][^>]*>(.*?)<\/div>/mi,
      /<main[^>]*>(.*?)<\/main>/mi,
      /<div[^>]*id=["'][^"']*main[^"']*["'][^>]*>(.*?)<\/div>/mi
    ].freeze

    # Job queue shared between the generator and the post_write hook. Held in
    # a class-level instance variable rather than a @@class variable so it is
    # not shared across the inheritance tree.
    @pdf_jobs = []

    class << self
      # @return [Array<Hash>] queued jobs with the keys :title, :documents,
      #   :filename, :site, :pdf_config and :destination
      attr_reader :pdf_jobs
    end

    # Jekyll entry point: queues PDF jobs for the configured collections and
    # guides. Does nothing when the feature is disabled or grover is missing.
    #
    # @param site [Jekyll::Site]
    # @return [void]
    def generate(site)
      config = site.config["pdf_generator"]
      return unless config && config["enabled"]

      unless GROVER_AVAILABLE
        Jekyll.logger.info "PDF Generator:", "grover gem not available; skipping PDF generation"
        return
      end

      @site = site
      @pdf_config = config
      @destination = site.config["destination"]

      # The output directory is intentionally NOT created here: this runs
      # before Jekyll's cleanup phase, which removes destination entries the
      # site did not generate. It is created right before writing instead.
      self.class.pdf_jobs.clear

      prepare_collection_pdfs if @pdf_config["collections"]
      prepare_guide_pdfs if @pdf_config["guides"]
    end

    # Renders one queued job. Called by the post_write hook on a fresh
    # generator instance, so the state captured at queue time is restored
    # from the job itself.
    #
    # @param job [Hash] an entry of PdfGenerator.pdf_jobs
    # @return [void]
    def render_job(job)
      @site = job[:site]
      @pdf_config = job[:pdf_config]
      @destination = job[:destination]

      generate_pdf_for_documents(job[:title], job[:documents], job[:filename])
    end

    private

    # Queues one job per configured collection; output is "<collection>.pdf".
    def prepare_collection_pdfs
      Array(@pdf_config["collections"]).each do |collection_name|
        docs = printable_docs(collection_name)
        next if docs.empty?

        enqueue(collection_name, docs, "#{collection_name}.pdf")
      end
    end

    # Queues one job per configured guide. A guide may restrict its collection
    # to the documents from +start_page+ onwards (in navigation order).
    def prepare_guide_pdfs
      Array(@pdf_config["guides"]).each do |guide_config|
        collection_name = guide_config["collection"]
        guide_name = guide_config["name"] || collection_name

        docs = printable_docs(collection_name)
        next if docs.empty?

        start_page = guide_config["start_page"]
        if start_page
          start_doc = docs.find { |doc| doc.url == start_page || doc.path.include?(start_page) }
          # docs is already in navigation order, so the slice keeps the start
          # page first (previously the slice was taken before sorting).
          docs = build_document_tree(start_doc, docs) if start_doc
        end

        pdf_filename = guide_config["filename"] || "#{guide_name.downcase.gsub(/\s+/, '-')}.pdf"
        enqueue(guide_name, docs, pdf_filename)
      end
    end

    # Titled, non-excluded documents of a collection in navigation order.
    #
    # @param collection_name [String]
    # @return [Array<Jekyll::Document>] empty when the collection is unknown
    def printable_docs(collection_name)
      collection = @site.collections[collection_name]
      return [] unless collection

      collection.docs
                .select { |doc| doc.data["title"] && !doc.data["nav_exclude"] }
                .sort_by { |doc| nav_sort_key(doc) }
    end

    # Sort key: numeric nav_order first, document path as tie-breaker. A
    # non-numeric nav_order is treated as "unordered" so mixed front matter
    # cannot make the sort raise on an Integer/String comparison.
    def nav_sort_key(doc)
      order = doc.data["nav_order"]
      [order.is_a?(Numeric) ? order : DEFAULT_NAV_ORDER, doc.path]
    end

    # Appends a job to the shared queue. The site/config/destination entries
    # are kept because the hash layout is visible through PdfGenerator.pdf_jobs.
    def enqueue(title, docs, pdf_filename)
      self.class.pdf_jobs << {
        title: title,
        documents: docs,
        filename: pdf_filename,
        site: @site,
        pdf_config: @pdf_config,
        destination: @destination
      }
    end

    # Simple implementation: the start document plus every document after it.
    # More sophisticated tree building can be added based on parent/child
    # relationships.
    #
    # @return [Array<Jekyll::Document>]
    def build_document_tree(start_doc, all_docs)
      start_index = all_docs.index(start_doc)
      return [start_doc] unless start_index

      all_docs[start_index..]
    end

    # Builds the combined HTML for +docs+, renders it with Grover and writes
    # the result to "<destination>/pdfs/<pdf_filename>". Any failure is logged
    # and swallowed so that one broken PDF does not abort the site build.
    def generate_pdf_for_documents(title, docs, pdf_filename)
      return if docs.empty?

      Jekyll.logger.info "PDF Generator:", "Generating PDF: #{pdf_filename} (#{docs.size} pages)"

      html_content = build_pdf_html(title, docs)

      # Options are splatted as keywords: a positional Hash is not converted
      # to keyword arguments on Ruby 3.
      pdf_data = Grover.new(html_content, **pdf_options).to_pdf

      pdfs_dir = File.join(@destination, "pdfs")
      FileUtils.mkdir_p(pdfs_dir)
      pdf_path = File.join(pdfs_dir, pdf_filename)
      File.binwrite(pdf_path, pdf_data)

      Jekyll.logger.info "PDF Generator:", "Generated PDF: #{pdf_path}"
    rescue StandardError => e
      Jekyll.logger.error "PDF Generator:", "Failed to generate PDF #{pdf_filename}: #{e.message}"
      Jekyll.logger.debug "PDF Generator:", Array(e.backtrace).join("\n")
    end

    # Assembles the complete HTML document: cover page, table of contents and
    # one section per document.
    #
    # NOTE(review): the literal markup of the HTML templates in this class was
    # stripped from the reviewed copy of the file. It has been reconstructed
    # from the CSS class names used in #pdf_styles - confirm against the
    # original templates (including any page-break styling they carried).
    def build_pdf_html(title, docs)
      html_parts = []

      html_parts << <<~HTML
        <!DOCTYPE html>
        <html lang="en">
        <head>
        <meta charset="UTF-8">
        <title>#{escape_html(title)}</title>
        <style>#{pdf_styles}</style>
        </head>
        <body>
      HTML

      html_parts << build_cover_page(title)
      html_parts << build_table_of_contents(docs)

      docs.each_with_index do |doc, index|
        html_parts << build_document_section(doc, index + 1, docs.size)
      end

      html_parts << <<~HTML
        </body>
        </html>
      HTML

      html_parts.join("\n")
    end

    # Cover page with title, product name, version and generation date.
    def build_cover_page(title)
      <<~HTML
        <div class="cover-page">
          <h1 class="cover-title">#{escape_html(title)}</h1>
          <div class="cover-subtitle">OpenSearch Documentation</div>
          <div class="cover-version">Version #{@site.config["opensearch_major_minor_version"] || "latest"}</div>
          <div class="cover-date">Generated: #{Time.now.strftime("%B %d, %Y")}</div>
        </div>
      HTML
    end

    # Table of contents linking to the section anchors emitted by
    # #build_document_section.
    # NOTE(review): the list markup (and the anchor scheme) is reconstructed.
    def build_table_of_contents(docs)
      items = docs.each_with_index.map do |doc, index|
        %(<li><a href="#section-#{index + 1}">#{escape_html(doc.data["title"])}</a></li>)
      end

      <<~HTML
        <div class="toc-page">
          <h1>Table of Contents</h1>
          <ul class="toc-list">
            #{items.join("\n")}
          </ul>
        </div>
      HTML
    end

    # One document rendered as a PDF section. Prefers the rendered HTML file
    # in the destination (available at post_write time) and falls back to the
    # document's converted content, which has no layout applied.
    #
    # NOTE(review): part of this method's body was lost in the reviewed copy;
    # the fallback wrapping and script/style stripping are reconstructed from
    # the surviving comments and regex fragments.
    def build_document_section(doc, section_number, total_sections)
      output_path = doc.destination(@destination)

      content =
        if File.exist?(output_path)
          File.read(output_path)
        elsif doc.respond_to?(:content)
          raw = doc.content.to_s
          # Wrap in basic HTML structure if it's just content
          raw.match?(/<html/i) ? raw : "<html><body>#{raw}</body></html>"
        else
          ""
        end

      content = content.gsub(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/mi, "")
      content = content.gsub(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/mi, "")
      content = extract_main_content(content)

      <<~HTML
        <div class="document-section" id="section-#{section_number}">
          <div class="section-header">
            <h1 class="section-title">#{escape_html(doc.data["title"] || "Untitled")}</h1>
            <div class="section-meta">Section #{section_number} of #{total_sections}</div>
          </div>
          <div class="section-content">
            #{content}
          </div>
        </div>
      HTML
    end

    # Reduces a rendered page to its main content by removing page chrome
    # (head, header, nav, aside, footer, sidebars, breadcrumbs, scripts,
    # styles, SVG icons).
    #
    # Known limitation: this is regex-based, so the lazy ".*?</div>" patterns
    # stop at the FIRST closing </div> and do not handle nested <div>s.
    #
    # Whitespace is intentionally not collapsed: browsers ignore it outside
    # <pre>, and collapsing it flattens code samples inside <pre> blocks.
    #
    # @param html [String, nil]
    # @return [String]
    def extract_main_content(html)
      return "" if html.nil? || html.empty?

      STRIPPED_ELEMENTS.each do |tag|
        html = html.gsub(/<#{tag}\b[^>]*>.*?<\/#{tag}>/mi, "")
      end

      STRIPPED_DIV_CLASSES.each do |css_class|
        html = html.gsub(/<div[^>]*class=["'][^"']*#{Regexp.escape(css_class)}[^"']*["'][^>]*>.*?<\/div>/mi, "")
      end

      MAIN_CONTENT_PATTERNS.each do |pattern|
        match = pattern.match(html)
        next unless match

        html = match[1]
        break
      end

      # Breadcrumb navigation
      html = html.gsub(/<nav[^>]*aria-label=["']Breadcrumb["'][^>]*>.*?<\/nav>/mi, "")
      html = html.gsub(/<div[^>]*class=["'][^"']*breadcrumb[^"']*["'][^>]*>.*?<\/div>/mi, "")

      # Any remaining script/style tags and SVG elements (icons, etc.)
      html = html.gsub(/<script[^>]*>.*?<\/script>/mi, "")
      html = html.gsub(/<style[^>]*>.*?<\/style>/mi, "")
      html = html.gsub(/<svg[^>]*>.*?<\/svg>/mi, "")

      html.strip
    end

    # Stylesheet embedded into every generated PDF document.
    def pdf_styles
      <<~CSS
        * {
          margin: 0;
          padding: 0;
          box-sizing: border-box;
        }

        body {
          font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
          font-size: 11pt;
          line-height: 1.6;
          color: #333;
          background: #fff;
        }

        .cover-page {
          display: flex;
          flex-direction: column;
          justify-content: center;
          align-items: center;
          min-height: 100vh;
          text-align: center;
          padding: 2cm;
        }

        .cover-title {
          font-size: 36pt;
          font-weight: bold;
          margin-bottom: 20pt;
          color: #005f8c;
        }

        .cover-subtitle {
          font-size: 18pt;
          margin-bottom: 10pt;
          color: #666;
        }

        .cover-version {
          font-size: 14pt;
          margin-top: 30pt;
          color: #888;
        }

        .cover-date {
          font-size: 12pt;
          margin-top: 10pt;
          color: #888;
        }

        .toc-page {
          padding: 2cm;
        }

        .toc-page h1 {
          font-size: 24pt;
          margin-bottom: 20pt;
          color: #005f8c;
        }

        .toc-list {
          list-style: none;
          padding-left: 0;
        }

        .toc-list li {
          margin: 8pt 0;
          padding: 4pt 0;
          border-bottom: 1px solid #eee;
        }

        .toc-list a {
          color: #005f8c;
          text-decoration: none;
          font-size: 12pt;
        }

        .toc-list a:hover {
          text-decoration: underline;
        }

        .document-section {
          padding: 2cm;
        }

        .section-header {
          border-bottom: 2px solid #005f8c;
          padding-bottom: 10pt;
          margin-bottom: 20pt;
        }

        .section-title {
          font-size: 24pt;
          font-weight: bold;
          color: #005f8c;
          margin-bottom: 5pt;
        }

        .section-meta {
          font-size: 9pt;
          color: #888;
          margin-top: 5pt;
        }

        .section-content {
          margin-top: 20pt;
        }

        .section-content h1 {
          font-size: 20pt;
          margin-top: 20pt;
          margin-bottom: 10pt;
          color: #005f8c;
          page-break-after: avoid;
        }

        .section-content h2 {
          font-size: 16pt;
          margin-top: 16pt;
          margin-bottom: 8pt;
          color: #005f8c;
          page-break-after: avoid;
        }

        .section-content h3 {
          font-size: 14pt;
          margin-top: 12pt;
          margin-bottom: 6pt;
          color: #333;
          page-break-after: avoid;
        }

        .section-content h4 {
          font-size: 12pt;
          margin-top: 10pt;
          margin-bottom: 5pt;
          color: #333;
        }

        .section-content p {
          margin: 8pt 0;
          text-align: justify;
        }

        .section-content ul,
        .section-content ol {
          margin: 8pt 0;
          padding-left: 30pt;
        }

        .section-content li {
          margin: 4pt 0;
        }

        .section-content code {
          background: #f5f5f5;
          padding: 2pt 4pt;
          border-radius: 3pt;
          font-family: "Courier New", monospace;
          font-size: 10pt;
        }

        .section-content pre {
          background: #f5f5f5;
          padding: 10pt;
          border-radius: 5pt;
          overflow-x: auto;
          margin: 10pt 0;
          page-break-inside: avoid;
        }

        .section-content pre code {
          background: none;
          padding: 0;
        }

        .section-content table {
          width: 100%;
          border-collapse: collapse;
          margin: 10pt 0;
          page-break-inside: avoid;
        }

        .section-content table th,
        .section-content table td {
          border: 1px solid #ddd;
          padding: 6pt;
          text-align: left;
        }

        .section-content table th {
          background: #f5f5f5;
          font-weight: bold;
        }

        .section-content blockquote {
          border-left: 4px solid #005f8c;
          padding-left: 10pt;
          margin: 10pt 0;
          color: #666;
          font-style: italic;
        }

        .section-content img {
          max-width: 100%;
          height: auto;
          margin: 10pt 0;
          page-break-inside: avoid;
        }

        @media print {
          .section-content {
            orphans: 3;
            widows: 3;
          }

          .section-content h1,
          .section-content h2,
          .section-content h3 {
            page-break-after: avoid;
          }

          .section-content pre,
          .section-content table {
            page-break-inside: avoid;
          }
        }
      CSS
    end

    # Options handed to Grover (page size, margins, header/footer).
    # NOTE(review): the header/footer template markup is reconstructed; only
    # the visible text ("OpenSearch Documentation" and " / ") survived. The
    # pageNumber/totalPages classes are the ones Puppeteer fills in.
    def pdf_options
      {
        format: "A4",
        margin: {
          top: "1cm",
          right: "1.5cm",
          bottom: "1cm",
          left: "1.5cm"
        },
        print_background: true,
        display_header_footer: true,
        header_template: '<div style="font-size: 9px; width: 100%; text-align: center; color: #666;">' \
                         "OpenSearch Documentation</div>",
        footer_template: '<div style="font-size: 9px; width: 100%; text-align: center; color: #666;">' \
                         '<span class="pageNumber"></span> / <span class="totalPages"></span></div>',
        prefer_css_page_size: true
      }
    end

    # Escapes the five HTML-significant characters in a single pass.
    #
    # @param text [#to_s, nil]
    # @return [String] "" for nil
    def escape_html(text)
      return "" if text.nil?

      text.to_s.gsub(/[&<>"']/, HTML_ESCAPES)
    end
  end
end

# Hook to generate PDFs after all files are written
Jekyll::Hooks.register :site, :post_write do |site|
  config = site.config["pdf_generator"]
  next unless config && config["enabled"]
  next unless GROVER_AVAILABLE

  # Process all queued PDF generations
  Jekyll::PdfGenerator.pdf_jobs.each do |pdf_job|
    Jekyll::PdfGenerator.new(site.config).render_job(pdf_job)
  end
end

# ==========================================================================
# File: _plugins/pdf_generator_loader.rb
# ==========================================================================
# frozen_string_literal: true

# Loader for PDF Generator plugin
# This file ensures the PDF generator plugin is loaded from the _pdf_generator
# directory. The PDF generator itself rescues a missing grover gem, so the
# rescue below is only a safety net for a grover-related LoadError that
# escapes it; any other load failure is a real error and is re-raised.
begin
  require_relative "../_pdf_generator/pdf_generator"
rescue LoadError => e
  raise unless e.message.include?("grover")

  # Only grover is missing: define a no-op generator so the build continues
  # with PDF generation disabled.
  module Jekyll
    class PdfGenerator < Generator
      def generate(site)
        # No-op when grover is not available
      end
    end
  end

  # Set the constant that the PDF generator expects
  GROVER_AVAILABLE = false unless defined?(GROVER_AVAILABLE)
end