diff --git a/.github/workflows/jekyll-build.yml b/.github/workflows/jekyll-build.yml index a4dceb65d68..6fd49434b5c 100644 --- a/.github/workflows/jekyll-build.yml +++ b/.github/workflows/jekyll-build.yml @@ -13,4 +13,4 @@ jobs: ruby-version: '3.4.5' bundler-cache: true - run: | - JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future + JEKYLL_FATAL_LINK_CHECKER=internal bundle exec jekyll build --future \ No newline at end of file diff --git a/Gemfile b/Gemfile index b1d64c15d84..0d0c362fcaa 100644 --- a/Gemfile +++ b/Gemfile @@ -47,6 +47,12 @@ gem 'typhoeus' gem 'activesupport', '~> 7' gem 'mustache', '~> 1' +# PDF Generator (optional - requires Node.js) +# Install with: ENABLE_PDF_GENERATION=true bundle install +if ENV['ENABLE_PDF_GENERATION'] == 'true' + gem 'grover', '~> 1.3' +end + group :development, :test do gem 'rspec' gem 'rubocop', '~> 1.44', require: false diff --git a/_config.yml b/_config.yml index 122b2ce6763..f2b08466d92 100644 --- a/_config.yml +++ b/_config.yml @@ -329,11 +329,36 @@ plugins: - jekyll-redirect-from - jekyll-sitemap - jekyll-spec-insert + - pdf_generator_loader # This format has to conform to RFC822 last-modified-at: date-format: '%a, %d %b %Y %H:%M:%S %z' +# PDF Generator Configuration +pdf_generator: + enabled: true + # Generate PDFs for entire collections + collections: + - getting-started + - install-and-configure + - api-reference + - query-dsl + - aggregations + - mappings + - analyzers + # Generate PDFs for specific guides (more granular control) + guides: + - name: "Getting Started Guide" + collection: getting-started + filename: "getting-started-guide.pdf" + - name: "Installation Guide" + collection: install-and-configure + filename: "installation-guide.pdf" + - name: "API Reference" + collection: api-reference + filename: "api-reference.pdf" + # Exclude from processing. # The following items will not be processed, by default. Create a custom list # to override the default setting. 
diff --git a/_pdf_generator/README.md b/_pdf_generator/README.md new file mode 100644 index 00000000000..f413f285957 --- /dev/null +++ b/_pdf_generator/README.md @@ -0,0 +1,88 @@ +# PDF Generator for OpenSearch Documentation + +This plugin generates PDF versions of documentation collections during the Jekyll build process. + +## File Structure + +All PDF generator code is contained in the `_pdf_generator/` directory: +- `pdf_generator.rb` - Main plugin implementation +- `README.md` - This documentation file + +A minimal loader file exists in `_plugins/pdf_generator_loader.rb` to ensure Jekyll loads the plugin (Jekyll requires plugins to be in `_plugins` or be gems). + +## Overview + +The PDF generator creates downloadable PDF files for documentation collections and guides. When `enabled` is `true` and the `grover` gem is installed, PDFs are generated automatically during the Jekyll build and are saved to the `pdfs/` directory in the site destination; otherwise PDF generation is skipped. + +## Configuration + +PDF generation is configured in `_config.yml` under the `pdf_generator` section: + +```yaml +pdf_generator: + enabled: true + # Generate PDFs for entire collections + collections: + - getting-started + - install-and-configure + - api-reference + # Generate PDFs for specific guides (more granular control) + guides: + - name: "Getting Started Guide" + collection: getting-started + filename: "getting-started-guide.pdf" + - name: "Installation Guide" + collection: install-and-configure + filename: "installation-guide.pdf" +``` + +### Configuration Options + +- `enabled`: Set to `true` to enable PDF generation, `false` to disable +- `collections`: Array of collection names to generate PDFs for (PDF filename will be `{collection-name}.pdf`) +- `guides`: Array of guide configurations with: + - `name`: Display name for the guide (optional, defaults to the collection name) + - `collection`: Collection name to generate PDF from + - `filename`: Output PDF filename (optional, defaults to the guide name lowercased with whitespace replaced by hyphens, plus `.pdf`) + - `start_page`: Optional URL or path to start from (for partial guides) + +## How It Works + +1. 
During Jekyll build, the PDF generator plugin identifies configured collections/guides +2. After all pages are rendered, the plugin collects the rendered HTML content +3. HTML is cleaned and formatted for PDF output +4. PDFs are generated using Grover (Puppeteer-based PDF generation) +5. PDFs are saved to the `_site/pdfs/` directory + +## Dependencies + +- `grover` gem: Ruby wrapper for Puppeteer (requires Node.js and Chrome/Chromium) +- `puppeteer`: Node.js package (not installed by the `grover` gem; install it with `npm install puppeteer`) + +## Accessing Generated PDFs + +Generated PDFs are available at the following URLs, where `{filename}` is the configured filename and already includes the `.pdf` extension: +- Local build: `http://localhost:4000/pdfs/{filename}` +- Production: `https://docs.opensearch.org/pdfs/{filename}` + +## Troubleshooting + +### PDF Generation Fails + +1. Ensure `grover` gem is installed: `ENABLE_PDF_GENERATION=true bundle install` (the Gemfile only includes `grover` when this variable is set) +2. Ensure Node.js is installed (required for Puppeteer) +3. Check Jekyll build logs for error messages +4. Verify collection names in configuration match actual collection names + +### PDF Content Issues + +- The plugin automatically extracts main content and removes navigation elements +- If content is missing, check that documents have `title` and are not excluded with `nav_exclude: true` +- Documents are sorted by `nav_order` if available, then by path + +## Customization + +PDF styling can be customized by modifying the `pdf_styles` method in `pdf_generator.rb`. + +PDF options (page size, margins, headers/footers) can be customized in the `pdf_options` method. 
+ diff --git a/_pdf_generator/pdf_generator.rb b/_pdf_generator/pdf_generator.rb new file mode 100644 index 00000000000..b3740e1573b --- /dev/null +++ b/_pdf_generator/pdf_generator.rb @@ -0,0 +1,583 @@ +# frozen_string_literal: true + +require "jekyll" +require "fileutils" +require "uri" + +# Conditionally require grover - only needed when PDF generation is enabled +begin + require "grover" + GROVER_AVAILABLE = true +rescue LoadError + GROVER_AVAILABLE = false + # Don't log here as Jekyll logger may not be available yet +end + +## +# Jekyll Generator Plugin for PDF Generation +# Generates PDF versions of documentation collections during Jekyll build +module Jekyll + class PdfGenerator < Generator + safe true + priority :lowest + + # Class variable to store PDF jobs across generator and hook + @@pdf_jobs = [] + + def generate(site) + return unless site.config["pdf_generator"] && site.config["pdf_generator"]["enabled"] + return unless GROVER_AVAILABLE + + @site = site + @pdf_config = site.config["pdf_generator"] + @destination = site.config["destination"] + + # Ensure PDFs directory exists + pdfs_dir = File.join(@destination, "pdfs") + FileUtils.mkdir_p(pdfs_dir) unless File.directory?(pdfs_dir) + + # Clear previous jobs + @@pdf_jobs.clear + + # Generate PDFs for configured collections + prepare_collection_pdfs if @pdf_config["collections"] + + # Generate PDFs for configured guides + prepare_guide_pdfs if @pdf_config["guides"] + end + + def self.pdf_jobs + @@pdf_jobs + end + + private + + def prepare_collection_pdfs + @pdf_config["collections"].each do |collection_name| + collection = @site.collections[collection_name] + next unless collection && collection.docs.any? + + # Get all documents in the collection, sorted by nav_order or path + docs = collection.docs.select { |doc| doc.data["title"] && !doc.data["nav_exclude"] } + docs.sort_by! 
{ |doc| [doc.data["nav_order"] || 9999, doc.path] } + + # Schedule PDF generation for after rendering + pdf_filename = "#{collection_name}.pdf" + @@pdf_jobs << { + title: collection_name, + documents: docs, + filename: pdf_filename, + site: @site, + pdf_config: @pdf_config, + destination: @destination + } + end + end + + def prepare_guide_pdfs + @pdf_config["guides"].each do |guide_config| + guide_name = guide_config["name"] || guide_config["collection"] + collection_name = guide_config["collection"] + start_page = guide_config["start_page"] + + collection = @site.collections[collection_name] + next unless collection && collection.docs.any? + + # Find starting page if specified + docs = collection.docs.select { |doc| doc.data["title"] && !doc.data["nav_exclude"] } + + if start_page + start_doc = docs.find { |doc| doc.url == start_page || doc.path.include?(start_page) } + if start_doc + # Build document tree starting from this page + docs = build_document_tree(start_doc, docs) + end + end + + docs.sort_by! { |doc| [doc.data["nav_order"] || 9999, doc.path] } + + # Schedule PDF generation + pdf_filename = guide_config["filename"] || "#{guide_name.downcase.gsub(/\s+/, '-')}.pdf" + @@pdf_jobs << { + title: guide_name, + documents: docs, + filename: pdf_filename, + site: @site, + pdf_config: @pdf_config, + destination: @destination + } + end + end + + def build_document_tree(start_doc, all_docs) + # Simple implementation: start from the start_doc and include all following docs + # More sophisticated tree building can be added based on parent/child relationships + start_index = all_docs.index(start_doc) + return [start_doc] unless start_index + + [start_doc] + all_docs[(start_index + 1)..-1].to_a + end + + def generate_pdf_for_documents(title, docs, pdf_filename) + return if docs.empty? 
+ + Jekyll.logger.info "PDF Generator:", "Generating PDF: #{pdf_filename} (#{docs.size} pages)" + + # Build HTML content for PDF + html_content = build_pdf_html(title, docs) + + # Generate PDF using Grover + begin + grover = Grover.new(html_content, pdf_options) + pdf_data = grover.to_pdf + + # Save PDF file + pdf_path = File.join(@destination, "pdfs", pdf_filename) + File.binwrite(pdf_path, pdf_data) + + Jekyll.logger.info "PDF Generator:", "Generated PDF: #{pdf_path}" + rescue => e + Jekyll.logger.error "PDF Generator:", "Failed to generate PDF #{pdf_filename}: #{e.message}" + Jekyll.logger.debug "PDF Generator:", e.backtrace.join("\n") + end + end + + def build_pdf_html(title, docs) + # Build complete HTML document with all pages + html_parts = [] + + # HTML header + html_parts << <<~HTML + + +
+ + +OpenSearch Documentation
+Version #{@site.config["opensearch_major_minor_version"] || "latest"}
+Generated: #{Time.now.strftime("%B %d, %Y")}
+