Skip to content

Commit 00f1c90

Browse files
authored
Merge branch 'main' into llane/llm-txt-extension
2 parents 4d3bc81 + 4a3e76b commit 00f1c90

File tree

8 files changed

+743
-40
lines changed

8 files changed

+743
-40
lines changed

docs/_extensions/rich_metadata/__init__.py

Lines changed: 477 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{# Rich Metadata Extension - Layout Override #}
2+
{# Extends the theme's layout to inject metadata in extrahead block #}
3+
{% extends "!layout.html" %}
4+
5+
{%- block htmltitle %}
6+
{# Use pagetitle if set by rich_metadata extension, otherwise use default #}
7+
{%- if pagetitle and '|' in pagetitle %}
8+
<title>{{ pagetitle|striptags }}</title>
9+
{%- else %}
10+
{{ super() }}
11+
{%- endif %}
12+
{%- endblock htmltitle %}
13+
14+
{%- block extrahead %}
15+
{{ super() }}
16+
{# metatags is already rendered by parent theme's extrahead block #}
17+
{%- endblock extrahead %}
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Verification script for rich metadata extension.
4+
5+
This script checks if metadata has been properly injected into built HTML files.
6+
7+
Usage:
8+
python verify_metadata.py <path_to_built_html>
9+
10+
Example:
11+
python verify_metadata.py ../../_build/html/index.html
12+
python verify_metadata.py ../../_build/html/get-started/text.html
13+
"""
14+
15+
import argparse
16+
import json
17+
import re
18+
import sys
19+
from pathlib import Path
20+
21+
22+
def extract_meta_tags(html_content: str) -> dict[str, list[str]]:
    """Extract standard, Open Graph, and Twitter Card meta tags from HTML.

    Args:
        html_content: Raw HTML text of a built page.

    Returns:
        Mapping of tag category ("standard", "open_graph", "twitter",
        "custom") to a list of "name: content" strings. The "custom"
        bucket is reserved for future use and is always empty here.
    """
    meta_tags: dict[str, list[str]] = {
        "standard": [],
        "open_graph": [],
        "twitter": [],
        "custom": [],
    }

    # Standard <meta name="..."> tags. Twitter Card tags also use the
    # name attribute, so skip them here to avoid double-reporting the
    # same tag in both the "standard" and "twitter" buckets.
    for match in re.finditer(r'<meta name="([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        if name.startswith("twitter:"):
            continue
        meta_tags["standard"].append(f"{name}: {content}")

    # Open Graph tags use the property attribute with an "og:" prefix.
    for match in re.finditer(r'<meta property="og:([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        meta_tags["open_graph"].append(f"og:{name}: {content}")

    # Twitter Card tags: <meta name="twitter:..."> only.
    for match in re.finditer(r'<meta name="twitter:([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        meta_tags["twitter"].append(f"twitter:{name}: {content}")

    return meta_tags
47+
48+
49+
def extract_json_ld(html_content: str) -> dict | None:
50+
"""Extract JSON-LD structured data from HTML content."""
51+
match = re.search(
52+
r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>',
53+
html_content,
54+
re.DOTALL,
55+
)
56+
57+
if match:
58+
try:
59+
return json.loads(match.group(1))
60+
except json.JSONDecodeError as e:
61+
print(f"❌ Error parsing JSON-LD: {e}")
62+
return None
63+
64+
return None
65+
66+
67+
def _display_meta_tags(tags: list[str], tag_type: str) -> bool:
68+
"""Display meta tags of a specific type."""
69+
if tags:
70+
print(f"✅ {tag_type}:")
71+
for tag in tags:
72+
print(f" • {tag}")
73+
print()
74+
return True
75+
76+
print(f"⚠️ No {tag_type.lower()} found\n")
77+
return False
78+
79+
80+
def _display_json_ld(json_ld: dict | None) -> bool:
81+
"""Display JSON-LD structured data."""
82+
if not json_ld:
83+
print("⚠️ No JSON-LD structured data found\n")
84+
return False
85+
86+
print("✅ JSON-LD Structured Data:")
87+
print(f" • @type: {json_ld.get('@type', 'N/A')}")
88+
print(f" • headline: {json_ld.get('headline', 'N/A')}")
89+
90+
description = json_ld.get("description", "N/A")
91+
if description != "N/A":
92+
print(f" • description: {description[:80]}...")
93+
94+
if "keywords" in json_ld and isinstance(json_ld["keywords"], list):
95+
print(f" • keywords: {', '.join(json_ld['keywords'][:5])}")
96+
97+
if "audience" in json_ld:
98+
audience_type = json_ld["audience"].get("audienceType", [])
99+
if isinstance(audience_type, list):
100+
print(f" • audience: {', '.join(audience_type)}")
101+
102+
if "proficiencyLevel" in json_ld:
103+
print(f" • proficiency: {json_ld['proficiencyLevel']}")
104+
105+
print()
106+
return True
107+
108+
109+
def _display_no_metadata_help() -> None:
110+
"""Display help message when no metadata is found."""
111+
print("❌ No rich metadata found in this file.")
112+
print(" This could mean:")
113+
print(" • The page has no frontmatter")
114+
print(" • The extension is not enabled in conf.py")
115+
print(" • The template is not rendering {{ metatags }} or {{ rich_metadata }}")
116+
117+
118+
def verify_html_file(html_path: Path) -> bool:
    """
    Verify that a built HTML file contains rich metadata.

    Returns:
        True if metadata is present, False otherwise
    """
    # Guard clause: nothing to verify if the file does not exist.
    if not html_path.exists():
        print(f"❌ File not found: {html_path}")
        return False

    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Verifying: {html_path.name}")
    print(f"{banner}\n")

    html_content = html_path.read_text(encoding="utf-8")

    # Pull out every metadata flavour up front.
    tag_groups = extract_meta_tags(html_content)
    structured_data = extract_json_ld(html_content)

    # Display each category; a list literal (not a generator) ensures every
    # category is printed rather than short-circuiting on the first hit.
    has_metadata = any([
        _display_meta_tags(tag_groups["standard"], "Standard Meta Tags"),
        _display_meta_tags(tag_groups["open_graph"], "Open Graph Tags"),
        _display_meta_tags(tag_groups["twitter"], "Twitter Card Tags"),
        _display_json_ld(structured_data),
    ])

    if not has_metadata:
        _display_no_metadata_help()
        return False

    print("✅ Rich metadata extension is working!")
    return True
153+
154+
155+
def main() -> None:
    """Main entry point for the verification script."""
    parser = argparse.ArgumentParser(
        description="Verify rich metadata injection in built HTML files"
    )
    parser.add_argument(
        "html_files",
        nargs="+",
        type=Path,
        help="Path(s) to HTML file(s) to verify",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed output",
    )

    args = parser.parse_args()

    # Verify every file; the list literal forces all files to be checked
    # (and reported) instead of stopping at the first failure.
    all_passed = all([verify_html_file(html_file) for html_file in args.html_files])

    separator = "=" * 80
    print(f"\n{separator}")
    if all_passed:
        print("✅ All files verified successfully!")
    else:
        print("⚠️ Some files are missing metadata")
    print(f"{separator}\n")

    # Exit status mirrors the verification outcome for CI consumption.
    sys.exit(0 if all_passed else 1)
188+
189+
190+
if __name__ == "__main__":
191+
main()
192+

docs/_extensions/search_assets/templates/search.html

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2,53 +2,48 @@
22
{# Enhanced Search Page - Clean template without embedded CSS/JS #}
33

44
{% block docs_body %}
5-
<div class="bd-search-container">
6-
<h1>{{ _("Search") }}</h1>
7-
8-
<noscript>
9-
<div class="alert alert-error">
10-
<p class="alert-title">{% trans %}Error{% endtrans %}</p>
11-
<p>{% trans %}Please activate JavaScript to enable the search functionality.{% endtrans %}</p>
12-
</div>
13-
</noscript>
14-
15-
{# Search and filter controls container - will be enhanced by JavaScript #}
16-
<div class="search-controls-container mb-4">
17-
<input
18-
type="search"
19-
class="form-control"
20-
id="enhanced-search-page-input"
21-
placeholder="Search NVIDIA documentation..."
22-
autocomplete="off"
23-
spellcheck="false"
24-
aria-label="Search documentation"
25-
>
5+
<div class="bd-search-container">
6+
<h1>{{ _("Search") }}</h1>
7+
8+
<noscript>
9+
<div class="alert alert-error">
10+
<p class="alert-title">{% trans %}Error{% endtrans %}</p>
11+
<p>{% trans %}Please activate JavaScript to enable the search functionality.{% endtrans %}</p>
2612
</div>
13+
</noscript>
14+
15+
{# Search and filter controls container - will be enhanced by JavaScript #}
16+
<div class="search-controls-container mb-4">
17+
<input type="search" class="form-control" id="enhanced-search-page-input"
18+
placeholder="Search NVIDIA documentation..." autocomplete="off" spellcheck="false"
19+
aria-label="Search documentation">
20+
</div>
2721

28-
{# Search results container #}
29-
<div id="enhanced-search-page-results">
30-
<div class="search-empty-state text-center py-4">
31-
<i class="fa-solid fa-magnifying-glass fa-2x mb-3 search-icon-primary"></i>
32-
<h4 class="search-empty-title">Search Documentation</h4>
33-
<p class="search-empty-text">Start typing to search across all documentation pages...</p>
34-
<div class="mt-3">
35-
<small class="search-empty-tips">
36-
<i class="fa-solid fa-lightbulb search-icon-primary"></i>
37-
<strong>Search Tips:</strong> Use specific terms for better results • Search includes titles, content, and headings
38-
</small>
39-
</div>
22+
{# Search results container #}
23+
<div id="enhanced-search-page-results">
24+
<div class="search-empty-state text-center py-4">
25+
<i class="fa-solid fa-magnifying-glass fa-2x mb-3 search-icon-primary"></i>
26+
<h4 class="search-empty-title">Search Documentation</h4>
27+
<p class="search-empty-text">Start typing to search across all documentation pages...</p>
28+
<div class="mt-3">
29+
<small class="search-empty-tips">
30+
<i class="fa-solid fa-lightbulb search-icon-primary"></i>
31+
<strong>Search Tips:</strong> Use specific terms for better results • Search includes titles, content, and
32+
headings
33+
</small>
4034
</div>
4135
</div>
4236
</div>
37+
</div>
4338
{% endblock docs_body %}
4439

4540
{# Page metadata #}
4641
{%- block htmltitle -%}
47-
<title>{{ _("Search") }} - {{ title or docstitle }}</title>
42+
<title>{{ _("Search") }} - {{ title or docstitle }}</title>
4843
{%- endblock htmltitle -%}
4944

5045
{# Load our enhanced search scripts #}
5146
{% block scripts -%}
52-
{{ super() }}
53-
{# Search page script is loaded via html_js_files in conf.py #}
54-
{%- endblock scripts %}
47+
{{ super() }}
48+
{# Search page script is loaded via html_js_files in conf.py #}
49+
{%- endblock scripts %}

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"json_output", # Generate JSON output for each page
4949
"llm_txt_output", # Generate LLM.txt output for each page
5050
"search_assets", # Enhanced search assets extension
51+
"rich_metadata", # SEO metadata injection from frontmatter
5152
# "ai_assistant", # AI assistant extension
5253
# "swagger_plugin_for_sphinx", # For Swagger API documentation
5354
"sphinxcontrib.mermaid", # For Mermaid diagrams

docs/curate-text/load-data/common-crawl.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def main():
7373
pipeline.add_stage(cc_stage)
7474

7575
# Add output writer stage
76-
writer = JsonlWriter(output_dir="./cc_output")
76+
writer = JsonlWriter("./cc_output")
7777
pipeline.add_stage(writer)
7878

7979
# Run pipeline
@@ -94,7 +94,7 @@ To write Parquet instead of JSONL, use `ParquetWriter`:
9494
from nemo_curator.stages.text.io.writer import ParquetWriter
9595

9696
# Replace the JSONL writer with ParquetWriter
97-
writer = ParquetWriter(output_dir="./cc_output_parquet")
97+
writer = ParquetWriter("./cc_output_parquet")
9898
pipeline.add_stage(writer)
9999
```
100100

docs/curate-text/process-data/content-processing/add-id.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ pipeline = Pipeline(name="add_ids")
7979
# Add stages
8080
pipeline.add_stage(JsonlReader(file_paths="input/*.jsonl"))
8181
pipeline.add_stage(AddId(id_field="doc_id", id_prefix="v1"))
82-
pipeline.add_stage(JsonlWriter(output_path="output/"))
82+
pipeline.add_stage(JsonlWriter("output/"))
8383

8484
# Run pipeline
8585
result = pipeline.run()

docs/index.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
---
2+
description: "NeMo Curator is an open-source, scalable data curation platform for curating large datasets across text, image, video, and audio modalities to improve AI model training"
3+
categories:
4+
- documentation
5+
- home
6+
tags:
7+
- data-curation
8+
- multimodal
9+
- scalable
10+
- gpu-accelerated
11+
- distributed
12+
personas:
13+
- Data Scientists
14+
- Machine Learning Engineers
15+
- Cluster Administrators
16+
- DevOps Professionals
17+
difficulty: beginner
18+
content_type: index
19+
modality: universal
20+
---
21+
122
(curator-home)=
223

324
# NeMo Curator Documentation

0 commit comments

Comments
 (0)