Skip to content

Commit 00f1c90

Browse files
authored
Merge branch 'main' into llane/llm-txt-extension
2 parents 4d3bc81 + 4a3e76b commit 00f1c90

File tree

8 files changed

+743
-40
lines changed

8 files changed

+743
-40
lines changed

docs/_extensions/rich_metadata/__init__.py

Lines changed: 477 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{# Rich Metadata Extension - Layout Override #}
2+
{# Extends the theme's layout to inject metadata in extrahead block #}
3+
{% extends "!layout.html" %}
4+
5+
{%- block htmltitle %}
6+
{# Use pagetitle if set by rich_metadata extension, otherwise use default #}
7+
{%- if pagetitle and '|' in pagetitle %}
8+
<title>{{ pagetitle|striptags }}</title>
9+
{%- else %}
10+
{{ super() }}
11+
{%- endif %}
12+
{%- endblock htmltitle %}
13+
14+
{%- block extrahead %}
15+
{{ super() }}
16+
{# metatags is already rendered by parent theme's extrahead block #}
17+
{%- endblock extrahead %}
Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,192 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Verification script for rich metadata extension.
4+
5+
This script checks if metadata has been properly injected into built HTML files.
6+
7+
Usage:
8+
python verify_metadata.py <path_to_built_html>
9+
10+
Example:
11+
python verify_metadata.py ../../_build/html/index.html
12+
python verify_metadata.py ../../_build/html/get-started/text.html
13+
"""
14+
15+
import argparse
16+
import json
17+
import re
18+
import sys
19+
from pathlib import Path
20+
21+
22+
def extract_meta_tags(html_content: str) -> dict[str, list[str]]:
    """Extract standard, Open Graph, and Twitter Card meta tags from HTML.

    Args:
        html_content: Raw HTML text of a built page.

    Returns:
        Mapping of tag category ("standard", "open_graph", "twitter",
        "custom") to a list of "name: content" strings. The "custom"
        bucket is reserved for future use and is always empty here.
    """
    meta_tags: dict[str, list[str]] = {
        "standard": [],
        "open_graph": [],
        "twitter": [],
        "custom": [],
    }

    # Standard <meta name="..."> tags. Twitter Card tags also use the
    # name attribute, so skip them here to avoid double-reporting the
    # same tag in both the "standard" and "twitter" buckets.
    for match in re.finditer(r'<meta name="([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        if name.startswith("twitter:"):
            continue
        meta_tags["standard"].append(f"{name}: {content}")

    # Open Graph tags use the property attribute with an "og:" prefix.
    for match in re.finditer(r'<meta property="og:([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        meta_tags["open_graph"].append(f"og:{name}: {content}")

    # Twitter Card tags: <meta name="twitter:..."> only.
    for match in re.finditer(r'<meta name="twitter:([^"]+)" content="([^"]*)"', html_content):
        name, content = match.groups()
        meta_tags["twitter"].append(f"twitter:{name}: {content}")

    return meta_tags
47+
48+
49+
def extract_json_ld(html_content: str) -> dict | None:
50+
"""Extract JSON-LD structured data from HTML content."""
51+
match = re.search(
52+
r'<script type="application/ld\+json">\s*(\{.*?\})\s*</script>',
53+
html_content,
54+
re.DOTALL,
55+
)
56+
57+
if match:
58+
try:
59+
return json.loads(match.group(1))
60+
except json.JSONDecodeError as e:
61+
print(f"❌ Error parsing JSON-LD: {e}")
62+
return None
63+
64+
return None
65+
66+
67+
def _display_meta_tags(tags: list[str], tag_type: str) -> bool:
68+
"""Display meta tags of a specific type."""
69+
if tags:
70+
print(f"✅ {tag_type}:")
71+
for tag in tags:
72+
print(f" • {tag}")
73+
print()
74+
return True
75+
76+
print(f"⚠️ No {tag_type.lower()} found\n")
77+
return False
78+
79+
80+
def _display_json_ld(json_ld: dict | None) -> bool:
81+
"""Display JSON-LD structured data."""
82+
if not json_ld:
83+
print("⚠️ No JSON-LD structured data found\n")
84+
return False
85+
86+
print("✅ JSON-LD Structured Data:")
87+
print(f" • @type: {json_ld.get('@type', 'N/A')}")
88+
print(f" • headline: {json_ld.get('headline', 'N/A')}")
89+
90+
description = json_ld.get("description", "N/A")
91+
if description != "N/A":
92+
print(f" • description: {description[:80]}...")
93+
94+
if "keywords" in json_ld and isinstance(json_ld["keywords"], list):
95+
print(f" • keywords: {', '.join(json_ld['keywords'][:5])}")
96+
97+
if "audience" in json_ld:
98+
audience_type = json_ld["audience"].get("audienceType", [])
99+
if isinstance(audience_type, list):
100+
print(f" • audience: {', '.join(audience_type)}")
101+
102+
if "proficiencyLevel" in json_ld:
103+
print(f" • proficiency: {json_ld['proficiencyLevel']}")
104+
105+
print()
106+
return True
107+
108+
109+
def _display_no_metadata_help() -> None:
110+
"""Display help message when no metadata is found."""
111+
print("❌ No rich metadata found in this file.")
112+
print(" This could mean:")
113+
print(" • The page has no frontmatter")
114+
print(" • The extension is not enabled in conf.py")
115+
print(" • The template is not rendering {{ metatags }} or {{ rich_metadata }}")
116+
117+
118+
def verify_html_file(html_path: Path) -> bool:
    """
    Verify that a built HTML file contains rich metadata.

    Returns:
        True if metadata is present, False otherwise
    """
    # Guard clause: nothing to verify if the file does not exist.
    if not html_path.exists():
        print(f"❌ File not found: {html_path}")
        return False

    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Verifying: {html_path.name}")
    print(f"{banner}\n")

    html_content = html_path.read_text(encoding="utf-8")

    # Pull out every metadata flavour up front.
    tag_groups = extract_meta_tags(html_content)
    structured_data = extract_json_ld(html_content)

    # Display each category; a list literal (not a generator) ensures every
    # category is printed rather than short-circuiting on the first hit.
    has_metadata = any([
        _display_meta_tags(tag_groups["standard"], "Standard Meta Tags"),
        _display_meta_tags(tag_groups["open_graph"], "Open Graph Tags"),
        _display_meta_tags(tag_groups["twitter"], "Twitter Card Tags"),
        _display_json_ld(structured_data),
    ])

    if not has_metadata:
        _display_no_metadata_help()
        return False

    print("✅ Rich metadata extension is working!")
    return True
153+
154+
155+
def main() -> None:
    """Main entry point for the verification script."""
    parser = argparse.ArgumentParser(
        description="Verify rich metadata injection in built HTML files"
    )
    parser.add_argument(
        "html_files",
        nargs="+",
        type=Path,
        help="Path(s) to HTML file(s) to verify",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Show detailed output",
    )

    args = parser.parse_args()

    # Verify every file; the list literal forces all files to be checked
    # (and reported) instead of stopping at the first failure.
    all_passed = all([verify_html_file(html_file) for html_file in args.html_files])

    separator = "=" * 80
    print(f"\n{separator}")
    if all_passed:
        print("✅ All files verified successfully!")
    else:
        print("⚠️ Some files are missing metadata")
    print(f"{separator}\n")

    # Exit status mirrors the verification outcome for CI consumption.
    sys.exit(0 if all_passed else 1)
188+
189+
190+
if __name__ == "__main__":
191+
main()
192+

docs/_extensions/search_assets/templates/search.html

Lines changed: 32 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2,53 +2,48 @@
22
{# Enhanced Search Page - Clean template without embedded CSS/JS #}
33

44
{% block docs_body %}
5-
<div class="bd-search-container">
6-
<h1>{{ _("Search") }}</h1>
7-
8-
<noscript>
9-
<div class="alert alert-error">
10-
<p class="alert-title">{% trans %}Error{% endtrans %}</p>
11-
<p>{% trans %}Please activate JavaScript to enable the search functionality.{% endtrans %}</p>
12-
</div>
13-
</noscript>
14-
15-
{# Search and filter controls container - will be enhanced by JavaScript #}
16-
<div class="search-controls-container mb-4">
17-
<input
18-
type="search"
19-
class="form-control"
20-
id="enhanced-search-page-input"
21-
placeholder="Search NVIDIA documentation..."
22-
autocomplete="off"
23-
spellcheck="false"
24-
aria-label="Search documentation"
25-
>
5+
<div class="bd-search-container">
6+
<h1>{{ _("Search") }}</h1>
7+
8+
<noscript>
9+
<div class="alert alert-error">
10+
<p class="alert-title">{% trans %}Error{% endtrans %}</p>
11+
<p>{% trans %}Please activate JavaScript to enable the search functionality.{% endtrans %}</p>
2612
</div>
13+
</noscript>
14+
15+
{# Search and filter controls container - will be enhanced by JavaScript #}
16+
<div class="search-controls-container mb-4">
17+
<input type="search" class="form-control" id="enhanced-search-page-input"
18+
placeholder="Search NVIDIA documentation..." autocomplete="off" spellcheck="false"
19+
aria-label="Search documentation">
20+
</div>
2721

28-
{# Search results container #}
29-
<div id="enhanced-search-page-results">
30-
<div class="search-empty-state text-center py-4">
31-
<i class="fa-solid fa-magnifying-glass fa-2x mb-3 search-icon-primary"></i>
32-
<h4 class="search-empty-title">Search Documentation</h4>
33-
<p class="search-empty-text">Start typing to search across all documentation pages...</p>
34-
<div class="mt-3">
35-
<small class="search-empty-tips">
36-
<i class="fa-solid fa-lightbulb search-icon-primary"></i>
37-
<strong>Search Tips:</strong> Use specific terms for better results • Search includes titles, content, and headings
38-
</small>
39-
</div>
22+
{# Search results container #}
23+
<div id="enhanced-search-page-results">
24+
<div class="search-empty-state text-center py-4">
25+
<i class="fa-solid fa-magnifying-glass fa-2x mb-3 search-icon-primary"></i>
26+
<h4 class="search-empty-title">Search Documentation</h4>
27+
<p class="search-empty-text">Start typing to search across all documentation pages...</p>
28+
<div class="mt-3">
29+
<small class="search-empty-tips">
30+
<i class="fa-solid fa-lightbulb search-icon-primary"></i>
31+
<strong>Search Tips:</strong> Use specific terms for better results • Search includes titles, content, and
32+
headings
33+
</small>
4034
</div>
4135
</div>
4236
</div>
37+
</div>
4338
{% endblock docs_body %}
4439

4540
{# Page metadata #}
4641
{%- block htmltitle -%}
47-
<title>{{ _("Search") }} - {{ title or docstitle }}</title>
42+
<title>{{ _("Search") }} - {{ title or docstitle }}</title>
4843
{%- endblock htmltitle -%}
4944

5045
{# Load our enhanced search scripts #}
5146
{% block scripts -%}
52-
{{ super() }}
53-
{# Search page script is loaded via html_js_files in conf.py #}
54-
{%- endblock scripts %}
47+
{{ super() }}
48+
{# Search page script is loaded via html_js_files in conf.py #}
49+
{%- endblock scripts %}

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"json_output", # Generate JSON output for each page
4949
"llm_txt_output", # Generate LLM.txt output for each page
5050
"search_assets", # Enhanced search assets extension
51+
"rich_metadata", # SEO metadata injection from frontmatter
5152
# "ai_assistant", # AI assistant extension
5253
# "swagger_plugin_for_sphinx", # For Swagger API documentation
5354
"sphinxcontrib.mermaid", # For Mermaid diagrams

docs/curate-text/load-data/common-crawl.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ def main():
7373
pipeline.add_stage(cc_stage)
7474

7575
# Add output writer stage
76-
writer = JsonlWriter(output_dir="./cc_output")
76+
writer = JsonlWriter("./cc_output")
7777
pipeline.add_stage(writer)
7878

7979
# Run pipeline
@@ -94,7 +94,7 @@ To write Parquet instead of JSONL, use `ParquetWriter`:
9494
from nemo_curator.stages.text.io.writer import ParquetWriter
9595

9696
# Replace the JSONL writer with ParquetWriter
97-
writer = ParquetWriter(output_dir="./cc_output_parquet")
97+
writer = ParquetWriter("./cc_output_parquet")
9898
pipeline.add_stage(writer)
9999
```
100100

docs/curate-text/process-data/content-processing/add-id.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ pipeline = Pipeline(name="add_ids")
7979
# Add stages
8080
pipeline.add_stage(JsonlReader(file_paths="input/*.jsonl"))
8181
pipeline.add_stage(AddId(id_field="doc_id", id_prefix="v1"))
82-
pipeline.add_stage(JsonlWriter(output_path="output/"))
82+
pipeline.add_stage(JsonlWriter("output/"))
8383

8484
# Run pipeline
8585
result = pipeline.run()

docs/index.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
---
2+
description: "NeMo Curator is an open-source, scalable data curation platform for curating large datasets across text, image, video, and audio modalities to improve AI model training"
3+
categories:
4+
- documentation
5+
- home
6+
tags:
7+
- data-curation
8+
- multimodal
9+
- scalable
10+
- gpu-accelerated
11+
- distributed
12+
personas:
13+
- Data Scientists
14+
- Machine Learning Engineers
15+
- Cluster Administrators
16+
- DevOps Professionals
17+
difficulty: beginner
18+
content_type: index
19+
modality: universal
20+
---
21+
122
(curator-home)=
223

324
# NeMo Curator Documentation

0 commit comments

Comments
 (0)