
Commit 6f5cea2

include non-markdown docs in indexer
1 parent 7fe8ce1 commit 6f5cea2

File tree

1 file changed: +174 -8 lines


scripts/typesense_indexer.py

Lines changed: 174 additions & 8 deletions
@@ -2,17 +2,56 @@
 import pathlib
 import datetime
 import re
+import sys
 import yaml
 import logging
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Dict, List, Any, Optional
+
 import typesense
+import reflex as rx
+from reflex.utils.imports import ImportVar
+
+# Add the project root to the sys.path
+project_root = pathlib.Path(__file__).resolve().parent.parent
+if str(project_root) not in sys.path:
+    sys.path.insert(0, str(project_root))
+
+from pcweb.pages.docs.source import Source, generate_docs
+from pcweb.pages.docs.apiref import modules
+from pcweb.pages.docs.env_vars import env_vars_page, EnvVarDocs
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
 
 ACRONYMS = {"AI", "API", "HTTP", "HTTPS", "SQL", "JSON", "XML", "CPU", "GPU", "OAuth", "CLI", "URL", "DNS", "IP", "UI", "MCP"}
 
+def _render_component_to_text(c: Any) -> str:
+    """Render a Reflex component to a text string."""
+    if not isinstance(c, rx.Component):
+        if isinstance(c, rx.Var):
+            return str(c._var_value)
+        if isinstance(c, (str, int, float, bool)):
+            return str(c)
+        return ""
+
+    texts = [_render_component_to_text(child) for child in c.children]
+    return " ".join(filter(None, texts))
+
+def _extract_headings_from_component(c: Any) -> List[str]:
+    """Extract headings from a component tree."""
+    headings = []
+    if not isinstance(c, rx.Component):
+        return headings
+
+    if c.tag and c.tag.startswith('h') and c.tag[1:].isdigit():
+        headings.append(_render_component_to_text(c))
+
+    for child in c.children:
+        headings.extend(_extract_headings_from_component(child))
+
+    return headings
+
 CLUSTERS = {
     "All Content": [],
     "AI Builder": ["ai_builder"],
@@ -59,9 +98,14 @@ def __init__(self):
         self.client = typesense.Client(TYPESENSE_CONFIG)
 
     def smart_title_case(self, name: str) -> str:
-        def fix_word(word: str) -> str:
-            return word.upper() if word.upper() in ACRONYMS else word.capitalize()
-        return " ".join(fix_word(w) for w in name.split())
+        words = name.split(' ')
+        title_cased_words = []
+        for word in words:
+            if word.upper() in ACRONYMS:
+                title_cased_words.append(word.upper())
+            else:
+                title_cased_words.append(word.capitalize())
+        return " ".join(title_cased_words)
 
     def clean_name(self, name: str) -> str:
         if name.lower().endswith(".md"):
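
The rewritten smart_title_case keeps the behavior of the old fix_word closure: words found in ACRONYMS are upper-cased, everything else is capitalized. The only subtle difference is that it now splits on literal spaces rather than arbitrary whitespace. A quick sketch of the expected output (the TypesenseIndexer class name is an assumption; the class name is not shown in this hunk):

indexer = TypesenseIndexer()  # assumed name of the indexer class defined in this script
print(indexer.smart_title_case("api reference"))      # -> "API Reference"
print(indexer.smart_title_case("ai builder"))         # -> "AI Builder"
print(indexer.smart_title_case("custom components"))  # -> "Custom Components"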
@@ -147,7 +191,7 @@ def process_doc_file(self, docs_path: str, file: str, root: str) -> Optional[dic
         for i, p in enumerate(parts):
             is_last = i == len(parts) - 1
             if is_last:
-                if filename_no_ext.endswith("-ll"):
+                if filename_no_ext.lower().endswith("-ll"):
                     parts_clean.append("Low Level")
                 else:
                     parts_clean.append(self.clean_name(filename_no_ext))
@@ -159,7 +203,7 @@ def process_doc_file(self, docs_path: str, file: str, root: str) -> Optional[dic
             url_parts[-1] = url_parts[-1].replace("-ll", "/low")
 
         url = "/" + "/".join(url_parts)
-        name = self.name_from_url(f"docs{url}")
+        name = " ".join(parts_clean)
 
         full_content = self.summarize_markdown(file_path, max_lines=100)
         components = self.extract_components(file_path)
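
Taken together, the two one-line changes above affect how low-level pages are titled and linked: the "-ll" suffix check is now case-insensitive, and the display name is assembled from the cleaned path parts instead of being re-derived from the URL. A rough sketch for a hypothetical file, with parts_clean assumed from the cleaning loop in the previous hunk:

# Hypothetical input: docs/events/background-events-ll.md
filename_no_ext = "background-events-ll"
filename_no_ext.lower().endswith("-ll")        # True -> last title part becomes "Low Level"
url_parts = ["events", "background-events-ll"]
url_parts[-1] = url_parts[-1].replace("-ll", "/low")
url = "/" + "/".join(url_parts)                # -> "/events/background-events/low"
parts_clean = ["Events", "Low Level"]          # assumed output of the cleaning loop
name = " ".join(parts_clean)                   # -> "Events Low Level"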
@@ -187,6 +231,126 @@ def process_doc_file(self, docs_path: str, file: str, root: str) -> Optional[dic
             "parts": parts_clean,
         }
 
+    def _index_programmatic_docs(self) -> List[dict]:
+        logger.info("Processing programmatic docs...")
+        documents = []
+
+        # Process API reference pages
+        for module in modules:
+            if isinstance(module, tuple):
+                module, *extra_modules = module
+                extra_fields = []
+                for extra_module in extra_modules:
+                    s_extra = Source(module=extra_module)
+                    extra_fields.extend(s_extra.get_fields())
+            else:
+                extra_fields = None
+            s = Source(module=module)
+            name = module.__name__.lower()
+
+            # Get the content from the source object directly
+            content_parts = []
+            headings = []
+
+            overview = s.get_overview()
+            if overview:
+                content_parts.append(overview)
+
+            class_fields = s.get_class_fields()
+            if class_fields:
+                content_parts.append("\n## Class Fields\n")
+                headings.append("Class Fields")
+                for field in class_fields:
+                    prop = field.get("prop")
+                    if not prop: continue
+                    prop_name = getattr(prop, 'name', '')
+                    description = field.get("description", "")
+                    content_parts.append(f"### {prop_name}\n{description}\n")
+                    headings.append(prop_name)
+
+            fields = s.get_fields()
+            if extra_fields:
+                fields.extend(extra_fields)
+            if fields:
+                content_parts.append("\n## Fields\n")
+                headings.append("Fields")
+                for field in fields:
+                    prop = field.get("prop")
+                    if not prop: continue
+                    prop_name = getattr(prop, 'name', '')
+                    description = field.get("description", "")
+                    content_parts.append(f"### {prop_name}\n{description}\n")
+                    headings.append(prop_name)
+
+            methods = s.get_methods()
+            if methods:
+                content_parts.append("\n## Methods\n")
+                headings.append("Methods")
+                for method in methods:
+                    method_name = method.get("name", "")
+                    signature = method.get("signature", "")
+                    description = method.get("description", "")
+                    content_parts.append(f"### {method_name}{signature}\n{description}\n")
+                    headings.append(f"{method_name}{signature}")
+
+            content = "\n".join(content_parts)
+
+            url_path = f"/api-reference/{name}"
+            title = self.name_from_url(f"docs{url_path}")
+            path = f"api-reference/{name}"
+
+            documents.append({
+                "id": path,
+                "title": title,
+                "content": self.clean_markdown(content),
+                "headings": headings,
+                "path": path,
+                "url": f"docs{url_path}",
+                "section": "API Reference",
+                "subsection": name,
+                "cluster": "API Reference",
+                "is_blog": False,
+                "parts": ["API Reference", title],
+                "components": [],
+            })
+
+        # Process Environment Variables page
+        env_var_url_path = "/api-reference/environment-variables"
+        env_var_title = self.name_from_url(f"docs{env_var_url_path}")
+        env_var_path = "api-reference/environment-variables"
+
+        all_vars = EnvVarDocs.get_all_env_vars()
+        content_parts = [
+            "Reflex provides a number of environment variables that can be used to configure the behavior of your application. These environment variables can be set in your shell environment or in a .env file. This page documents all available environment variables in Reflex."
+        ]
+        headings = ["Environment Variables"]
+        for name, var in all_vars:
+            if not getattr(var, "internal", False):
+                docstring = EnvVarDocs.get_env_var_docstring(name) or ""
+                var_type = var.type_.__name__ if hasattr(var.type_, "__name__") else str(var.type_)
+                content_parts.append(f"{var.name}: {docstring} (Type: {var_type}, Default: {var.default})")
+                headings.append(var.name)
+
+        content = "\n".join(content_parts)
+
+        documents.append({
+            "id": env_var_path,
+            "title": env_var_title,
+            "content": self.clean_markdown(content),
+            "headings": headings,
+            "path": env_var_path,
+            "url": f"docs{env_var_url_path}",
+            "section": "API Reference",
+            "subsection": "Environment Variables",
+            "cluster": "API Reference",
+            "is_blog": False,
+            "parts": ["API Reference", env_var_title],
+            "components": [],
+        })
+
+        logger.info(f"Found {len(documents)} programmatic docs.")
+        return documents
+
     def extract_frontmatter(self, md_path: str) -> dict:
         """Your existing frontmatter extraction"""
         with open(md_path, "r", encoding="utf-8") as f:
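
For the environment-variables page, each non-internal variable is flattened into a single searchable line of the form "NAME: docstring (Type: ..., Default: ...)". A sketch of what one such content line looks like, with entirely hypothetical values standing in for the real data from EnvVarDocs:

# Hypothetical variable; real names, docstrings, and defaults come from EnvVarDocs.
var_name, docstring, var_type, default = "REFLEX_SOME_FLAG", "Enable some feature.", "bool", "False"
line = f"{var_name}: {docstring} (Type: {var_type}, Default: {default})"
print(line)  # -> "REFLEX_SOME_FLAG: Enable some feature. (Type: bool, Default: False)"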
@@ -263,6 +427,8 @@ def create_collection(self, force_recreate: bool = False) -> bool:
     def index_documents(self, docs_path: str, blog_path: str, max_workers: int = 4, batch_size: int = 100) -> bool:
         """Index both docs and blog files"""
         try:
+            programmatic_docs = self._index_programmatic_docs()
+
             docs_files = []
             for root, _, files in os.walk(docs_path):
                 for file in files:
@@ -279,7 +445,7 @@ def index_documents(self, docs_path: str, blog_path: str, max_workers: int = 4,
             all_files = docs_files + blog_files
             logger.info(f"Found {len(docs_files)} docs and {len(blog_files)} blog files")
 
-            documents = []
+            documents = programmatic_docs
             processed = 0
 
             with ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -297,11 +463,11 @@ def index_documents(self, docs_path: str, blog_path: str, max_workers: int = 4,
                     if len(documents) >= batch_size:
                         self._index_batch(documents)
                         documents = []
-                        logger.info(f"Processed {processed}/{len(all_files)} files")
+                        logger.info(f"Processed {processed}/{len(all_files)} files (plus programmatic docs)")
 
             if documents:
                 self._index_batch(documents)
-                logger.info(f"Processed {processed}/{len(all_files)} files")
+                logger.info(f"Processed {processed}/{len(all_files)} files (plus programmatic docs)")
 
             logger.info("Indexing completed successfully!")
             return True
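
_index_batch is defined elsewhere in the script and is untouched by this commit; for context, a minimal sketch of what such a batch upsert could look like with the official typesense Python client (the "docs" collection name and the error handling are assumptions, not taken from this diff):

def _index_batch(self, documents: List[dict]) -> None:
    """Illustrative sketch only: bulk-upsert one batch of documents."""
    results = self.client.collections["docs"].documents.import_(
        documents, {"action": "upsert"}
    )
    failures = [r for r in results if not r.get("success")]
    if failures:
        logger.warning(f"{len(failures)} documents failed to index")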
