Skip to content

Commit afc659f

Browse files
committed
feat: Centralize slugify utility, inline XML serialization, and consolidate load_dotenv calls.
1 parent 95351d5 commit afc659f

File tree

9 files changed

+64
-200
lines changed

9 files changed

+64
-200
lines changed

src/compendiumscribe/cli.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import click
88

9-
from .compendium import Compendium
9+
from .compendium import Compendium, slugify
1010
from .create_llm_clients import (
1111
MissingAPIKeyError,
1212
create_openai_client,
@@ -22,13 +22,6 @@
2222
)
2323

2424

25-
def _generate_slug(topic: str) -> str:
26-
slug = re.sub(r"[^a-z0-9]+", "-", topic.lower()).strip("-")
27-
if not slug:
28-
slug = "compendium"
29-
return slug
30-
31-
3225

3326
@click.group()
3427
def cli() -> None:
@@ -134,7 +127,7 @@ def handle_progress(update: ResearchProgress) -> None:
134127
if output_path:
135128
base_path = output_path.parent / output_path.stem
136129
else:
137-
slug = _generate_slug(topic)
130+
slug = slugify(topic)
138131
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
139132
base_path = Path(f"{slug}_{timestamp}")
140133

@@ -229,7 +222,7 @@ def recover(input_file: Path):
229222

230223
click.echo("Research completed! Writing outputs.")
231224

232-
slug = _generate_slug(topic)
225+
slug = slugify(topic)
233226
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
234227
base_path = Path(f"{slug}_{timestamp}")
235228

src/compendiumscribe/compendium/__init__.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
from .pdf import render_pdf
66
from .text_utils import (
77
format_html_text,
8-
format_plain_text,
9-
iter_markdown_links,
8+
slugify,
109
)
1110
from .xml_utils import etree_to_string
1211

@@ -17,7 +16,6 @@
1716
"Section",
1817
"render_pdf",
1918
"format_html_text",
20-
"format_plain_text",
21-
"iter_markdown_links",
19+
"slugify",
2220
"etree_to_string",
2321
]

src/compendiumscribe/compendium/compendium.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from .html_site_renderer import render_html_site
1010
from .markdown_renderer import render_markdown
1111
from .payload_parser import build_from_payload
12-
from .xml_serializer import build_xml_root, render_xml_string
12+
from .xml_utils import etree_to_string
1313

1414

1515
@dataclass
@@ -27,10 +27,54 @@ class Compendium:
2727
)
2828

2929
def to_xml(self) -> ET.Element:
30-
return build_xml_root(self)
30+
"""Return an XML element representing the compendium."""
31+
root = ET.Element(
32+
"compendium",
33+
attrib={
34+
"topic": self.topic,
35+
"generated_at": self.generated_at.replace(
36+
microsecond=0
37+
).isoformat(),
38+
},
39+
)
40+
41+
overview_elem = ET.SubElement(root, "overview")
42+
overview_elem.text = self.overview
43+
44+
if self.methodology:
45+
methodology_elem = ET.SubElement(root, "methodology")
46+
for step in self.methodology:
47+
ET.SubElement(methodology_elem, "step").text = step
48+
49+
if self.sections:
50+
sections_elem = ET.SubElement(root, "sections")
51+
for section in self.sections:
52+
sections_elem.append(section.to_xml())
53+
54+
if self.open_questions:
55+
questions_elem = ET.SubElement(root, "open_questions")
56+
for question in self.open_questions:
57+
ET.SubElement(questions_elem, "question").text = question
58+
59+
if self.citations:
60+
citations_elem = ET.SubElement(root, "citations")
61+
for citation in self.citations:
62+
citations_elem.append(citation.to_xml())
63+
64+
return root
3165

3266
def to_xml_string(self) -> str:
33-
return render_xml_string(self)
67+
"""Render the compendium to a UTF-8 XML string with CDATA wrapping."""
68+
cdata_tags = {
69+
"overview",
70+
"summary",
71+
"evidence",
72+
"implications",
73+
"step",
74+
"question",
75+
"title",
76+
}
77+
return etree_to_string(self.to_xml(), cdata_tags=cdata_tags)
3478

3579
def to_markdown(self) -> str:
3680
"""Render the compendium as human-readable Markdown."""

src/compendiumscribe/compendium/html_site_renderer.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,21 +5,13 @@
55
import html
66
from typing import TYPE_CHECKING
77

8-
from .text_utils import format_html_text
8+
from .text_utils import format_html_text, slugify
99

1010
if TYPE_CHECKING: # pragma: no cover - hints only
1111
from .compendium import Compendium
1212
from .entities import Citation, Section
1313

1414

15-
def _slugify(text: str) -> str:
16-
"""Convert text to a URL-friendly slug."""
17-
import re
18-
19-
slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")
20-
return slug or "page"
21-
22-
2315
def _html_head(title: str, depth: int = 0) -> list[str]:
2416
"""Generate HTML head section."""
2517
return [
@@ -50,7 +42,7 @@ def _nav_links(
5042
parts.append(" <li>Sections:")
5143
parts.append(" <ul>")
5244
for section in sections:
53-
section_slug = _slugify(section.identifier)
45+
section_slug = slugify(section.identifier)
5446
href = f"{prefix}sections/{section_slug}.html"
5547
label = html.escape(section.title)
5648
parts.append(f' <li><a href="{href}">{label}</a></li>')
@@ -105,7 +97,7 @@ def _render_index_page(compendium: "Compendium") -> str:
10597
parts.append(" <h2>Sections</h2>")
10698
parts.append(" <ul>")
10799
for section in compendium.sections:
108-
section_slug = _slugify(section.identifier)
100+
section_slug = slugify(section.identifier)
109101
href = f"sections/{section_slug}.html"
110102
label = html.escape(section.title)
111103
summary = format_html_text(section.summary)
@@ -320,7 +312,7 @@ def render_html_site(compendium: "Compendium") -> dict[str, str]:
320312

321313
# Section pages
322314
for section in compendium.sections:
323-
section_slug = _slugify(section.identifier)
315+
section_slug = slugify(section.identifier)
324316
path = f"sections/{section_slug}.html"
325317
files[path] = _render_section_page(section, compendium)
326318

src/compendiumscribe/compendium/text_utils.py

Lines changed: 6 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -5,69 +5,14 @@
55
import mistune
66

77

8-
def iter_markdown_links(text: str) -> Iterator[tuple[int, int, str, str]]:
9-
"""Yield ranges and components for Markdown-style inline links."""
8+
def slugify(text: str) -> str:
9+
"""Convert text to a URL-friendly slug."""
10+
import re
1011

11-
index = 0
12-
length = len(text)
13-
while index < length:
14-
# Avoid complexity if no '['
15-
start = text.find("[", index)
16-
if start == -1:
17-
break
12+
slug = re.sub(r"[^a-z0-9]+", "-", text.lower()).strip("-")
13+
return slug or "page"
1814

19-
end_label = text.find("]", start + 1)
20-
if end_label == -1:
21-
break
22-
if end_label + 1 >= length or text[end_label + 1] != "(":
23-
index = end_label + 1
24-
continue
2515

26-
url_start = end_label + 2
27-
depth = 1
28-
position = url_start
29-
while position < length and depth > 0:
30-
char = text[position]
31-
if char == "(":
32-
depth += 1
33-
elif char == ")":
34-
depth -= 1
35-
if depth == 0:
36-
break
37-
position += 1
38-
39-
if depth != 0:
40-
break
41-
42-
url_end = position
43-
label = text[start + 1:end_label]
44-
url = text[url_start:url_end]
45-
yield start, url_end + 1, label, url
46-
index = url_end + 1
47-
48-
49-
def format_plain_text(text: str) -> str:
50-
"""Replace Markdown-style links with plain text equivalents."""
51-
52-
if not text:
53-
return text
54-
55-
segments: list[str] = []
56-
cursor = 0
57-
transformed = False
58-
for start, end, label, url in iter_markdown_links(text):
59-
segments.append(text[cursor:start])
60-
clean_url = url.strip()
61-
replacement = f"{label} ({clean_url})" if clean_url else label
62-
segments.append(replacement)
63-
cursor = end
64-
transformed = True
65-
66-
if not transformed:
67-
return text
68-
69-
segments.append(text[cursor:])
70-
return "".join(segments)
7116

7217

7318
def format_html_text(text: str | None) -> str:
@@ -93,7 +38,6 @@ def format_html_text(text: str | None) -> str:
9338

9439

9540
__all__ = [
96-
"iter_markdown_links",
97-
"format_plain_text",
41+
"slugify",
9842
"format_html_text",
9943
]

src/compendiumscribe/compendium/xml_serializer.py

Lines changed: 0 additions & 71 deletions
This file was deleted.

src/compendiumscribe/research/__init__.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,7 @@
3030
)
3131
from .utils import (
3232
coerce_optional_string,
33-
first_non_empty,
3433
get_field,
35-
stringify_metadata_value,
36-
truncate_text,
3734
)
3835

3936
__all__ = [
@@ -57,8 +54,5 @@
5754
"execute_deep_research",
5855
"await_completion",
5956
"coerce_optional_string",
60-
"first_non_empty",
6157
"get_field",
62-
"stringify_metadata_value",
63-
"truncate_text",
6458
]

src/compendiumscribe/research/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@
1111

1212
from .errors import MissingConfigurationError
1313

14+
load_dotenv()
15+
1416

1517
@dataclass
1618
class ResearchConfig:
@@ -42,7 +44,6 @@ class ResearchConfig:
4244

4345

4446
def _default_deep_research_model() -> str:
45-
load_dotenv()
4647
# Check specific env var first, then fallback to generic
4748
model = os.getenv("DEEP_RESEARCH_MODEL") or os.getenv("RESEARCH_MODEL")
4849
if not model:
@@ -53,7 +54,6 @@ def _default_deep_research_model() -> str:
5354

5455

5556
def _default_prompt_refiner_model() -> str:
56-
load_dotenv()
5757
model = os.getenv("PROMPT_REFINER_MODEL")
5858
if not model:
5959
raise MissingConfigurationError(

0 commit comments

Comments (0)