fix: fall back to architecture.md H1 when threats.md lacks project name

davidmatousek · claude · davidmatousek · commit 9c5a20887fb3 · 2026-04-14T19:50:11.000-04:00
The orchestrator output template writes a literal `# Threat Model Report`
H1 that matches neither of `parse_project_name()`'s accepted formats
(`# {Name} Threat Model` or `# Threat Model: {Name}`), causing every PDF
cover page and infographic to read "Unknown Project" unless `--title`
was passed explicitly.

Add an architecture.md H1 fallback that runs after threats.md parsing
fails. Feature 120 already snapshots architecture.md into each run's
output directory, so the fallback is reliable for real pipeline runs.
Supports both em-dash orderings observed in the wild (name first or name last):

  # {Name} — Architecture               (example convention)
  # Security Architecture — {Name}      (user projects)
  # Architecture — {Name}

Precedence: `--title` override > threats.md H1 > architecture.md H1 >
"Unknown Project".

Regenerated the 2 baseline PDFs whose examples ship architecture.md
(web-app, microservices) under SOURCE_DATE_EPOCH=1700000000 per
ADR-021. The 3 examples without architecture.md (ascii-web-api,
mermaid-agentic-app, free-text-microservice) still resolve to
"Unknown Project" and their baselines are byte-identical.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/examples/microservices/security-report.pdf.baseline b/examples/microservices/security-report.pdf.baseline
diff --git a/examples/web-app/security-report.pdf.baseline b/examples/web-app/security-report.pdf.baseline
diff --git a/scripts/extract-infographic-data.py b/scripts/extract-infographic-data.py
@@ -1576,7 +1576,7 @@ def main():
 
     # Parse frontmatter and project name
     frontmatter = parse_frontmatter(threats_content)
-    project_name = parse_project_name(threats_content)
+    project_name = parse_project_name(threats_content, target_dir=target_dir)
 
     # Parse baseline metadata for delta-aware output
     baseline = parse_baseline_frontmatter(threats_content)
diff --git a/scripts/extract-report-data.py b/scripts/extract-report-data.py
@@ -1591,7 +1591,7 @@ def main():
     frontmatter = parse_frontmatter(threats_content)
 
     # Parse project name
-    project_name = parse_project_name(threats_content, args.title)
+    project_name = parse_project_name(threats_content, args.title, target_dir)
 
     # Parse baseline metadata for delta-aware output
     baseline = parse_baseline_frontmatter(threats_content)
diff --git a/scripts/tachi_parsers.py b/scripts/tachi_parsers.py
@@ -286,36 +286,84 @@ def parse_baseline_frontmatter(content: str) -> dict:
 # Project Name Parser
 # =============================================================================
 
-def parse_project_name(content: str, title_override: str = None) -> str:
-    """Extract project name from threats.md H1 heading.
-
-    Supports two heading formats:
-      - "# {Name} Threat Model" (orchestrator output format)
-      - "# Threat Model: {Name}" (legacy format)
-
-    Args:
-        content: threats.md content.
-        title_override: If provided, use this instead of auto-detected name.
-
-    Returns:
-        Project name string.
+def parse_project_name(
+    content: str,
+    title_override: str = None,
+    target_dir: Path = None,
+) -> str:
+    """Extract project name from threats.md H1, with architecture.md fallback.
+
+    Precedence:
+      1. ``title_override`` if provided (CLI --title wins)
+      2. threats.md H1 in one of the two recognized formats
+      3. architecture.md H1 in ``target_dir`` (snapshot from Feature 120)
+      4. ``"Unknown Project"`` fallback
+
+    Recognized threats.md H1 formats:
+      - ``# {Name} Threat Model``
+      - ``# Threat Model: {Name}``
+
+    Recognized architecture.md H1 formats (em-dash separated):
+      - ``# {Name} — Architecture`` (example convention)
+      - ``# Security Architecture — {Name}`` / ``# Architecture — {Name}``
+
+    The current orchestrator output template writes a literal
+    ``# Threat Model Report`` H1, which matches neither threats.md format, so
+    the architecture.md fallback recovers the name for real pipeline runs that
+    snapshot architecture.md alongside threats.md.
     """
     if title_override:
         return title_override
 
-    # Format 1: "# {Name} Threat Model" (orchestrator output)
     match = re.search(r"^#\s+(.+?)\s+Threat Model\s*$", content, re.MULTILINE)
     if match:
         return match.group(1).strip()
 
-    # Format 2: "# Threat Model: {Name}" (legacy)
     match = re.search(r"^#\s+Threat Model:\s*(.+)$", content, re.MULTILINE)
     if match:
         return match.group(1).strip()
 
+    if target_dir is not None:
+        arch_name = _parse_architecture_project_name(target_dir)
+        if arch_name:
+            return arch_name
+
     return "Unknown Project"
 
 
+def _parse_architecture_project_name(target_dir: Path):
+    """Extract project name from architecture.md H1 in ``target_dir``.
+
+    Returns None when architecture.md is absent, unreadable, or has no
+    parseable H1 in the recognized em-dash formats.
+    """
+    arch_path = target_dir / "architecture.md"
+    if not arch_path.is_file():
+        return None
+
+    try:
+        arch_content = arch_path.read_text(encoding="utf-8")
+    except OSError:
+        return None
+
+    match = re.search(r"^#\s+(.+)$", arch_content, re.MULTILINE)
+    if not match:
+        return None
+
+    heading = match.group(1).strip()
+    parts = [p.strip() for p in heading.split(" — ")]
+    if len(parts) != 2:
+        return None
+
+    left, right = parts
+    if left.lower() in ("architecture", "security architecture"):
+        return right or None
+    if right.lower() == "architecture":
+        return left or None
+
+    return None
+
+
 # =============================================================================
 # Artifact Detection
 # =============================================================================
diff --git a/tests/scripts/test_project_name_parser.py b/tests/scripts/test_project_name_parser.py
@@ -0,0 +1,98 @@
+"""Unit tests for ``parse_project_name`` in ``scripts/tachi_parsers.py``.
+
+Covers precedence ordering (title override > threats.md H1 > architecture.md
+fallback > "Unknown Project"), both threats.md formats, the architecture.md
+em-dash formats, and edge cases (missing file, malformed H1, extra whitespace).
+"""
+
+import importlib.util
+import sys
+from pathlib import Path
+
+import pytest
+
+
+REPO_ROOT = Path(__file__).resolve().parents[2]
+PARSER_PATH = REPO_ROOT / "scripts" / "tachi_parsers.py"
+
+
+def _load_parser_module():
+    spec = importlib.util.spec_from_file_location("tachi_parsers", PARSER_PATH)
+    module = importlib.util.module_from_spec(spec)
+    sys.modules["tachi_parsers"] = module
+    spec.loader.exec_module(module)
+    return module
+
+
+tachi_parsers = _load_parser_module()
+parse_project_name = tachi_parsers.parse_project_name
+
+
+class TestTitleOverride:
+    def test_title_override_wins_over_all_sources(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Alpha — Architecture\n")
+        content = "# Beta Threat Model\n"
+        assert parse_project_name(
+            content, title_override="Gamma", target_dir=tmp_path
+        ) == "Gamma"
+
+    def test_empty_title_override_falls_through(self):
+        assert parse_project_name("# Alpha Threat Model\n", title_override="") == "Alpha"
+
+
+class TestThreatsMdFormats:
+    def test_format_orchestrator_output(self):
+        assert parse_project_name("# Alpha Threat Model\n") == "Alpha"
+
+    def test_format_legacy_colon(self):
+        assert parse_project_name("# Threat Model: Beta\n") == "Beta"
+
+    def test_multiword_name(self):
+        assert parse_project_name("# Web Application Threat Model\n") == "Web Application"
+
+    def test_name_with_hyphens(self):
+        assert parse_project_name("# Threat Model: second-brain-mcp\n") == "second-brain-mcp"
+
+
+class TestArchitectureMdFallback:
+    def test_name_before_architecture_suffix(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Web Application — Architecture\n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Web Application"
+
+    def test_name_after_security_architecture_prefix(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Security Architecture — second-brain-mcp\n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "second-brain-mcp"
+
+    def test_name_after_plain_architecture_prefix(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Architecture — my-service\n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "my-service"
+
+    def test_no_architecture_file_falls_back_to_unknown(self, tmp_path):
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Unknown Project"
+
+    def test_target_dir_none_skips_fallback(self):
+        assert parse_project_name("# Threat Model Report\n") == "Unknown Project"
+
+    def test_architecture_without_em_dash_is_ignored(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Plain Heading\n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Unknown Project"
+
+    def test_architecture_with_hyphen_instead_of_em_dash_is_ignored(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Web Application - Architecture\n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Unknown Project"
+
+    def test_threats_h1_wins_over_architecture(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("# Alpha — Architecture\n")
+        content = "# Beta Threat Model\n"
+        assert parse_project_name(content, target_dir=tmp_path) == "Beta"
+
+    def test_architecture_with_extra_whitespace(self, tmp_path):
+        (tmp_path / "architecture.md").write_text("#   Web Application   —   Architecture   \n")
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Web Application"
+
+    def test_architecture_h1_must_be_first_heading(self, tmp_path):
+        # Still accepts first H1 even with multiple H1s present
+        (tmp_path / "architecture.md").write_text(
+            "# Web Application — Architecture\n\n# Another Heading\n"
+        )
+        assert parse_project_name("# Threat Model Report\n", target_dir=tmp_path) == "Web Application"