fix: strip conversational preambles from extracted artifact titles

marian2js · marian2js · commit 12d87a237cab · 2026-04-05T00:00:04.000+02:00
diff --git a/packages/sidecar/src/artifact-extractor/extractor.ts b/packages/sidecar/src/artifact-extractor/extractor.ts
@@ -2,6 +2,7 @@ import type { ArtifactRecord, ArtifactService, ArtifactType, CreateArtifactOptio
 import type { OpenGoatPaths } from "@opengoat/core";
 import { detectSections, matchHeadingToOutputType } from "./content-detector.ts";
 import { mapOutputTypeToArtifactType } from "./output-type-mapper.ts";
+import { cleanSectionTitle } from "./title-cleaner.ts";
 
 export interface ExtractionContext {
   specialistId: string;
@@ -45,7 +46,7 @@ function deriveBundleTitle(specialistName: string, sections: MatchedSection[]):
     return `${specialistName}: ${typeName} Bundle`;
   }
 
-  return `${specialistName}: ${sections[0].heading}`;
+  return `${specialistName}: ${cleanSectionTitle(sections[0].heading, sections[0].content, sections[0].artifactType)}`;
 }
 
 /**
@@ -102,9 +103,10 @@ export async function extractArtifacts(
   // Pass 2: Create artifacts with optional bundleId
   const artifacts: ArtifactRecord[] = [];
   for (const section of matched) {
+    const title = cleanSectionTitle(section.heading, section.content, section.artifactType);
     const options: CreateArtifactOptions = {
       projectId: context.agentId,
-      title: section.heading,
+      title,
       type: section.artifactType,
       format: "markdown",
       contentRef: `chat://${context.sessionId}/${context.messageIndex ?? 0}`,
diff --git a/packages/sidecar/src/artifact-extractor/index.ts b/packages/sidecar/src/artifact-extractor/index.ts
@@ -3,6 +3,7 @@ export type { DetectedSection } from "./content-detector.ts";
 export { mapOutputTypeToArtifactType } from "./output-type-mapper.ts";
 export { extractArtifacts } from "./extractor.ts";
 export type { ExtractionContext, ExtractionResult } from "./extractor.ts";
+export { cleanSectionTitle, isConversationalTitle } from "./title-cleaner.ts";
 export { extractSessionId } from "./session-id.ts";
 export { bundleUnbundledArtifacts } from "./bundle-grouper.ts";
 export type { BundleGrouperDeps, BundleGrouperResult } from "./bundle-grouper.ts";
diff --git a/packages/sidecar/src/artifact-extractor/title-cleaner.ts b/packages/sidecar/src/artifact-extractor/title-cleaner.ts
@@ -0,0 +1,74 @@
+/**
+ * Regex matching conversational preamble that should not appear as an artifact title.
+ * Case-insensitive, anchored to start; the trailing `\b` group keeps "Sure" from matching "Surefire".
+ */
+const CONVERSATIONAL_PATTERN =
+  /^(I |I'm |I'll |I've |I can |Here |Here'|OK |Well |So |Based on |According to |After reviewing |After analyzing |Looking at |From the |From my |Given |Pulling |Checking |Reviewing |Analyzing |To help |In order to |For this |For your |Assuming |(?:I don't|I can't|I still|I checked|Got it|Let me|Sure|Okay|Hmm+|As requested|Absolutely)\b)/i;
+
+/**
+ * Replaces typographic single quotes (U+2018, U+2019, U+201A) with `'` and
+ * typographic double quotes (U+201C, U+201D, U+201E) with `"`.
+ */
+function normalizeQuotes(text: string): string {
+  const curlySingles = /[\u2018\u2019\u201A]/g;
+  const curlyDoubles = /[\u201C\u201D\u201E]/g;
+  return text.replace(curlySingles, "'").replace(curlyDoubles, '"');
+}
+
+/**
+ * Strips inline markdown (bold, italic, strikethrough, code backticks, links); code-span contents and snake_case words stay verbatim.
+ */
+function stripInlineMarkdown(text: string): string {
+  const code: string[] = []; // code-span contents, masked so emphasis rules cannot rewrite e.g. `__init__`
+  return text.replace(/`(.+?)`/g, (_m, body: string) => `\u0000${code.push(body) - 1}\u0000`)
+    .replace(/\*\*(.+?)\*\*/g, "$1")
+    .replace(/\*(.+?)\*/g, "$1")
+    .replace(/~~(.+?)~~/g, "$1")
+    .replace(/(?<![\p{L}\p{N}])__(.+?)__(?![\p{L}\p{N}])/gu, "$1")
+    .replace(/(?<![\p{L}\p{N}])_(.+?)_(?![\p{L}\p{N}])/gu, "$1")
+    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
+    .replace(/\u0000(\d+)\u0000/g, (_m, i: string) => code[Number(i)] ?? "").trim();
+}
+
+/**
+ * Reports whether a title opens with AI-style conversational preamble instead of
+ * naming the artifact. The title is trimmed and quote-normalized before matching.
+ */
+export function isConversationalTitle(title: string): boolean {
+  return normalizeQuotes(title.trim()).search(CONVERSATIONAL_PATTERN) === 0;
+}
+
+/**
+ * Produces a clean artifact title from a section heading.
+ *
+ * 1. Strips inline markdown from `heading`; returns it if non-empty and not preamble.
+ * 2. Otherwise returns the first usable ATX heading in `content` outside fenced code.
+ * 3. Otherwise returns `artifactType` humanized ("copy_draft" → "Copy Draft").
+ * @returns The chosen title; empty only when `artifactType` itself is empty.
+ */
+export function cleanSectionTitle(
+  heading: string,
+  content: string,
+  artifactType: string,
+): string {
+  const stripped = stripInlineMarkdown(heading);
+  // An empty heading is never a usable title, so it takes the same recovery path.
+  if (stripped !== "" && !isConversationalTitle(stripped)) {
+    return stripped;
+  }
+
+  // Drop fenced code blocks first: a `# comment` inside one is not a heading.
+  const prose = content.replace(/^[ \t]*(`{3,}|~{3,}).*$[\s\S]*?^[ \t]*\1[ \t\r]*$/gm, "");
+  // `[ \t]+` (not `\s+`) keeps a match on one line, so a bare `##` cannot capture the next paragraph.
+  const headingRegex = /^#{1,6}[ \t]+(.+)$/gm;
+  for (let match = headingRegex.exec(prose); match !== null; match = headingRegex.exec(prose)) {
+    // Remove an optional closing `#` run ("## Title ##") before stripping inline markup.
+    const candidate = stripInlineMarkdown(match[1].trim().replace(/[ \t]+#+$/, ""));
+    if (candidate !== "" && !isConversationalTitle(candidate)) {
+      return candidate;
+    }
+  }
+
+  // Fallback: humanized artifact type
+  return artifactType.replace(/_/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
+}
diff --git a/test/sidecar/artifact-extractor.test.ts b/test/sidecar/artifact-extractor.test.ts
@@ -797,6 +797,161 @@ Here's a detailed market brief with positioning opportunities and whitespace are
   });
 });
 
+// ---------------------------------------------------------------------------
+// extractArtifacts: conversational preamble title cleaning
+// ---------------------------------------------------------------------------
+describe("extractArtifacts — conversational preamble cleaning", () => {
+  function createMockDeps() {
+    // Records every artifact the mocked service "creates", in call order.
+    const createdArtifacts: unknown[] = [];
+    const createArtifact = vi.fn().mockImplementation(async (_paths: unknown, opts: unknown) => {
+      const created = { artifactId: `art-${createdArtifacts.length + 1}`, ...(opts as object) };
+      createdArtifacts.push(created);
+      return created;
+    });
+    const createBundle = vi.fn().mockImplementation(async (_paths: unknown, opts: unknown) => ({
+      bundleId: "bnd-mock-1",
+      ...(opts as object),
+    }));
+    const artifactService = { createArtifact, createBundle };
+    return { artifactService, opengoatPaths: { homeDir: "/tmp/test" }, createdArtifacts };
+  }
+  // Extraction context shared by the cases below; a case may override fields by spreading it.
+  const baseContext: ExtractionContext = {
+    specialistId: "market-intel",
+    agentId: "proj-1",
+    sessionId: "sess-1",
+    messageIndex: 0,
+  };
+
+  it("strips conversational heading and uses content heading instead", async () => {
+    const { artifactService, opengoatPaths } = createMockDeps();
+    const text = `Some intro text
+
+## Here's a structured readout of the main messaging gaps
+
+After analyzing the competitive landscape, the key gaps are clear.
+
+### Messaging Gap Analysis
+
+| Gap Area | Your Position | Competitor Position |
+|---|---|---|
+| AI-native | Strong | Weak |
+| Developer DX | Moderate | Strong |
+
+This analysis shows significant opportunities in the AI-native space.
+`;
+
+    const specialist = {
+      id: "market-intel",
+      name: "Market Intel",
+      outputTypes: ["competitor messaging matrix", "community shortlist", "market brief"],
+    };
+
+    const result = await extractArtifacts(text, baseContext, {
+      artifactService: artifactService as any,
+      opengoatPaths: opengoatPaths as any,
+      specialist: specialist as any,
+    });
+
+    // The `##` heading is conversational preamble, so the title is expected to come from the `###` heading in the section body.
+    expect(result.artifacts).toHaveLength(1);
+    const callArgs = artifactService.createArtifact.mock.calls[0][1];
+    expect(callArgs.title).toBe("Messaging Gap Analysis");
+  });
+
+  it("falls back to humanized artifact type when no content heading exists", async () => {
+    const { artifactService, opengoatPaths } = createMockDeps();
+    // Heading is conversational but shares tokens ("competitor", "messaging") with an output type — presumably enough for the section to still be detected.
+    const text = `## Here's the competitor messaging breakdown you asked for
+
+A detailed competitor messaging matrix showing positioning gaps across all major players in the B2B SaaS market for marketing intelligence tools.
+
+| Competitor | Positioning | Key Claim | Weakness |
+|---|---|---|---|
+| Acme Corp | "All-in-one platform" | Speed | No customization |
+| Beta Inc | "Enterprise-grade" | Security | Expensive |
+`;
+
+    const specialist = {
+      id: "market-intel",
+      name: "Market Intel",
+      outputTypes: ["competitor messaging matrix", "community shortlist"],
+    };
+
+    const result = await extractArtifacts(text, baseContext, {
+      artifactService: artifactService as any,
+      opengoatPaths: opengoatPaths as any,
+      specialist: specialist as any,
+    });
+
+    expect(result.artifacts).toHaveLength(1);
+    const callArgs = artifactService.createArtifact.mock.calls[0][1];
+    // The body has no markdown heading and the section heading is conversational, so the title falls back to the humanized artifact type label.
+    expect(callArgs.title).toBe("Matrix");
+  });
+
+  it("preserves clean descriptive headings unchanged", async () => {
+    const { artifactService, opengoatPaths } = createMockDeps();
+    const text = `## Competitor Messaging Matrix
+
+| Competitor | Positioning | Key Claim | Weakness |
+|---|---|---|---|
+| Acme Corp | "All-in-one platform" | Speed | No customization |
+| Beta Inc | "Enterprise-grade" | Security | Expensive |
+
+Key gaps: None of your competitors emphasize the "AI-native" angle.
+`;
+
+    const specialist = {
+      id: "market-intel",
+      name: "Market Intel",
+      outputTypes: ["competitor messaging matrix", "community shortlist"],
+    };
+
+    const result = await extractArtifacts(text, baseContext, {
+      artifactService: artifactService as any,
+      opengoatPaths: opengoatPaths as any,
+      specialist: specialist as any,
+    });
+    // A descriptive, non-conversational heading must be used as the title verbatim.
+    expect(result.artifacts).toHaveLength(1);
+    const callArgs = artifactService.createArtifact.mock.calls[0][1];
+    expect(callArgs.title).toBe("Competitor Messaging Matrix");
+  });
+
+  it("strips markdown bold from heading", async () => {
+    const { artifactService, opengoatPaths } = createMockDeps();
+    const text = `## **Hero Rewrite Options**
+
+Here are three hero rewrite options for your landing page:
+
+1. **Option A** — "Ship faster with AI-powered workflows"
+2. **Option B** — "Your team's second brain for shipping"
+3. **Option C** — "From idea to production in minutes"
+`;
+
+    const specialist = {
+      id: "website-conversion",
+      name: "Website Conversion",
+      outputTypes: ["hero rewrite bundle", "CTA options"],
+    };
+
+    const result = await extractArtifacts(text, {
+      ...baseContext,
+      specialistId: "website-conversion",
+    }, {
+      artifactService: artifactService as any,
+      opengoatPaths: opengoatPaths as any,
+      specialist: specialist as any,
+    });
+    // The `**` bold markers around the heading must not appear in the title.
+    expect(result.artifacts).toHaveLength(1);
+    const callArgs = artifactService.createArtifact.mock.calls[0][1];
+    expect(callArgs.title).toBe("Hero Rewrite Options");
+  });
+});
+
 // ---------------------------------------------------------------------------
 // Manual extraction route: POST /extract
 // ---------------------------------------------------------------------------
diff --git a/test/sidecar/title-cleaner.test.ts b/test/sidecar/title-cleaner.test.ts