Skip to content

Commit 12d87a2

Browse files
committed
fix: strip conversational preambles from extracted artifact titles
1 parent 8217cf3 commit 12d87a2

5 files changed

Lines changed: 397 additions & 2 deletions

File tree

packages/sidecar/src/artifact-extractor/extractor.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ import type { ArtifactRecord, ArtifactService, ArtifactType, CreateArtifactOptio
22
import type { OpenGoatPaths } from "@opengoat/core";
33
import { detectSections, matchHeadingToOutputType } from "./content-detector.ts";
44
import { mapOutputTypeToArtifactType } from "./output-type-mapper.ts";
5+
import { cleanSectionTitle } from "./title-cleaner.ts";
56

67
export interface ExtractionContext {
78
specialistId: string;
@@ -45,7 +46,7 @@ function deriveBundleTitle(specialistName: string, sections: MatchedSection[]):
4546
return `${specialistName}: ${typeName} Bundle`;
4647
}
4748

48-
return `${specialistName}: ${sections[0].heading}`;
49+
return `${specialistName}: ${cleanSectionTitle(sections[0].heading, sections[0].content, sections[0].artifactType)}`;
4950
}
5051

5152
/**
@@ -102,9 +103,10 @@ export async function extractArtifacts(
102103
// Pass 2: Create artifacts with optional bundleId
103104
const artifacts: ArtifactRecord[] = [];
104105
for (const section of matched) {
106+
const title = cleanSectionTitle(section.heading, section.content, section.artifactType);
105107
const options: CreateArtifactOptions = {
106108
projectId: context.agentId,
107-
title: section.heading,
109+
title,
108110
type: section.artifactType,
109111
format: "markdown",
110112
contentRef: `chat://${context.sessionId}/${context.messageIndex ?? 0}`,

packages/sidecar/src/artifact-extractor/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ export type { DetectedSection } from "./content-detector.ts";
33
export { mapOutputTypeToArtifactType } from "./output-type-mapper.ts";
44
export { extractArtifacts } from "./extractor.ts";
55
export type { ExtractionContext, ExtractionResult } from "./extractor.ts";
6+
export { cleanSectionTitle, isConversationalTitle } from "./title-cleaner.ts";
67
export { extractSessionId } from "./session-id.ts";
78
export { bundleUnbundledArtifacts } from "./bundle-grouper.ts";
89
export type { BundleGrouperDeps, BundleGrouperResult } from "./bundle-grouper.ts";
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/**
 * Regex matching conversational preamble patterns that should not
 * appear as artifact titles. Case-insensitive, anchored to start.
 *
 * Alternatives that end in a space are already delimited. Alternatives
 * that end in a letter carry an explicit `\b` so they only match whole
 * words: "Sure, here you go" is preamble, "Surefire Growth Tactics" is a
 * legitimate title. "Hmm" is intentionally left open-ended so drawn-out
 * forms such as "Hmmm" keep matching.
 */
const CONVERSATIONAL_PATTERN =
  /^(I |I'm |I'll |I've |I don't|I can't|I can |I still|I checked|Got it\b|Let me\b|Here |Here'|Sure\b|OK |Okay\b|Well |So |Hmm|Based on |According to |After reviewing |After analyzing |Looking at |From the |From my |Given |Pulling |Checking |Reviewing |Analyzing |To help |In order to |For this |For your |As requested\b|Absolutely\b|Assuming )/i;
7+
8+
/**
 * Maps typographic single and double quotation marks onto their plain
 * ASCII counterparts so quote-sensitive pattern checks behave uniformly.
 */
function normalizeQuotes(text: string): string {
  // Curly / low-9 single quotes first, then the double-quote family.
  const withAsciiSingles = text.replace(/[\u2018\u2019\u201A]/g, "'");
  const withAsciiDoubles = withAsciiSingles.replace(/[\u201C\u201D\u201E]/g, '"');
  return withAsciiDoubles;
}
17+
18+
/**
 * Strips common inline markdown formatting from text (bold, italic,
 * strikethrough, code, links) and trims the result.
 *
 * Emphasis markers are only removed where they plausibly delimit emphasis:
 * - `*` must hug non-whitespace on the inside, so "2 * 3 * 4" is untouched.
 * - `_` / `__` must not sit inside a word, so identifiers such as
 *   `snake_case_name` (including ones inside code spans, which are
 *   unwrapped later in the chain) keep their underscores.
 *
 * @param text Raw heading or title text that may contain inline markdown.
 * @returns The text with formatting markers removed and outer whitespace trimmed.
 */
function stripInlineMarkdown(text: string): string {
  return text
    .replace(/\*\*(.+?)\*\*/g, "$1")
    // Single-asterisk emphasis: opening/closing marker must touch the content.
    .replace(/\*(?=\S)(.+?)(?<=\S)\*/g, "$1")
    // Underscore emphasis: reject markers flanked by word characters (intraword).
    .replace(/(?<![A-Za-z0-9_])__(?=\S)(.+?)(?<=\S)__(?![A-Za-z0-9_])/g, "$1")
    .replace(/(?<![A-Za-z0-9_])_(?=\S)(.+?)(?<=\S)_(?![A-Za-z0-9_])/g, "$1")
    .replace(/~~(.+?)~~/g, "$1")
    .replace(/`(.+?)`/g, "$1")
    // Links keep their label and drop the target.
    .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
    .trim();
}
32+
33+
/**
 * Reports whether a title reads like conversational AI preamble
 * instead of a descriptive artifact name.
 *
 * The title is trimmed and its smart quotes are normalized before it is
 * checked against CONVERSATIONAL_PATTERN.
 */
export function isConversationalTitle(title: string): boolean {
  const normalized = normalizeQuotes(title.trim());
  return CONVERSATIONAL_PATTERN.test(normalized);
}
40+
41+
/**
 * Produces a clean artifact title from a section heading.
 *
 * 1. Strips inline markdown from the heading and returns it when it is
 *    non-empty and not conversational preamble.
 * 2. Otherwise scans the section content line by line for the first ATX
 *    heading (`#` to `######`) whose text is non-empty and not
 *    conversational. Lines inside fenced code blocks are ignored so that
 *    `# comment` lines in code samples are never mistaken for headings,
 *    and an optional closing run of `#` is dropped from the heading text.
 * 3. Falls back to a humanized artifact type label (underscores become
 *    spaces, each word is capitalized).
 *
 * @param heading Section heading as detected in the source text.
 * @param content Markdown body of the section, searched for a better title.
 * @param artifactType Artifact type identifier used for the fallback label.
 * @returns The chosen title; empty only if every source, including
 *   `artifactType`, is empty.
 */
export function cleanSectionTitle(
  heading: string,
  content: string,
  artifactType: string,
): string {
  const stripped = stripInlineMarkdown(heading);

  // An empty heading carries no information, so treat it like preamble.
  if (stripped !== "" && !isConversationalTitle(stripped)) {
    return stripped;
  }

  // Hoisted so the patterns are not rebuilt per line; neither is global,
  // so test()/exec() keep no state between calls.
  const fenceLine = /^ {0,3}(`{3,}|~{3,})/;
  // Horizontal whitespace only after the hashes: `\s` would cross the line
  // break and capture the following paragraph as a heading.
  const atxHeading = /^#{1,6}[ \t]+(.+?)(?:[ \t]+#+)?[ \t]*$/;

  // Marker character (` or ~) of the currently open code fence, if any.
  let openFence: string | null = null;

  for (const line of content.split(/\r?\n/)) {
    if (fenceLine.test(line)) {
      const marker = line.trimStart().charAt(0);
      if (openFence === null) {
        openFence = marker;
      } else if (openFence === marker) {
        openFence = null;
      }
      continue;
    }
    if (openFence !== null) {
      continue;
    }

    const rawTitle = atxHeading.exec(line)?.[1];
    if (rawTitle === undefined) {
      continue;
    }

    const candidate = stripInlineMarkdown(rawTitle);
    if (candidate !== "" && !isConversationalTitle(candidate)) {
      return candidate;
    }
  }

  // Fallback: humanized artifact type
  return artifactType
    .replace(/_/g, " ")
    .replace(/\b\w/g, (c) => c.toUpperCase());
}

test/sidecar/artifact-extractor.test.ts

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,161 @@ Here's a detailed market brief with positioning opportunities and whitespace are
797797
});
798798
});
799799

800+
// ---------------------------------------------------------------------------
801+
// extractArtifacts: conversational preamble title cleaning
802+
// ---------------------------------------------------------------------------
803+
describe("extractArtifacts — conversational preamble cleaning", () => {
804+
function createMockDeps() {
805+
const createdArtifacts: unknown[] = [];
806+
const artifactService = {
807+
createArtifact: vi.fn().mockImplementation((_paths: unknown, opts: unknown) => {
808+
const record = { artifactId: `art-${createdArtifacts.length + 1}`, ...(opts as object) };
809+
createdArtifacts.push(record);
810+
return Promise.resolve(record);
811+
}),
812+
createBundle: vi.fn().mockImplementation((_paths: unknown, opts: unknown) => {
813+
return Promise.resolve({ bundleId: "bnd-mock-1", ...(opts as object) });
814+
}),
815+
};
816+
const opengoatPaths = { homeDir: "/tmp/test" };
817+
return { artifactService, opengoatPaths, createdArtifacts };
818+
}
819+
820+
const baseContext: ExtractionContext = {
821+
specialistId: "market-intel",
822+
agentId: "proj-1",
823+
sessionId: "sess-1",
824+
messageIndex: 0,
825+
};
826+
827+
it("strips conversational heading and uses content heading instead", async () => {
828+
const { artifactService, opengoatPaths } = createMockDeps();
829+
const text = `Some intro text
830+
831+
## Here's a structured readout of the main messaging gaps
832+
833+
After analyzing the competitive landscape, the key gaps are clear.
834+
835+
### Messaging Gap Analysis
836+
837+
| Gap Area | Your Position | Competitor Position |
838+
|---|---|---|
839+
| AI-native | Strong | Weak |
840+
| Developer DX | Moderate | Strong |
841+
842+
This analysis shows significant opportunities in the AI-native space.
843+
`;
844+
845+
const specialist = {
846+
id: "market-intel",
847+
name: "Market Intel",
848+
outputTypes: ["competitor messaging matrix", "community shortlist", "market brief"],
849+
};
850+
851+
const result = await extractArtifacts(text, baseContext, {
852+
artifactService: artifactService as any,
853+
opengoatPaths: opengoatPaths as any,
854+
specialist: specialist as any,
855+
});
856+
857+
// The section heading was conversational, so the first non-conversational content heading becomes the title
858+
expect(result.artifacts).toHaveLength(1);
859+
const callArgs = artifactService.createArtifact.mock.calls[0][1];
860+
expect(callArgs.title).toBe("Messaging Gap Analysis");
861+
});
862+
863+
it("falls back to humanized artifact type when no content heading exists", async () => {
864+
const { artifactService, opengoatPaths } = createMockDeps();
865+
// Heading is conversational but contains enough matching tokens ("competitor", "messaging")
866+
const text = `## Here's the competitor messaging breakdown you asked for
867+
868+
A detailed competitor messaging matrix showing positioning gaps across all major players in the B2B SaaS market for marketing intelligence tools.
869+
870+
| Competitor | Positioning | Key Claim | Weakness |
871+
|---|---|---|---|
872+
| Acme Corp | "All-in-one platform" | Speed | No customization |
873+
| Beta Inc | "Enterprise-grade" | Security | Expensive |
874+
`;
875+
876+
const specialist = {
877+
id: "market-intel",
878+
name: "Market Intel",
879+
outputTypes: ["competitor messaging matrix", "community shortlist"],
880+
};
881+
882+
const result = await extractArtifacts(text, baseContext, {
883+
artifactService: artifactService as any,
884+
opengoatPaths: opengoatPaths as any,
885+
specialist: specialist as any,
886+
});
887+
888+
expect(result.artifacts).toHaveLength(1);
889+
const callArgs = artifactService.createArtifact.mock.calls[0][1];
890+
// No content heading and section heading is conversational → fallback to type label
891+
expect(callArgs.title).toBe("Matrix");
892+
});
893+
894+
it("preserves clean descriptive headings unchanged", async () => {
895+
const { artifactService, opengoatPaths } = createMockDeps();
896+
const text = `## Competitor Messaging Matrix
897+
898+
| Competitor | Positioning | Key Claim | Weakness |
899+
|---|---|---|---|
900+
| Acme Corp | "All-in-one platform" | Speed | No customization |
901+
| Beta Inc | "Enterprise-grade" | Security | Expensive |
902+
903+
Key gaps: None of your competitors emphasize the "AI-native" angle.
904+
`;
905+
906+
const specialist = {
907+
id: "market-intel",
908+
name: "Market Intel",
909+
outputTypes: ["competitor messaging matrix", "community shortlist"],
910+
};
911+
912+
const result = await extractArtifacts(text, baseContext, {
913+
artifactService: artifactService as any,
914+
opengoatPaths: opengoatPaths as any,
915+
specialist: specialist as any,
916+
});
917+
918+
expect(result.artifacts).toHaveLength(1);
919+
const callArgs = artifactService.createArtifact.mock.calls[0][1];
920+
expect(callArgs.title).toBe("Competitor Messaging Matrix");
921+
});
922+
923+
it("strips markdown bold from heading", async () => {
924+
const { artifactService, opengoatPaths } = createMockDeps();
925+
const text = `## **Hero Rewrite Options**
926+
927+
Here are three hero rewrite options for your landing page:
928+
929+
1. **Option A** — "Ship faster with AI-powered workflows"
930+
2. **Option B** — "Your team's second brain for shipping"
931+
3. **Option C** — "From idea to production in minutes"
932+
`;
933+
934+
const specialist = {
935+
id: "website-conversion",
936+
name: "Website Conversion",
937+
outputTypes: ["hero rewrite bundle", "CTA options"],
938+
};
939+
940+
const result = await extractArtifacts(text, {
941+
...baseContext,
942+
specialistId: "website-conversion",
943+
}, {
944+
artifactService: artifactService as any,
945+
opengoatPaths: opengoatPaths as any,
946+
specialist: specialist as any,
947+
});
948+
949+
expect(result.artifacts).toHaveLength(1);
950+
const callArgs = artifactService.createArtifact.mock.calls[0][1];
951+
expect(callArgs.title).toBe("Hero Rewrite Options");
952+
});
953+
});
954+
800955
// ---------------------------------------------------------------------------
801956
// Manual extraction route: POST /extract
802957
// ---------------------------------------------------------------------------

0 commit comments

Comments
 (0)