Skip to content

Commit aa9a98d

Browse files
authored
Merge pull request #190 from robinspt/fix/reflection-injection-sanitization
fix: sanitize reflection lines before prompt injection
2 parents 065c465 + 3c102df commit aa9a98d

File tree

5 files changed

+482
-23
lines changed

5 files changed

+482
-23
lines changed

index.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ import {
3232
} from "./src/reflection-store.js";
3333
import {
3434
extractReflectionLearningGovernanceCandidates,
35-
extractReflectionMappedMemoryItems,
35+
extractInjectableReflectionMappedMemoryItems,
3636
} from "./src/reflection-slices.js";
3737
import { createReflectionEventId } from "./src/reflection-event-store.js";
3838
import { buildReflectionMappedMetadata } from "./src/reflection-mapped-metadata.js";
@@ -2768,7 +2768,7 @@ const memoryLanceDBProPlugin = {
27682768
command: String(event.action || "unknown"),
27692769
});
27702770

2771-
const mappedReflectionMemories = extractReflectionMappedMemoryItems(reflectionText);
2771+
const mappedReflectionMemories = extractInjectableReflectionMappedMemoryItems(reflectionText);
27722772
for (const mapped of mappedReflectionMemories) {
27732773
const vector = await embedder.embedPassage(mapped.text);
27742774
let existing: Awaited<ReturnType<typeof store.vectorSearch>> = [];

src/reflection-slices.ts

Lines changed: 65 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -90,6 +90,27 @@ export function sanitizeReflectionSliceLines(lines: string[]): string[] {
9090
.filter((line) => !isPlaceholderReflectionSliceLine(line));
9191
}
9292

93+
const INJECTABLE_REFLECTION_BLOCK_PATTERNS: RegExp[] = [
94+
/^\s*(?:(?:next|this)\s+run\s+)?(?:ignore|disregard|forget|override|bypass)\b[\s\S]{0,80}\b(?:instructions?|guardrails?|policy|developer|system)\b/i,
95+
/\b(?:reveal|print|dump|show|output)\b[\s\S]{0,80}\b(?:system prompt|developer prompt|hidden prompt|hidden instructions?|full prompt|prompt verbatim|secrets?|keys?|tokens?)\b/i,
96+
/<\s*\/?\s*(?:system|assistant|user|tool|developer|inherited-rules|derived-focus)\b[^>]*>/i,
97+
/^(?:system|assistant|user|developer|tool)\s*:/i,
98+
];
99+
100+
export function isUnsafeInjectableReflectionLine(line: string): boolean {
101+
const normalized = normalizeReflectionSliceLine(line);
102+
if (!normalized) return true;
103+
return INJECTABLE_REFLECTION_BLOCK_PATTERNS.some((pattern) =>
104+
pattern.test(normalized),
105+
);
106+
}
107+
108+
export function sanitizeInjectableReflectionLines(lines: string[]): string[] {
109+
return sanitizeReflectionSliceLines(lines).filter(
110+
(line) => !isUnsafeInjectableReflectionLine(line),
111+
);
112+
}
113+
93114
function isInvariantRuleLike(line: string): boolean {
94115
return /^(always|never|when\b|if\b|before\b|after\b|prefer\b|avoid\b|require\b|only\b|do not\b|must\b|should\b)/i.test(line) ||
95116
/\b(must|should|never|always|prefer|avoid|required?)\b/i.test(line);
@@ -172,7 +193,10 @@ export function extractReflectionMappedMemories(reflectionText: string): Reflect
172193
return extractReflectionMappedMemoryItems(reflectionText).map(({ text, category, heading }) => ({ text, category, heading }));
173194
}
174195

175-
export function extractReflectionMappedMemoryItems(reflectionText: string): ReflectionMappedMemoryItem[] {
196+
function extractReflectionMappedMemoryItemsWithSanitizer(
197+
reflectionText: string,
198+
sanitizeLines: (lines: string[]) => string[],
199+
): ReflectionMappedMemoryItem[] {
176200
const mappedSections: Array<{
177201
heading: string;
178202
category: "preference" | "fact" | "decision";
@@ -201,30 +225,45 @@ export function extractReflectionMappedMemoryItems(reflectionText: string): Refl
201225
];
202226

203227
return mappedSections.flatMap(({ heading, category, mappedKind }) => {
204-
const lines = sanitizeReflectionSliceLines(parseSectionBullets(reflectionText, heading));
228+
const lines = sanitizeLines(parseSectionBullets(reflectionText, heading));
205229
const groupSize = lines.length;
206230
return lines.map((text, ordinal) => ({ text, category, heading, mappedKind, ordinal, groupSize }));
207231
});
208232
}
209233

210-
export function extractReflectionSlices(reflectionText: string): ReflectionSlices {
234+
export function extractReflectionMappedMemoryItems(reflectionText: string): ReflectionMappedMemoryItem[] {
235+
return extractReflectionMappedMemoryItemsWithSanitizer(reflectionText, sanitizeReflectionSliceLines);
236+
}
237+
238+
export function extractInjectableReflectionMappedMemoryItems(reflectionText: string): ReflectionMappedMemoryItem[] {
239+
return extractReflectionMappedMemoryItemsWithSanitizer(reflectionText, sanitizeInjectableReflectionLines);
240+
}
241+
242+
export function extractInjectableReflectionMappedMemories(reflectionText: string): ReflectionMappedMemory[] {
243+
return extractInjectableReflectionMappedMemoryItems(reflectionText).map(({ text, category, heading }) => ({ text, category, heading }));
244+
}
245+
246+
function extractReflectionSlicesWithSanitizer(
247+
reflectionText: string,
248+
sanitizeLines: (lines: string[]) => string[],
249+
): ReflectionSlices {
211250
const invariantSection = parseSectionBullets(reflectionText, "Invariants");
212251
const derivedSection = parseSectionBullets(reflectionText, "Derived");
213252
const mergedSection = parseSectionBullets(reflectionText, "Invariants & Reflections");
214253

215-
const invariantsPrimary = sanitizeReflectionSliceLines(invariantSection).filter(isInvariantRuleLike);
216-
const derivedPrimary = sanitizeReflectionSliceLines(derivedSection).filter(isDerivedDeltaLike);
254+
const invariantsPrimary = sanitizeLines(invariantSection).filter(isInvariantRuleLike);
255+
const derivedPrimary = sanitizeLines(derivedSection).filter(isDerivedDeltaLike);
217256

218-
const invariantLinesLegacy = sanitizeReflectionSliceLines(
257+
const invariantLinesLegacy = sanitizeLines(
219258
mergedSection.filter((line) => /invariant|stable|policy|rule/i.test(line))
220259
).filter(isInvariantRuleLike);
221-
const reflectionLinesLegacy = sanitizeReflectionSliceLines(
260+
const reflectionLinesLegacy = sanitizeLines(
222261
mergedSection.filter((line) => /reflect|inherit|derive|change|apply/i.test(line))
223262
).filter(isDerivedDeltaLike);
224-
const openLoopLines = sanitizeReflectionSliceLines(parseSectionBullets(reflectionText, "Open loops / next actions"))
263+
const openLoopLines = sanitizeLines(parseSectionBullets(reflectionText, "Open loops / next actions"))
225264
.filter(isOpenLoopAction)
226265
.filter(isDerivedDeltaLike);
227-
const durableDecisionLines = sanitizeReflectionSliceLines(parseSectionBullets(reflectionText, "Decisions (durable)"))
266+
const durableDecisionLines = sanitizeLines(parseSectionBullets(reflectionText, "Decisions (durable)"))
228267
.filter(isInvariantRuleLike);
229268

230269
const invariants = invariantsPrimary.length > 0
@@ -240,8 +279,15 @@ export function extractReflectionSlices(reflectionText: string): ReflectionSlice
240279
};
241280
}
242281

243-
export function extractReflectionSliceItems(reflectionText: string): ReflectionSliceItem[] {
244-
const slices = extractReflectionSlices(reflectionText);
282+
export function extractReflectionSlices(reflectionText: string): ReflectionSlices {
283+
return extractReflectionSlicesWithSanitizer(reflectionText, sanitizeReflectionSliceLines);
284+
}
285+
286+
export function extractInjectableReflectionSlices(reflectionText: string): ReflectionSlices {
287+
return extractReflectionSlicesWithSanitizer(reflectionText, sanitizeInjectableReflectionLines);
288+
}
289+
290+
function buildReflectionSliceItemsFromSlices(slices: ReflectionSlices): ReflectionSliceItem[] {
245291
const invariantGroupSize = slices.invariants.length;
246292
const derivedGroupSize = slices.derived.length;
247293

@@ -262,3 +308,11 @@ export function extractReflectionSliceItems(reflectionText: string): ReflectionS
262308

263309
return [...invariantItems, ...derivedItems];
264310
}
311+
312+
export function extractReflectionSliceItems(reflectionText: string): ReflectionSliceItem[] {
313+
return buildReflectionSliceItemsFromSlices(extractReflectionSlices(reflectionText));
314+
}
315+
316+
export function extractInjectableReflectionSliceItems(reflectionText: string): ReflectionSliceItem[] {
317+
return buildReflectionSliceItemsFromSlices(extractInjectableReflectionSlices(reflectionText));
318+
}

src/reflection-store.ts

Lines changed: 13 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
import type { MemoryEntry, MemorySearchResult } from "./store.js";
22
import {
3-
extractReflectionSliceItems,
4-
extractReflectionSlices,
3+
extractInjectableReflectionSliceItems,
4+
extractInjectableReflectionSlices,
55
sanitizeReflectionSliceLines,
6+
sanitizeInjectableReflectionLines,
67
type ReflectionSlices,
78
} from "./reflection-slices.js";
89
import { parseReflectionMetadata } from "./reflection-metadata.js";
@@ -57,7 +58,7 @@ export function buildReflectionStorePayloads(params: BuildReflectionStorePayload
5758
slices: ReflectionSlices;
5859
payloads: ReflectionStorePayload[];
5960
} {
60-
const slices = extractReflectionSlices(params.reflectionText);
61+
const slices = extractInjectableReflectionSlices(params.reflectionText);
6162
const eventId = params.eventId || createReflectionEventId({
6263
runAt: params.runAt,
6364
sessionKey: params.sessionKey,
@@ -82,7 +83,7 @@ export function buildReflectionStorePayloads(params: BuildReflectionStorePayload
8283
];
8384

8485
const itemPayloads = buildReflectionItemPayloads({
85-
items: extractReflectionSliceItems(params.reflectionText),
86+
items: extractInjectableReflectionSliceItems(params.reflectionText),
8687
eventId,
8788
agentId: params.agentId,
8889
sessionKey: params.sessionKey,
@@ -287,11 +288,12 @@ function buildInvariantCandidates(
287288
.filter(({ metadata }) => metadata.itemKind === "invariant")
288289
.flatMap(({ entry, metadata }) => {
289290
const lines = sanitizeReflectionSliceLines([entry.text]);
290-
if (lines.length === 0) return [];
291+
const safeLines = sanitizeInjectableReflectionLines([entry.text]);
292+
if (safeLines.length === 0) return [];
291293

292294
const defaults = getReflectionItemDecayDefaults("invariant");
293295
const timestamp = metadataTimestamp(metadata, entry.timestamp);
294-
return lines.map((line) => ({
296+
return safeLines.map((line) => ({
295297
line,
296298
timestamp,
297299
midpointDays: readPositiveNumber(metadata.decayMidpointDays, defaults.midpointDays),
@@ -307,7 +309,7 @@ function buildInvariantCandidates(
307309
return legacyRows.flatMap(({ entry, metadata }) => {
308310
const defaults = getReflectionItemDecayDefaults("invariant");
309311
const timestamp = metadataTimestamp(metadata, entry.timestamp);
310-
const lines = sanitizeReflectionSliceLines(toStringArray(metadata.invariants));
312+
const lines = sanitizeInjectableReflectionLines(toStringArray(metadata.invariants));
311313
return lines.map((line) => ({
312314
line,
313315
timestamp,
@@ -328,11 +330,12 @@ function buildDerivedCandidates(
328330
.filter(({ metadata }) => metadata.itemKind === "derived")
329331
.flatMap(({ entry, metadata }) => {
330332
const lines = sanitizeReflectionSliceLines([entry.text]);
331-
if (lines.length === 0) return [];
333+
const safeLines = sanitizeInjectableReflectionLines([entry.text]);
334+
if (safeLines.length === 0) return [];
332335

333336
const defaults = getReflectionItemDecayDefaults("derived");
334337
const timestamp = metadataTimestamp(metadata, entry.timestamp);
335-
return lines.map((line) => ({
338+
return safeLines.map((line) => ({
336339
line,
337340
timestamp,
338341
midpointDays: readPositiveNumber(metadata.decayMidpointDays, defaults.midpointDays),
@@ -347,7 +350,7 @@ function buildDerivedCandidates(
347350

348351
return legacyRows.flatMap(({ entry, metadata }) => {
349352
const timestamp = metadataTimestamp(metadata, entry.timestamp);
350-
const lines = sanitizeReflectionSliceLines(toStringArray(metadata.derived));
353+
const lines = sanitizeInjectableReflectionLines(toStringArray(metadata.derived));
351354
if (lines.length === 0) return [];
352355

353356
const defaults = {

0 commit comments

Comments (0)