Skip to content

Commit e7bad8b

Browse files
authored
Merge pull request #121 from arete-org/feat(validation)--add-structure-for-inbound/outbound-text-validation
Feat(validation) add structure for inbound/outbound text validation
2 parents 995e393 + 5ba893e commit e7bad8b

File tree

12 files changed

+3637
-2110
lines changed

12 files changed

+3637
-2110
lines changed

packages/backend/src/shared/prompts/defaults.yaml

Lines changed: 218 additions & 218 deletions
Large diffs are not rendered by default.

packages/discord-bot/package.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,27 @@
3232
"express": "catalog:",
3333
"flyio": "catalog:",
3434
"js-yaml": "catalog:",
35+
"linkify-it": "^5.0.0",
3536
"mime-types": "catalog:",
3637
"node-fetch": "catalog:",
3738
"only": "catalog:",
3839
"openai": "catalog:",
3940
"opusscript": "catalog:",
4041
"prism-media": "catalog:",
4142
"reindex": "catalog:",
43+
"remark-parse": "^11.0.0",
44+
"unified": "^11.0.5",
45+
"unist-util-visit": "^5.0.0",
4246
"winston": "catalog:",
4347
"ws": "catalog:"
4448
},
4549
"devDependencies": {
4650
"@types/body-parser": "catalog:",
4751
"@types/express": "catalog:",
52+
"@types/linkify-it": "^5.0.0",
53+
"@types/mdast": "^4.0.4",
4854
"@types/node": "catalog:",
55+
"@types/unist": "^3.0.3",
4956
"copyfiles": "catalog:",
5057
"cross-env": "catalog:",
5158
"rimraf": "catalog:",
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
/**
2+
* @description: Placeholder entry point for inbound filters (pre-processing) before core logic.
3+
* @arete-scope: interface
4+
* @arete-module: InboundFilters
5+
* @arete-risk: low - No active filters means no behavioral changes yet.
6+
* @arete-ethics: low - Placeholder does not alter user content.
7+
*/
8+
9+
/**
 * Result handed back by the inbound (pre-processing) filter entry point.
 */
export interface InboundFilterResult {
  /** Message text as it stands after the inbound filters have run. */
  content: string;
  /** Change descriptions reported by the filters. */
  changes: string[];
}
13+
14+
/**
 * Entry point for inbound filtering. No filters are wired in yet, so the text
 * is handed back unchanged together with an empty change list.
 *
 * @param content - Incoming message text.
 * @returns The same text and an empty `changes` array.
 */
export const runInboundFilters = (content: string): InboundFilterResult => ({
  content,
  changes: [],
});
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/**
2+
* @description: Runs outbound message filters before content is sent to Discord.
3+
* @arete-scope: interface
4+
* @arete-module: OutboundFilters
5+
* @arete-risk: moderate - Filter failures could distort messages or degrade formatting.
6+
* @arete-ethics: moderate - Outbound normalization influences transparency and user trust.
7+
*/
8+
9+
import { logger } from '../../utils/logger.js';
10+
import { normalizeOutboundLinks } from './normalizeLinks.js';
11+
import type { OutboundFilter, OutboundFilterResult } from './types.js';
12+
13+
/**
14+
* @arete-logger: outboundFilters
15+
*
16+
* @logs
17+
* Outbound filter execution, changes applied, and filter error conditions.
18+
*
19+
* @impact
20+
* Risk: Missing or noisy logs can obscure formatting decisions.
21+
* Ethics: Logs touch message metadata and should avoid raw content leakage.
22+
*/
23+
const outboundFilterLogger = logger.child({ module: 'outboundFilters' });

// One pipeline entry: the label used to prefix change summaries, plus the filter itself.
interface NamedOutboundFilter {
  name: string;
  apply: OutboundFilter;
}

// Filters run top to bottom; each one receives the text produced by the one before it.
const outboundFilters: NamedOutboundFilter[] = [
  { name: 'normalize_links', apply: normalizeOutboundLinks },
];
29+
30+
/**
 * Passes `content` through every registered outbound filter, in order.
 *
 * Each filter receives the text produced by the previous one. A filter that
 * throws is logged and skipped (fail open), so the text it was given is
 * carried forward to the next filter unchanged.
 *
 * @param content - Message text about to be sent.
 * @returns The filtered text plus every reported change, each prefixed with
 *   the name of the filter that made it (`<filter>:<change>`).
 */
export const runOutboundFilters = (content: string): OutboundFilterResult => {
  const summary: string[] = []; // Accumulated `<filter>:<change>` entries.
  let current = content; // Text as of the most recent successful filter.

  for (const step of outboundFilters) {
    try {
      const outcome = step.apply(current);
      current = outcome.content;
      summary.push(
        ...outcome.changes.map((change) => `${step.name}:${change}`)
      );
    } catch (error) {
      // Fail open: record which filter broke and move on to the next one.
      outboundFilterLogger.error('Outbound filter failed; continuing', {
        filter: step.name,
        error: (error as Error)?.message ?? String(error),
      });
    }
  }

  // Only the change summary is logged; the message body itself is not.
  outboundFilterLogger.debug('Outbound filters evaluated', {
    // TODO: Pseudonymize change summaries if they later include identifiers.
    changes: summary,
  });

  return { content: current, changes: summary };
};
61+
62+
export type { OutboundFilter, OutboundFilterResult } from './types.js';
Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
/**
2+
* @description: Normalizes outbound URLs into Markdown autolinks (<url>) without reflowing formatting.
3+
* @arete-scope: interface
4+
* @arete-module: NormalizeOutboundLinks
5+
* @arete-risk: moderate - Linkification errors can distort meaning or intent.
6+
* @arete-ethics: moderate - Formatting changes shape user interpretation and trust.
7+
*/
8+
9+
// used only to run a Markdown parse so we can target edits by source offsets (no re-serialization/reflow).
10+
import { unified } from 'unified';
11+
12+
// turns Markdown into an mdast AST with positional info, letting us identify “do not touch” spans (links/code/etc.).
13+
import remarkParse from 'remark-parse';
14+
15+
// walks the AST so we can collect protected ranges and avoid editing inside Markdown constructs.
16+
import { visit } from 'unist-util-visit';
17+
18+
// provides robust URL detection in plain text (punctuation/parentheses/etc.) without maintaining a regex.
19+
import LinkifyIt from 'linkify-it';
20+
21+
// the AST type produced by remark-parse.
22+
import type { Root } from 'mdast';
23+
24+
// the base type for nodes visited in the AST walker.
25+
import type { Node } from 'unist';
26+
27+
// this filter’s return contract: { content, changes } for pipeline logging/telemetry.
28+
import type { OutboundFilterResult } from './types.js';
29+
30+
// A span of the scanned text, given as offsets into the source string.
// Consumed with String.prototype.slice(start, end), i.e. `end` is exclusive.
interface TextRange {
  // Offset of the first character inside the span.
  start: number;
  // Offset just past the last character inside the span.
  end: number;
}
35+
36+
// A single module-level LinkifyIt instance so every call detects URLs the same way.
const linkify = new LinkifyIt();

// mdast node types whose source text the outbound normalizer must leave untouched.
// https://www.npmjs.com/package/mdast
const PROTECTED_NODE_TYPE_NAMES: readonly string[] = [
  'link',
  'linkReference',
  'definition',
  'inlineCode',
  'code',
  'html',
  'image',
  'imageReference',
];

// Set form of the list above for constant-time membership checks.
const PROTECTED_NODE_TYPES = new Set<string>(PROTECTED_NODE_TYPE_NAMES);
51+
52+
/**
53+
* Wraps bare URLs in "<...>" so Markdown renders them as links.
54+
* Note: Discord suppresses embeds for links formatted this way.
55+
*
56+
* What changes:
57+
* - For each URL we detect in normal text, we wrap it with "<" and ">" only.
58+
* - The only new characters we add are those angle brackets.
59+
*
60+
* What counts as a plain-text URL:
61+
* - It appears in normal text, and NOT inside existing Markdown links/images,
62+
* code blocks, inline code, raw HTML, or reference definitions.
63+
* - We only run this filter when the message contains "http://" or "https://"
64+
* - Reflow is avoided so line breaks and original formatting stay intact.
65+
*
66+
* How:
67+
* - We parse first to find protected ranges (links, images, code, definitions, HTML),
68+
* then scan only the remaining text outside those ranges.
69+
* - URL detection is handled by linkify-it so we don't maintain our own edge-case
70+
* rules (punctuation, parentheses, trailing periods, etc.).
71+
*/
72+
export const normalizeOutboundLinks = (
73+
content: string
74+
): OutboundFilterResult => {
75+
if (!content) {
76+
return { content, changes: [] };
77+
}
78+
79+
// Fast path: skip parsing when there are no http(s) URLs to normalize.
80+
if (!content.includes('http://') && !content.includes('https://')) {
81+
return { content, changes: [] };
82+
}
83+
84+
// Parse content to find protected regions we must not modify.
85+
const tree = unified().use(remarkParse).parse(content) as Root;
86+
const protectedRanges = collectProtectedRanges(tree, content.length);
87+
88+
const { text: normalized, count } = linkifyWithProtectedRanges(
89+
content,
90+
protectedRanges
91+
);
92+
93+
// Emit a compact summary for logging rather than per-link detail.
94+
const changes = count > 0 ? [`wrapped_urls:${count}`] : [];
95+
return { content: normalized, changes };
96+
};
97+
98+
// Walks the tree and gathers the source spans of every node whose type is in
// PROTECTED_NODE_TYPES. Offsets are clamped to [0, maxLength]; nodes without
// numeric offsets, or whose clamped span is empty, are ignored. The spans are
// returned merged and sorted (see mergeRanges).
const collectProtectedRanges = (tree: Root, maxLength: number): TextRange[] => {
  const clamp = (offset: number): number =>
    Math.max(0, Math.min(offset, maxLength));
  const found: TextRange[] = [];

  visit(tree, (node: Node) => {
    if (PROTECTED_NODE_TYPES.has(node.type)) {
      const from = node.position?.start?.offset;
      const to = node.position?.end?.offset;

      if (typeof from === 'number' && typeof to === 'number') {
        const start = clamp(from);
        const end = clamp(to);
        // Keep only spans that are not empty or inverted after clamping.
        if (!(end <= start)) {
          found.push({ start, end });
        }
      }
    }
  });

  return mergeRanges(found);
};
124+
125+
// Returns a new list in which overlapping or touching ranges are fused into
// one, ordered by start (then end). The input array and its items are not
// modified.
const mergeRanges = (ranges: TextRange[]): TextRange[] => {
  const ordered = ranges
    .slice()
    .sort((a, b) => (a.start !== b.start ? a.start - b.start : a.end - b.end));

  const merged: TextRange[] = [];
  let tail: TextRange | undefined; // Most recently emitted range, still growable.

  for (const range of ordered) {
    if (tail !== undefined && range.start <= tail.end) {
      // Overlaps (or abuts) the previous range: extend it instead of adding a new one.
      tail.end = Math.max(tail.end, range.end);
    } else {
      tail = { ...range };
      merged.push(tail);
    }
  }

  return merged;
};
151+
152+
// Rebuilds `content` piece by piece: protected ranges are copied verbatim and
// only the gaps between them are passed to linkifySegment. `ranges` is
// consumed in the order given, each one starting at or after the previous end
// (the caller passes the merged, sorted list).
// Returns the rebuilt text and the total number of URLs wrapped.
const linkifyWithProtectedRanges = (
  content: string,
  ranges: TextRange[]
): { text: string; count: number } => {
  // Nothing is protected: the whole message is one plain segment.
  if (ranges.length === 0) {
    return linkifySegment(content);
  }

  const pieces: string[] = [];
  let wrapped = 0;
  let position = 0;

  // Linkify the unprotected slice [from, to) and append it to the output.
  const rewriteGap = (from: number, to?: number): void => {
    const { text, count } = linkifySegment(content.slice(from, to));
    pieces.push(text);
    wrapped += count;
  };

  for (const { start, end } of ranges) {
    if (start > position) {
      rewriteGap(position, start);
    }
    pieces.push(content.slice(start, end)); // Protected text passes through untouched.
    position = end;
  }

  // Trailing text after the last protected range.
  if (position < content.length) {
    rewriteGap(position);
  }

  return { text: pieces.join(''), count: wrapped };
};
185+
186+
// Wraps every http(s) URL that linkify-it finds in `segment` in "<...>" and
// returns the rewritten text together with the number of URLs wrapped.
//
// Only matches whose schema is "http:" or "https:" are rewritten. A default
// LinkifyIt instance also reports fuzzy links ("example.com"), e-mail
// addresses and other schemes; wrapping those would produce text such as
// "<example.com>", which is not a valid autolink. Such matches are left
// exactly as written.
const linkifySegment = (segment: string): { text: string; count: number } => {
  const matches = linkify.match(segment);
  if (!matches || matches.length === 0) {
    return { text: segment, count: 0 };
  }

  let result = '';
  let cursor = 0;
  let count = 0;

  for (const match of matches) {
    // Skip anything that is not an explicit http(s) URL; because the cursor is
    // not advanced, the skipped text is copied through by a later slice.
    const schema = (match.schema ?? '').toLowerCase();
    if (schema !== 'http:' && schema !== 'https:') {
      continue;
    }

    const start = match.index ?? 0;
    const end = match.lastIndex ?? start;

    // Copy the untouched text between the previous URL and this one.
    if (start > cursor) {
      result += segment.slice(cursor, start);
    }

    // Prefer the text exactly as it appeared in the source so that the only
    // characters added are the angle brackets.
    const raw = match.raw ?? match.text ?? segment.slice(start, end);
    const url = raw || match.url;
    result += `<${url}>`;
    count += 1;
    cursor = end;
  }

  // Remainder after the last wrapped URL (or the whole segment if none qualified).
  result += segment.slice(cursor);
  return { text: result, count };
};
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
/**
2+
* @description: Shared types for outbound message filters and pipeline composition.
3+
* @arete-scope: interface
4+
* @arete-module: OutboundFilterTypes
5+
* @arete-risk: low - Typing mismatches could hide filter output errors.
6+
* @arete-ethics: low - Type safety affects developer clarity more than user impact.
7+
*/
8+
9+
/**
 * Value every outbound filter hands back to the pipeline.
 */
export interface OutboundFilterResult {
  /** Message text after the filter has run. */
  content: string;
  /** Descriptions of the edits the filter made, used for logging. */
  changes: string[];
}

/**
 * An outbound filter: takes plain message text and returns the (possibly
 * edited) text together with a description of its edits.
 */
export type OutboundFilter = (content: string) => OutboundFilterResult;

0 commit comments

Comments
 (0)