Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
5b1e040
feat(filters): add placeholder for inbound filters with no-op impleme…
jbax1899 Jan 2, 2026
b533a75
feat(filters): implement outbound message filters and logging mechanism
jbax1899 Jan 2, 2026
00bd864
feat(discord-bot): add link normalization for outbound messages and u…
jbax1899 Jan 2, 2026
18ce548
refactor(discord-bot): linkify URLs without markdown reflow and trim …
jbax1899 Jan 2, 2026
9fd3355
refactor(normalizeLinks): remove unused LinkifyMatch type definition
jbax1899 Jan 2, 2026
4cf1e42
chore(dependencies): remove unused remark-stringify dependency from d…
jbax1899 Jan 2, 2026
b41e4da
fix(prompts): update the default YAML configuration for Discord bot, …
jbax1899 Jan 2, 2026
7a520de
fix(prompts): enhance hyperlink formatting in default YAML for Discor…
jbax1899 Jan 2, 2026
52d19e3
fix(prompts): clarify source citation formatting in default YAML for …
jbax1899 Jan 2, 2026
20e6f3d
fix(response-handler): apply outbound filters to message options and …
jbax1899 Jan 2, 2026
58ce769
fix(messaging): Separate response text from provenance footer to supp…
jbax1899 Jan 2, 2026
b7f07c6
Fix provenance button flows and anchor recovery
jbax1899 Jan 3, 2026
94991e6
feat(filters): add placeholder for inbound filters with no-op impleme…
jbax1899 Jan 2, 2026
48fbf0f
feat(filters): implement outbound message filters and logging mechanism
jbax1899 Jan 2, 2026
0c87507
feat(discord-bot): add link normalization for outbound messages and u…
jbax1899 Jan 2, 2026
5fd8a94
refactor(discord-bot): linkify URLs without markdown reflow and trim …
jbax1899 Jan 2, 2026
2ab8b83
refactor(normalizeLinks): remove unused LinkifyMatch type definition
jbax1899 Jan 2, 2026
94aed2b
chore(dependencies): remove unused remark-stringify dependency from d…
jbax1899 Jan 2, 2026
fee7070
fix(prompts): update the default YAML configuration for Discord bot, …
jbax1899 Jan 2, 2026
76784db
fix(prompts): enhance hyperlink formatting in default YAML for Discor…
jbax1899 Jan 2, 2026
ed84b6d
fix(prompts): clarify source citation formatting in default YAML for …
jbax1899 Jan 2, 2026
ab24a43
fix(response-handler): apply outbound filters to message options and …
jbax1899 Jan 2, 2026
9adb0fe
fix(messaging): Separate response text from provenance footer to supp…
jbax1899 Jan 2, 2026
9d29823
Fix provenance button flows and anchor recovery
jbax1899 Jan 3, 2026
55fb481
Merge branch 'feat(validation)--add-structure-for-inbound/outbound-te…
jbax1899 Jan 5, 2026
52814e2
Merge branch 'main' into feat(validation)--add-structure-for-inbound/…
jbax1899 Jan 5, 2026
d50f3f4
chore(dependencies): constrain new deps to discord-bot package
jbax1899 Jan 5, 2026
5ba893e
feat(filter): enhance comments, add more tests
jbax1899 Jan 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
436 changes: 218 additions & 218 deletions packages/backend/src/shared/prompts/defaults.yaml

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions packages/discord-bot/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,27 @@
"express": "catalog:",
"flyio": "catalog:",
"js-yaml": "catalog:",
"linkify-it": "^5.0.0",
"mime-types": "catalog:",
"node-fetch": "catalog:",
"only": "catalog:",
"openai": "catalog:",
"opusscript": "catalog:",
"prism-media": "catalog:",
"reindex": "catalog:",
"remark-parse": "^11.0.0",
"unified": "^11.0.5",
"unist-util-visit": "^5.0.0",
"winston": "catalog:",
"ws": "catalog:"
},
"devDependencies": {
"@types/body-parser": "catalog:",
"@types/express": "catalog:",
"@types/linkify-it": "^5.0.0",
"@types/mdast": "^4.0.4",
"@types/node": "catalog:",
"@types/unist": "^3.0.3",
"copyfiles": "catalog:",
"cross-env": "catalog:",
"rimraf": "catalog:",
Expand Down
17 changes: 17 additions & 0 deletions packages/discord-bot/src/filters/inbound/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
/**
* @description: Placeholder entry point for inbound filters (pre-processing) before core logic.
* @arete-scope: interface
* @arete-module: InboundFilters
* @arete-risk: low - No active filters means no behavioral changes yet.
* @arete-ethics: low - Placeholder does not alter user content.
*/

/**
 * Result returned by the inbound filter pipeline.
 */
export interface InboundFilterResult {
  /** Message text after inbound filtering. */
  content: string;
  /** Descriptions of the edits that were applied; empty when nothing changed. */
  changes: string[];
}

/**
 * Placeholder inbound pipeline: hands the message back untouched together
 * with an empty change list. Reserved for future pre-processing filters.
 *
 * @param content - Incoming message text.
 * @returns The same text and no recorded changes.
 */
export const runInboundFilters = (content: string): InboundFilterResult => ({
  content,
  changes: [],
});
62 changes: 62 additions & 0 deletions packages/discord-bot/src/filters/outbound/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/**
* @description: Runs outbound message filters before content is sent to Discord.
* @arete-scope: interface
* @arete-module: OutboundFilters
* @arete-risk: moderate - Filter failures could distort messages or degrade formatting.
* @arete-ethics: moderate - Outbound normalization influences transparency and user trust.
*/

import { logger } from '../../utils/logger.js';
import { normalizeOutboundLinks } from './normalizeLinks.js';
import type { OutboundFilter, OutboundFilterResult } from './types.js';

/**
* @arete-logger: outboundFilters
*
* @logs
* Outbound filter execution, changes applied, and filter error conditions.
*
* @impact
* Risk: Missing or noisy logs can obscure formatting decisions.
* Ethics: Logs touch message metadata and should avoid raw content leakage.
*/
// Child of the shared logger, created with { module: 'outboundFilters' } so
// entries from this pipeline can be attributed to it.
const outboundFilterLogger = logger.child({ module: 'outboundFilters' });

// Ordered pipeline so each filter sees the edits from the prior one.
// `name` is used as the prefix of every change entry and in error logs;
// `apply` is the filter function itself. Array order is execution order.
const outboundFilters: Array<{ name: string; apply: OutboundFilter }> = [
{ name: 'normalize_links', apply: normalizeOutboundLinks },
];

/**
 * Pipes `content` through every registered outbound filter, in registration
 * order, and gathers a prefixed summary of what each filter changed.
 *
 * A filter that throws is logged and skipped (fail open); the text produced so
 * far is carried on to the next filter.
 *
 * @param content - Message text about to be sent.
 * @returns The filtered text plus change entries formatted as `<filter>:<change>`.
 */
export const runOutboundFilters = (content: string): OutboundFilterResult => {
  const summary: string[] = []; // Accumulated "<filter>:<change>" entries.
  let current = content; // Output of the most recent successful filter.

  for (const step of outboundFilters) {
    try {
      const outcome = step.apply(current);
      current = outcome.content;
      summary.push(
        ...outcome.changes.map((change) => `${step.name}:${change}`)
      );
    } catch (error) {
      // Fail open: record the failure, keep the current text, move on.
      outboundFilterLogger.error('Outbound filter failed; continuing', {
        filter: step.name,
        error: (error as Error)?.message ?? String(error),
      });
    }
  }

  // Only the change summary is logged; raw message bodies stay out of logs.
  outboundFilterLogger.debug('Outbound filters evaluated', {
    // TODO: Pseudonymize change summaries if they later include identifiers.
    changes: summary,
  });

  return { content: current, changes: summary };
};

export type { OutboundFilter, OutboundFilterResult } from './types.js';
214 changes: 214 additions & 0 deletions packages/discord-bot/src/filters/outbound/normalizeLinks.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
/**
* @description: Normalizes outbound URLs into Markdown autolinks (<url>) without reflowing formatting.
* @arete-scope: interface
* @arete-module: NormalizeOutboundLinks
* @arete-risk: moderate - Linkification errors can distort meaning or intent.
* @arete-ethics: moderate - Formatting changes shape user interpretation and trust.
*/

// used only to run a Markdown parse so we can target edits by source offsets (no re-serialization/reflow).
import { unified } from 'unified';

// turns Markdown into an mdast AST with positional info, letting us identify “do not touch” spans (links/code/etc.).
import remarkParse from 'remark-parse';

// walks the AST so we can collect protected ranges and avoid editing inside Markdown constructs.
import { visit } from 'unist-util-visit';

// provides robust URL detection in plain text (punctuation/parentheses/etc.) so we don't maintain our own regex (linkify-it is regex-based internally).
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

linkify-it actually uses regex under the hood

import LinkifyIt from 'linkify-it';

// the AST type produced by remark-parse.
import type { Root } from 'mdast';

// the base type for nodes visited in the AST walker.
import type { Node } from 'unist';

// this filter’s return contract: { content, changes } for pipeline logging/telemetry.
import type { OutboundFilterResult } from './types.js';

// Represents a portion of the scanned text as a pair of offsets into the
// message string. The range is half-open: it is consumed via
// content.slice(start, end), so `end` itself is not included.
interface TextRange {
// Offset of the first character in the range.
start: number;
// Offset just past the last character in the range.
end: number;
}

// Linkify is scoped to this module to keep behavior consistent and testable.
// Constructed with no arguments, i.e. with linkify-it's default schemas and options.
const linkify = new LinkifyIt();

// Node types that should never be rewritten by the outbound normalizer.
// Any node whose `type` is in this set has its whole source span excluded
// from URL scanning.
// Node type names follow the mdast specification: https://github.com/syntax-tree/mdast
const PROTECTED_NODE_TYPES = new Set<string>([
'link',
'linkReference',
'definition',
'inlineCode',
'code',
'html',
'image',
'imageReference',
]);

/**
 * Wraps bare URLs in "<...>" so Markdown renders them as links.
 * Note: Discord suppresses embeds for links formatted this way.
 *
 * What changes:
 * - For each URL we detect in normal text, we wrap it with "<" and ">" only.
 * - The only new characters we add are those angle brackets.
 * - Reflow is avoided so line breaks and original formatting stay intact.
 *
 * What counts as a plain-text URL:
 * - It appears in normal text, and NOT inside existing Markdown links/images,
 *   code blocks, inline code, raw HTML, or reference definitions.
 *
 * When the filter runs:
 * - Only when the message contains "http://" or "https://". The check is
 *   case-insensitive so "HTTPS://..." is not silently skipped.
 *
 * How:
 * - We parse first to find protected ranges (links, images, code, definitions, HTML),
 *   then scan only the remaining text outside those ranges.
 * - URL detection is handled by linkify-it so we don't maintain our own edge-case
 *   rules (punctuation, parentheses, trailing periods, etc.).
 *
 * @param content - Outbound message text (Markdown).
 * @returns The rewritten text and, when at least one URL was wrapped, a single
 *   `wrapped_urls:<count>` change entry.
 */
export const normalizeOutboundLinks = (
  content: string
): OutboundFilterResult => {
  // Fast path: skip parsing for empty input or when no http(s) scheme is
  // present. The regex has no `g` flag, so `test` keeps no state between calls.
  if (!content || !/https?:\/\//i.test(content)) {
    return { content, changes: [] };
  }

  // Parse content to find protected regions we must not modify.
  const tree = unified().use(remarkParse).parse(content) as Root;
  const protectedRanges = collectProtectedRanges(tree, content.length);

  const { text: normalized, count } = linkifyWithProtectedRanges(
    content,
    protectedRanges
  );

  // Emit a compact summary for logging rather than per-link detail.
  const changes = count > 0 ? [`wrapped_urls:${count}`] : [];
  return { content: normalized, changes };
};

/**
 * Walks the parsed tree and returns the merged source spans of every node
 * whose type is listed in PROTECTED_NODE_TYPES.
 *
 * @param tree - Parsed Markdown tree carrying positional offsets.
 * @param maxLength - Upper bound for offsets (the content length); offsets are
 *   clamped into [0, maxLength].
 * @returns Sorted, non-overlapping ranges that must not be edited.
 */
const collectProtectedRanges = (tree: Root, maxLength: number): TextRange[] => {
  const clamp = (offset: number): number =>
    Math.max(0, Math.min(offset, maxLength));

  const found: TextRange[] = [];

  visit(tree, (node: Node) => {
    const rawStart = node.position?.start?.offset;
    const rawEnd = node.position?.end?.offset;

    // Only protected nodes that carry usable offsets contribute a range.
    const usable =
      PROTECTED_NODE_TYPES.has(node.type) &&
      typeof rawStart === 'number' &&
      typeof rawEnd === 'number';

    if (usable) {
      const start = clamp(rawStart as number);
      const end = clamp(rawEnd as number);
      // Drop empty or inverted spans.
      if (end > start) {
        found.push({ start, end });
      }
    }
  });

  return mergeRanges(found);
};

/**
 * Collapses overlapping or touching ranges into a sorted, disjoint list.
 * The input array and its elements are left unmodified.
 *
 * @param ranges - Ranges in any order.
 * @returns New range objects ordered by start offset.
 */
const mergeRanges = (ranges: TextRange[]): TextRange[] => {
  // Sort a copy by start offset, breaking ties by end offset.
  const ordered = ranges
    .slice()
    .sort((a, b) => a.start - b.start || a.end - b.end);

  const result: TextRange[] = [];

  for (const { start, end } of ordered) {
    const tail = result[result.length - 1];
    if (tail !== undefined && start <= tail.end) {
      // Overlaps (or touches) the previous range: extend it if needed.
      if (end > tail.end) {
        tail.end = end;
      }
      continue;
    }
    result.push({ start, end });
  }

  return result;
};

/**
 * Linkifies only the parts of `content` that lie outside the given ranges;
 * text inside a range is copied through verbatim.
 *
 * @param content - Full message text.
 * @param ranges - Sorted, non-overlapping spans that must not be edited.
 * @returns The rebuilt text and the total number of URLs wrapped.
 */
const linkifyWithProtectedRanges = (
  content: string,
  ranges: TextRange[]
): { text: string; count: number } => {
  // Nothing is protected: treat the whole message as one segment.
  if (ranges.length === 0) {
    return linkifySegment(content);
  }

  const pieces: string[] = [];
  let wrapped = 0;
  let position = 0;

  // Linkify an editable slice and record its output and match count.
  const appendEditable = (plain: string): void => {
    const converted = linkifySegment(plain);
    pieces.push(converted.text);
    wrapped += converted.count;
  };

  for (const { start, end } of ranges) {
    if (position < start) {
      appendEditable(content.slice(position, start));
    }
    // Protected span: copy as-is.
    pieces.push(content.slice(start, end));
    position = end;
  }

  // Trailing text after the last protected span.
  if (position < content.length) {
    appendEditable(content.slice(position));
  }

  return { text: pieces.join(''), count: wrapped };
};

// Matches text that begins with an explicit http:// or https:// scheme.
// No `g` flag, so `test` is stateless and safe to reuse.
const EXPLICIT_HTTP_PREFIX = /^https?:\/\//i;

/**
 * Converts a single plain-text segment by wrapping each explicit http(s) URL
 * in "<...>".
 *
 * linkify-it's default configuration also reports "fuzzy" matches (bare
 * domains such as example.com, e-mail addresses) and other schemes. Those are
 * left untouched: "<example.com>" is not a Markdown autolink, so wrapping it
 * would only add literal angle brackets to the message.
 *
 * @param segment - Text that lies outside every protected range.
 * @returns The rewritten segment and the number of URLs that were wrapped.
 */
const linkifySegment = (segment: string): { text: string; count: number } => {
  const matches = linkify.match(segment);
  if (!matches || matches.length === 0) {
    return { text: segment, count: 0 };
  }

  let result = '';
  let cursor = 0;
  let count = 0;

  for (const match of matches) {
    const start = match.index ?? 0;
    const end = match.lastIndex ?? start;
    // Use the text exactly as written so only the angle brackets are new.
    const raw = match.raw ?? segment.slice(start, end);

    // Skip fuzzy and non-http(s) matches; their text is emitted unchanged
    // by a later slice because `cursor` is not advanced here.
    if (!EXPLICIT_HTTP_PREFIX.test(raw)) {
      continue;
    }

    if (start > cursor) {
      result += segment.slice(cursor, start);
    }

    result += `<${raw}>`;
    count += 1;
    cursor = end;
  }

  result += segment.slice(cursor);
  return { text: result, count };
};
15 changes: 15 additions & 0 deletions packages/discord-bot/src/filters/outbound/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/**
* @description: Shared types for outbound message filters and pipeline composition.
* @arete-scope: interface
* @arete-module: OutboundFilterTypes
* @arete-risk: low - Typing mismatches could hide filter output errors.
* @arete-ethics: low - Type safety affects developer clarity more than user impact.
*/

/**
 * Value returned by a single outbound filter and by the outbound pipeline.
 */
export interface OutboundFilterResult {
/** Message text after the filter(s) ran. */
content: string;
/** Short descriptions of the edits made; empty when nothing changed. */
changes: string[];
}

// Outbound filters operate on plain text and describe their edits for logging.
// Signature: takes the current message text and returns the (possibly
// rewritten) text together with its list of change descriptions.
export type OutboundFilter = (content: string) => OutboundFilterResult;
Loading