diff --git a/src/__tests__/unit/checks/keywords-urls.test.ts b/src/__tests__/unit/checks/keywords-urls.test.ts index 8f5cfa5..6a293f9 100644 --- a/src/__tests__/unit/checks/keywords-urls.test.ts +++ b/src/__tests__/unit/checks/keywords-urls.test.ts @@ -4,7 +4,7 @@ import { describe, it, expect } from 'vitest'; import { keywordsCheck, KeywordsConfig } from '../../../checks/keywords'; -import { urls } from '../../../checks/urls'; +import { urls, UrlsConfig } from '../../../checks/urls'; import { competitorsCheck } from '../../../checks/competitors'; import { GuardrailResult } from '../../../types'; @@ -34,6 +34,16 @@ describe('keywords guardrail', () => { }); }); +describe('UrlsConfig', () => { + it('normalizes allowed scheme inputs', () => { + const config = UrlsConfig.parse({ + allowed_schemes: ['HTTPS://', 'http:', ' https '], + }); + + expect(Array.from(config.allowed_schemes).sort()).toEqual(['http', 'https']); + }); +}); + describe('urls guardrail', () => { it('allows https URLs listed in the allow list', async () => { const result = await urls( @@ -92,6 +102,377 @@ describe('urls guardrail', () => { expect(result.info?.blocked).toContain('https://other.com'); expect(result.tripwireTriggered).toBe(true); }); + + it('allows full URLs with explicit paths in the allow list', async () => { + const text = [ + 'https://suntropy.es', + 'https://api.example.com/v1/tools?id=2', + 'https://api.example.com/v2', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['https://suntropy.es', 'https://api.example.com/v1'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining([ + 'https://suntropy.es', + 'https://api.example.com/v1/tools?id=2', + ]) + ); + expect(result.info?.blocked).toContain('https://api.example.com/v2'); + }); + + it('respects path segment boundaries to avoid prefix bypasses', async () => { + const text = [ + 'https://example.com/api', + 'https://example.com/api/users', + 'https://example.com/api2', + 'https://example.com/api-v2', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['https://example.com/api'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining([ + 'https://example.com/api', + 'https://example.com/api/users', + ]) + ); + expect(result.info?.blocked).toEqual( + expect.arrayContaining([ + 'https://example.com/api2', + 'https://example.com/api-v2', + ]) + ); + }); + + it('matches scheme-less allow list entries across configured schemes', async () => { + const text = ['https://example.com', 'http://example.com'].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['example.com'], + allowed_schemes: new Set(['https', 'http']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining(['https://example.com', 'http://example.com']) + ); + expect(result.info?.blocked).toEqual([]); + }); + + it('enforces explicit scheme matches when allow list entries include schemes', async () => { + const text = ['https://bank.example.com', 'http://bank.example.com'].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['https://bank.example.com'], + allowed_schemes: new Set(['https', 'http']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual(expect.arrayContaining(['https://bank.example.com'])); + expect(result.info?.blocked).toContain('http://bank.example.com'); + }); + + it('supports CIDR ranges and explicit port matching', async () => { + const text = [ + 'https://10.5.5.5', + 'https://192.168.1.100', + 'https://192.168.2.1', + 'https://example.com:8443', + 'https://example.com', + 'https://api.internal.com:9000', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['10.0.0.0/8', '192.168.1.0/24', 'https://example.com:8443', 'api.internal.com'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining([ + 'https://10.5.5.5', + 'https://192.168.1.100', + 'https://example.com:8443', + 'https://api.internal.com:9000', + ]) + ); + expect(result.info?.blocked).toEqual( + expect.arrayContaining(['https://192.168.2.1', 'https://example.com']) + ); + }); + + it('requires query strings and fragments to match exactly when configured', async () => { + const text = [ + 'https://example.com/search?q=test', + 'https://example.com/search?q=other', + 'https://example.com/docs#intro', + 'https://example.com/docs#outro', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: [ + 'https://example.com/search?q=test', + 'https://example.com/docs#intro', + ], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining([ + 'https://example.com/search?q=test', + 'https://example.com/docs#intro', + ]) + ); + expect(result.info?.blocked).toEqual( + expect.arrayContaining([ + 'https://example.com/search?q=other', + 'https://example.com/docs#outro', + ]) + ); + }); + + it('blocks URLs containing only a password in userinfo when configured', async () => { + const result = await urls( + {}, + 'https://:secret@example.com', + { + url_allow_list: ['example.com'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.blocked).toContain('https://:secret@example.com'); + expect( + (result.info?.blocked_reasons as string[]).some((reason) => reason.includes('userinfo')) + ).toBe(true); + }); + + it('handles malformed ports gracefully without crashing', async () => { + const text = [ + 'https://example.com:99999', + 'https://example.com:abc', + 'https://example.com:-1', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['example.com'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.blocked).toHaveLength(3); + expect(result.info?.blocked_reasons).toHaveLength(3); + }); + + it('handles trailing slashes in allow list paths correctly', async () => { + // Regression test: allow list entries with trailing slashes should match subpaths + // Previously, '/api/' + '/' created '/api//' which wouldn't match '/api/users' + const text = [ + 'https://example.com/api/users', + 'https://example.com/api/v2/data', + 'https://example.com/other', + ].join(' '); + + const result = await urls( + {}, + text, + { + url_allow_list: ['https://example.com/api/'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + } + ); + + expect(result.info?.allowed).toEqual( + expect.arrayContaining([ + 'https://example.com/api/users', + 'https://example.com/api/v2/data', + ]) + ); + expect(result.info?.blocked).toContain('https://example.com/other'); + }); + + it('matches scheme-less URLs against scheme-qualified allow list entries', async () => { + // Test exact behavior: scheme-qualified allow list vs scheme-less/explicit URLs + const config = { + url_allow_list: ['https://suntropy.es'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + // Test scheme-less URL (should be allowed) + const result1 = await urls({}, 'Visit suntropy.es', config); + expect(result1.info?.allowed).toContain('suntropy.es'); + expect(result1.tripwireTriggered).toBe(false); + + // Test HTTPS URL (should match allow list scheme) + const result2 = await urls({}, 'Visit https://suntropy.es', config); + expect(result2.info?.allowed).toContain('https://suntropy.es'); + expect(result2.tripwireTriggered).toBe(false); + + // Test HTTP URL (wrong explicit scheme should be blocked) + const result3 = await urls({}, 'Visit http://suntropy.es', config); + expect(result3.info?.blocked).toContain('http://suntropy.es'); + expect(result3.tripwireTriggered).toBe(true); + }); + + it('blocks subdomains and paths correctly with scheme-qualified allow list', async () => { + // Verify subdomains and paths are still blocked according to allow list rules + const config = { + url_allow_list: ['https://suntropy.es'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit help-suntropy.es and help.suntropy.es'; + const result = await urls({}, text, config); + + // Both should be blocked - not in allow list + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.blocked).toHaveLength(2); + expect(result.info?.blocked).toContain('help-suntropy.es'); + expect(result.info?.blocked).toContain('help.suntropy.es'); + }); + + it('treats explicit default ports as equivalent to no port', async () => { + // URLs with explicit default ports should match allow list entries without ports + const config = { + url_allow_list: ['example.com'], + allowed_schemes: new Set(['https', 'http']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit https://example.com:443 and http://example.com:80'; + const result = await urls({}, text, config); + + // Both should be allowed (443 is default for https, 80 is default for http) + expect(result.tripwireTriggered).toBe(false); + expect(result.info?.allowed).toContain('https://example.com:443'); + expect(result.info?.allowed).toContain('http://example.com:80'); + expect(result.info?.blocked).toEqual([]); + }); + + it('allows any port when allow list entry has no port specification', async () => { + // When the allow list entry omits a port, URLs with any port (default or non-default) are allowed + const config = { + url_allow_list: ['example.com'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit https://example.com:8443 and https://example.com:9000'; + const result = await urls({}, text, config); + + // Both should be allowed - when allow list has no port, any port is OK + expect(result.tripwireTriggered).toBe(false); + expect(result.info?.allowed).toContain('https://example.com:8443'); + expect(result.info?.allowed).toContain('https://example.com:9000'); + }); + + it('accepts CIDR /0 with 0.0.0.0 network address', async () => { + // 0.0.0.0/0 should match all IPs + const config = { + url_allow_list: ['0.0.0.0/0'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit https://1.2.3.4 and https://192.168.1.1'; + const result = await urls({}, text, config); + + expect(result.tripwireTriggered).toBe(false); + expect(result.info?.allowed).toContain('https://1.2.3.4'); + expect(result.info?.allowed).toContain('https://192.168.1.1'); + }); + + it('rejects CIDR /0 with non-zero network address', async () => { + // 10.0.0.0/0 is ambiguous - /0 should only use 0.0.0.0 + const config = { + url_allow_list: ['10.0.0.0/0'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit https://10.5.5.5 and https://192.168.1.1'; + const result = await urls({}, text, config); + + // Should block both because 10.0.0.0/0 is invalid (emits warning) + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.blocked).toContain('https://10.5.5.5'); + expect(result.info?.blocked).toContain('https://192.168.1.1'); + }); + + it('rejects invalid CIDR prefix values', async () => { + // Test various invalid CIDR prefixes + const config = { + url_allow_list: ['10.0.0.0/33', '192.168.0.0/-1', '172.16.0.0/abc'], + allowed_schemes: new Set(['https']), + allow_subdomains: false, + block_userinfo: true, + }; + + const text = 'Visit https://10.5.5.5 and https://192.168.1.1 and https://172.16.1.1'; + const result = await urls({}, text, config); + + // All should be blocked due to invalid CIDR configurations + expect(result.tripwireTriggered).toBe(true); + expect(result.info?.blocked).toHaveLength(3); + }); }); describe('competitors guardrail', () => { diff --git a/src/checks/urls.ts b/src/checks/urls.ts index 5725830..c712d3d 100644 --- a/src/checks/urls.ts +++ b/src/checks/urls.ts @@ -9,6 +9,55 @@ import { z } from 'zod'; import { CheckFn } from '../types'; import { defaultSpecRegistry } from '../registry'; +const DEFAULT_PORTS: Record = { + http: 80, + https: 443, +}; + +const SCHEME_PREFIX_RE = /^[a-z][a-z0-9+.-]*:\/\//; +const HOSTLESS_SCHEMES = new Set(['data', 'javascript', 'vbscript', 'mailto']); + +function normalizeAllowedSchemes(value: unknown): Set { + if (value === undefined || value === null) { + return new Set(['https']); + } + + let rawValues: unknown[]; + if (typeof value === 'string') { + rawValues = [value]; + } else if (value instanceof Set) { + rawValues = Array.from(value.values()); + } else if (Array.isArray(value)) { + rawValues = value; + } else { + throw new Error('allowed_schemes must be a string, Set, or Array'); + } + + const normalized = new Set(); + for (const entry of rawValues) { + if (typeof entry !== 'string') { + throw new Error('allowed_schemes entries must be strings'); + } + let cleaned = entry.trim().toLowerCase(); + if (!cleaned) { + continue; + } + if (cleaned.endsWith('://')) { + cleaned = cleaned.slice(0, -3); + } + cleaned = cleaned.replace(/:+$/, ''); + if (cleaned) { + normalized.add(cleaned); + } + } + + if (normalized.size === 0) { + throw new Error('allowed_schemes must include at least one scheme'); + } + + return normalized; +} + /** * Configuration schema for URL filtering. */ @@ -17,7 +66,7 @@ export const UrlsConfig = z.object({ url_allow_list: z.array(z.string()).default([]), /** Allowed URL schemes/protocols (default: HTTPS only for security) */ allowed_schemes: z - .preprocess((val) => (Array.isArray(val) ? new Set(val) : val), z.set(z.string())) + .preprocess((val) => normalizeAllowedSchemes(val), z.set(z.string())) .default(new Set(['https'])), /** Block URLs with userinfo (user:pass@domain) to prevent credential injection */ block_userinfo: z.boolean().default(true), @@ -45,6 +94,35 @@ function ipToInt(ip: string): number { return (parts[0] << 24) + (parts[1] << 16) + (parts[2] << 8) + parts[3]; } +function extractHostCandidate(url: string): string | null { + if (!url.includes('://')) { + return null; + } + + const [, rest] = url.split('://', 2); + if (!rest) { + return null; + } + + const hostAndRest = rest.split(/[/?#]/, 1)[0]; + const withoutCreds = hostAndRest.includes('@') + ? hostAndRest.split('@').pop() ?? '' + : hostAndRest; + if (!withoutCreds) { + return null; + } + + if (withoutCreds.startsWith('[')) { + const closingIndex = withoutCreds.indexOf(']'); + if (closingIndex !== -1) { + return withoutCreds.slice(0, closingIndex + 1); + } + return withoutCreds; + } + + return withoutCreds.split(':', 1)[0]; +} + /** * Detect URLs in text using robust regex patterns. */ @@ -129,8 +207,12 @@ function detectUrls(text: string): string[] { schemeUrlDomains.add(bareDomain); } } catch { - // Skip URLs with parsing errors (malformed URLs, encoding issues) - // This is expected for edge cases and doesn't require logging + const fallbackHost = extractHostCandidate(url); + if (fallbackHost) { + const normalizedHost = fallbackHost.toLowerCase(); + schemeUrlDomains.add(normalizedHost); + schemeUrlDomains.add(normalizedHost.replace(/^www\./, '')); + } } finalUrls.push(url); } @@ -152,67 +234,150 @@ function detectUrls(text: string): string[] { } /** - * Validate URL against security configuration. + * Validate URL security properties using WHATWG URL parsing. + * + * Ensures scheme compliance, hostname presence (for host-based schemes), and + * blocks userinfo when configured. Returns structured errors for guardrail + * reporting while keeping the parsed URL when valid. */ function validateUrlSecurity( urlString: string, config: UrlsConfig -): { parsedUrl: URL | null; reason: string } { +): { parsedUrl: URL | null; reason: string; hadScheme: boolean } { try { let parsedUrl: URL; let originalScheme: string; + let hadScheme: boolean; // Parse URL - preserve original scheme for validation if (urlString.includes('://')) { // Standard URL with double-slash scheme (http://, https://, ftp://, etc.) parsedUrl = new URL(urlString); - originalScheme = parsedUrl.protocol.replace(':', ''); + originalScheme = parsedUrl.protocol.replace(/:$/, ''); + hadScheme = true; } else if ( urlString.includes(':') && urlString.split(':', 1)[0].match(/^(data|javascript|vbscript|mailto)$/) ) { // Special single-colon schemes parsedUrl = new URL(urlString); - originalScheme = parsedUrl.protocol.replace(':', ''); + originalScheme = parsedUrl.protocol.replace(/:$/, ''); + hadScheme = true; } else { // Add http scheme for parsing, but remember this is a default parsedUrl = new URL(`http://${urlString}`); originalScheme = 'http'; // Default scheme for scheme-less URLs + hadScheme = false; } // Basic validation: must have scheme and hostname (except for special schemes) if (!parsedUrl.protocol) { - return { parsedUrl: null, reason: 'Invalid URL format' }; + return { parsedUrl: null, reason: 'Invalid URL format', hadScheme: false }; } // Special schemes like data: and javascript: don't need hostname - const specialSchemes = new Set(['data:', 'javascript:', 'vbscript:', 'mailto:']); - if (!specialSchemes.has(parsedUrl.protocol) && !parsedUrl.hostname) { - return { parsedUrl: null, reason: 'Invalid URL format' }; + const parsedScheme = parsedUrl.protocol.replace(/:$/, '').toLowerCase(); + if (!HOSTLESS_SCHEMES.has(parsedScheme) && !parsedUrl.hostname) { + return { parsedUrl: null, reason: 'Invalid URL format', hadScheme }; } // Security validations - use original scheme - if (!config.allowed_schemes.has(originalScheme)) { - return { parsedUrl: null, reason: `Blocked scheme: ${originalScheme}` }; + // Only check allowed_schemes if the URL explicitly had a scheme + const normalizedScheme = originalScheme.toLowerCase(); + + if (hadScheme && !config.allowed_schemes.has(normalizedScheme)) { + return { parsedUrl: null, reason: `Blocked scheme: ${normalizedScheme}`, hadScheme }; } - if (config.block_userinfo && parsedUrl.username) { - return { parsedUrl: null, reason: 'Contains userinfo (potential credential injection)' }; + if (config.block_userinfo && (parsedUrl.username || parsedUrl.password)) { + return { parsedUrl: null, reason: 'Contains userinfo (potential credential injection)', hadScheme }; } // Everything else (IPs, localhost, private IPs) goes through allow list logic - return { parsedUrl, reason: '' }; + return { parsedUrl, reason: '', hadScheme }; } catch (error) { // Provide specific error information for debugging + const errorName = error instanceof Error ? error.name : 'Error'; const errorMessage = error instanceof Error ? error.message : String(error); - return { parsedUrl: null, reason: `Invalid URL format: ${errorMessage}` }; + return { parsedUrl: null, reason: `URL parsing error: ${errorName}: ${errorMessage}`, hadScheme: false }; + } +} + +function safeGetPort(parsed: URL, scheme: string): number | null { + if (parsed.port) { + const portNumber = Number(parsed.port); + if (Number.isInteger(portNumber) && portNumber >= 0 && portNumber <= 65535) { + return portNumber; + } + return null; + } + + if (scheme) { + const defaultPort = DEFAULT_PORTS[scheme as keyof typeof DEFAULT_PORTS]; + if (typeof defaultPort === 'number') { + return defaultPort; + } + } + + return null; +} + +function isIpv4Address(value: string): boolean { + try { + ipToInt(value); + return true; + } catch { + return false; } } +/** + * Check if port matching should block the URL. + * + * Only enforces port matching when the allow list entry explicitly specifies + * a non-default port. Explicit default ports (e.g., :443 for https) are + * treated as equivalent to no port being specified. + * + * @param urlPort - The URL's port number (or default for its scheme) + * @param urlParsed - The parsed URL object + * @param allowedPort - The allow list entry's port number (or default for its scheme) + * @param allowedParsed - The parsed allow list entry URL object + * @param urlScheme - The URL's scheme + * @param allowedScheme - The allow list entry's scheme + * @returns true if the port doesn't match and should be blocked, false otherwise + */ +function shouldBlockDueToPortMismatch( + urlPort: number | null, + urlParsed: URL, + allowedPort: number | null, + allowedParsed: URL, + urlScheme: string, + allowedScheme: string +): boolean { + // Only enforce port matching when allow list entry explicitly specifies a non-default port + const allowedHasNonDefaultPort = allowedParsed.port && + (allowedPort !== DEFAULT_PORTS[allowedScheme as keyof typeof DEFAULT_PORTS]); + + if (!allowedHasNonDefaultPort) { + return false; // No port restriction when allow list has no non-default port + } + + // Allow list has explicit non-default port, so URL must match exactly + const urlHasNonDefaultPort = urlParsed.port && + (urlPort !== DEFAULT_PORTS[urlScheme as keyof typeof DEFAULT_PORTS]); + + return !urlHasNonDefaultPort || allowedPort !== urlPort; +} + /** * Check if URL is allowed based on the allow list configuration. + * + * @param parsedUrl - The parsed URL to check + * @param allowList - List of allowed URL patterns + * @param allowSubdomains - Whether to allow subdomains + * @param hadScheme - Whether the original URL had an explicit scheme */ -function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: boolean): boolean { +function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: boolean, hadScheme: boolean): boolean { if (allowList.length === 0) { return false; } @@ -222,82 +387,162 @@ function isUrlAllowed(parsedUrl: URL, allowList: string[], allowSubdomains: bool return false; } + const urlDomain = urlHost.replace(/^www\./, ''); + const schemeLower = parsedUrl.protocol ? parsedUrl.protocol.replace(/:$/, '').toLowerCase() : ''; + const urlPort = safeGetPort(parsedUrl, schemeLower); + const hostIndicatesPort = Boolean(parsedUrl.host) && parsedUrl.host.includes(':') && !parsedUrl.host.startsWith('['); + if (urlPort === null && hostIndicatesPort) { + return false; + } + + const urlPath = parsedUrl.pathname || '/'; + const urlQuery = parsedUrl.search ? parsedUrl.search.slice(1) : ''; + const urlFragment = parsedUrl.hash ? parsedUrl.hash.slice(1) : ''; + const urlIsIp = isIpv4Address(urlHost); + const urlIpInt = urlIsIp ? ipToInt(urlHost) : null; + for (const allowedEntry of allowList) { - const entry = allowedEntry.toLowerCase().trim(); + const normalizedEntry = allowedEntry.toLowerCase().trim(); + if (!normalizedEntry) { + continue; + } + + // Handle CIDR notation before URL parsing + // CIDR blocks like "10.0.0.0/8" should not be parsed as URLs + const cidrMatch = normalizedEntry.match(/^(\d+\.\d+\.\d+\.\d+)\/(\d+)$/); + if (cidrMatch) { + // Only match against IP URLs + if (!urlIsIp || urlIpInt === null) { + continue; + } + + const [, network, prefixStr] = cidrMatch; + const prefix = Number(prefixStr); + + if (!Number.isInteger(prefix) || prefix < 0 || prefix > 32) { + console.warn(`Warning: Invalid CIDR prefix in allow list: "${normalizedEntry}"`); + continue; + } + + // Validate /0 must use 0.0.0.0 for clarity + // Any other network address with /0 is ambiguous and likely a configuration error + if (prefix === 0 && network !== '0.0.0.0') { + console.warn( + `Warning: CIDR /0 prefix must use 0.0.0.0, not "${network}". Entry: "${normalizedEntry}"` + ); + continue; + } - // Handle full URLs with specific paths - if (entry.includes('://')) { try { - const allowedUrl = new URL(entry); - const allowedHost = allowedUrl.hostname?.toLowerCase(); - const allowedPath = allowedUrl.pathname; - - if (urlHost === allowedHost) { - // Check if the URL path starts with the allowed path - if (!allowedPath || allowedPath === '/' || parsedUrl.pathname.startsWith(allowedPath)) { - return true; - } + const networkInt = ipToInt(network); + const mask = prefix === 0 ? 0 : (0xffffffff << (32 - prefix)) >>> 0; + if ((networkInt & mask) === (urlIpInt & mask)) { + return true; } } catch (error) { - // Invalid URL in allow list - log warning for configuration issues console.warn( - `Warning: Invalid URL in allow list: "${entry}" - ${error instanceof Error ? error.message : error}` + `Warning: Invalid CIDR network address in allow list: "${normalizedEntry}" - ${error instanceof Error ? error.message : error}` ); } - continue; + + continue; // Skip URL parsing for CIDR entries } - // Handle IP addresses and CIDR blocks + const hasExplicitScheme = SCHEME_PREFIX_RE.test(normalizedEntry); + + let parsedAllowed: URL; try { - // Basic IP pattern check - if (/^\d+\.\d+\.\d+\.\d+/.test(entry.split('/')[0])) { - if (entry === urlHost) { - return true; - } - // Proper CIDR validation - if (entry.includes('/') && urlHost.match(/^\d+\.\d+\.\d+\.\d+$/)) { - const [network, prefixStr] = entry.split('/'); - const prefix = parseInt(prefixStr); - - if (prefix >= 0 && prefix <= 32) { - // Convert IPs to 32-bit integers for bitwise comparison - const networkInt = ipToInt(network); - const hostInt = ipToInt(urlHost); - - // Create subnet mask - const mask = (0xffffffff << (32 - prefix)) >>> 0; - - // Check if host is in the network - if ((networkInt & mask) === (hostInt & mask)) { - return true; - } - } - } + parsedAllowed = hasExplicitScheme + ? new URL(normalizedEntry) + : new URL(`http://${normalizedEntry}`); + } catch (error) { + console.warn( + `Warning: Invalid URL in allow list: "${normalizedEntry}" - ${error instanceof Error ? error.message : error}` + ); + continue; + } + + const allowedHost = (parsedAllowed.hostname || '').toLowerCase(); + if (!allowedHost) { + continue; + } + + const allowedScheme = hasExplicitScheme ? parsedAllowed.protocol.replace(/:$/, '').toLowerCase() : ''; + const allowedPort = safeGetPort(parsedAllowed, allowedScheme); + const allowIndicatesPort = Boolean(parsedAllowed.host) && parsedAllowed.host.includes(':') && !parsedAllowed.host.startsWith('['); + if (allowedPort === null && allowIndicatesPort) { + continue; + } + + const allowedPath = parsedAllowed.pathname || ''; + const allowedQuery = parsedAllowed.search ? parsedAllowed.search.slice(1) : ''; + const allowedFragment = parsedAllowed.hash ? parsedAllowed.hash.slice(1) : ''; + + const allowedHostIsIp = isIpv4Address(allowedHost); + if (allowedHostIsIp) { + if (!urlIsIp || urlIpInt === null) { continue; } - } catch (error) { - // Expected: entry is not an IP address/CIDR, continue to domain matching - // Only log if it looks like it was intended to be an IP but failed parsing - if (/^\d+\.\d+/.test(entry)) { - console.warn( - `Warning: Malformed IP address in allow list: "${entry}" - ${error instanceof Error ? error.message : error}` - ); + + // Scheme matching for IPs: only enforce when BOTH allow list entry AND URL have explicit schemes + if (hasExplicitScheme && hadScheme && allowedScheme !== schemeLower) { + continue; } + + // Port matching: only enforce when allow list entry explicitly specifies a non-default port + if (shouldBlockDueToPortMismatch(urlPort, parsedUrl, allowedPort, parsedAllowed, schemeLower, allowedScheme)) { + continue; + } + + // Exact IP match + if (ipToInt(allowedHost) === urlIpInt) { + return true; + } + + continue; } - // Handle domain matching - const allowedDomain = entry.replace(/^www\./, ''); - const urlDomain = urlHost.replace(/^www\./, ''); + const allowedDomain = allowedHost.replace(/^www\./, ''); - // Exact match always allowed - if (urlDomain === allowedDomain) { - return true; + // Port matching: only enforce when allow list entry explicitly specifies a non-default port + if (shouldBlockDueToPortMismatch(urlPort, parsedUrl, allowedPort, parsedAllowed, schemeLower, allowedScheme)) { + continue; } - // Subdomain matching if enabled - if (allowSubdomains && urlDomain.endsWith(`.${allowedDomain}`)) { - return true; + const hostMatches = + urlDomain === allowedDomain || (allowSubdomains && urlDomain.endsWith(`.${allowedDomain}`)); + if (!hostMatches) { + continue; } + + // Scheme matching for domains: only enforce when BOTH allow list entry AND URL have explicit schemes + if (hasExplicitScheme && hadScheme && allowedScheme !== schemeLower) { + continue; + } + + // Path matching: only enforce when allow list entry explicitly specifies a non-root path + // Note: Empty string ('') and root ('/') are both treated as "no path restriction" + if (allowedPath && allowedPath !== '/') { + // Normalize trailing slashes to avoid double-slash issues when checking subpaths + // e.g., if allowedPath is "/api/", we normalize to "/api" before adding "/" + // so we check "/api/" not "/api//" when matching "/api/users" + const normalizedAllowedPath = allowedPath.replace(/\/+$/, ''); + const normalizedUrlPath = urlPath.replace(/\/+$/, ''); + + if (normalizedUrlPath !== normalizedAllowedPath && !normalizedUrlPath.startsWith(`${normalizedAllowedPath}/`)) { + continue; + } + } + + if (allowedQuery && allowedQuery !== urlQuery) { + continue; + } + + if (allowedFragment && allowedFragment !== urlFragment) { + continue; + } + + return true; } return false; @@ -318,7 +563,7 @@ export const urls: CheckFn = async (ctx, data, for (const urlString of detectedUrls) { // Validate URL with security checks - const { parsedUrl, reason } = validateUrlSecurity(urlString, actualConfig); + const { parsedUrl, reason, hadScheme } = validateUrlSecurity(urlString, actualConfig); if (parsedUrl === null) { blocked.push(urlString); @@ -329,13 +574,13 @@ export const urls: CheckFn = async (ctx, data, // Check against allow list // Special schemes (data:, javascript:, mailto:) don't have meaningful hosts // so they only need scheme validation, not host-based allow list checking - const hostlessSchemes = new Set(['data:', 'javascript:', 'vbscript:', 'mailto:']); - if (hostlessSchemes.has(parsedUrl.protocol)) { + const parsedScheme = parsedUrl.protocol.replace(/:$/, '').toLowerCase(); + if (HOSTLESS_SCHEMES.has(parsedScheme)) { // For hostless schemes, only scheme permission matters (no allow list needed) // They were already validated for scheme permission in validateUrlSecurity allowed.push(urlString); } else if ( - isUrlAllowed(parsedUrl, actualConfig.url_allow_list, actualConfig.allow_subdomains) + isUrlAllowed(parsedUrl, actualConfig.url_allow_list, actualConfig.allow_subdomains, hadScheme) ) { allowed.push(urlString); } else { @@ -349,7 +594,7 @@ export const urls: CheckFn = async (ctx, data, return { tripwireTriggered: tripwireTriggered, info: { - guardrail_name: 'URL Filter (Direct Config)', + guardrail_name: 'URL Filter', config: { allowed_schemes: Array.from(actualConfig.allowed_schemes), block_userinfo: actualConfig.block_userinfo,