diff --git a/docs/ref/checks/pii.md b/docs/ref/checks/pii.md index 36b8a93..1e5c59f 100644 --- a/docs/ref/checks/pii.md +++ b/docs/ref/checks/pii.md @@ -24,10 +24,48 @@ Detects personally identifiable information (PII) such as SSNs, phone numbers, c ### Parameters -- **`entities`** (required): List of PII entity types to detect. See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes). +- **`entities`** (optional): List of PII entity types to detect. Defaults to all entities except `NRP` and `PERSON` (see note below). See the `PIIEntity` enum in `src/checks/pii.ts` for the full list, including custom entities such as `CVV` (credit card security codes) and `BIC_SWIFT` (bank identification codes). - **`block`** (optional): Whether to block content or just mask PII (default: `false`) - **`detect_encoded_pii`** (optional): If `true`, detects PII in Base64/URL-encoded/hex strings (default: `false`) +### Important: NRP and PERSON Entity Deprecation + +**As of v0.2.0**, the `NRP` and `PERSON` entities have been **removed from the default entity list** due to their high false positive rates. These patterns are overly broad and cause issues in production: + +- **`NRP`** matches any two consecutive words (e.g., "nuevo cliente", "crea un", "the user") +- **`PERSON`** matches any two capitalized words (e.g., "New York", "The User", "European Union") + +**Impact:** + +- ❌ Causes false positives in natural language conversation +- ❌ Particularly problematic for non-English languages (Spanish, French, etc.) +- ❌ Breaks normal text in pre-flight masking mode + +> **Future Improvement:** More robust implementations of `NRP` and `PERSON` detection are planned for a future release. Stay tuned for updates. + +**Migration Path:** + +If you need to detect person names or national registration numbers, consider these alternatives: + +1. 
**For National Registration Numbers**: Use region-specific patterns instead: + - `SG_NRIC_FIN` (Singapore) + - `UK_NINO` (UK National Insurance Number) + - `FI_PERSONAL_IDENTITY_CODE` (Finland) + - `KR_RRN` (Korea Resident Registration Number) + +2. **For Person Names**: Consider using a dedicated NER (Named Entity Recognition) service or LLM-based detection for more accurate results. + +3. **If you still need these patterns**: You can explicitly include them in your configuration, but be aware of the false positives: + ```json + { + "entities": ["NRP", "PERSON", "EMAIL_ADDRESS"], + "block": false + } + ``` + A deprecation warning will be logged when these entities are used. + +**Reference:** [Issue #47](https://github.com/openai/openai-guardrails-js/issues/47) + ## Implementation Notes Under the hood the TypeScript guardrail normalizes text (Unicode NFKC), strips zero-width characters, and runs curated regex patterns for each configured entity. When `detect_encoded_pii` is enabled the check also decodes Base64, URL-encoded, and hexadecimal substrings before rescanning them for matches, remapping any findings back to the original encoded content. diff --git a/src/__tests__/unit/checks/pii.test.ts b/src/__tests__/unit/checks/pii.test.ts index 24ec246..06f51ce 100644 --- a/src/__tests__/unit/checks/pii.test.ts +++ b/src/__tests__/unit/checks/pii.test.ts @@ -2,8 +2,8 @@ * Unit tests for the PII guardrail functionality. 
describe('NRP and PERSON deprecation (Issue #47)', () => {
  /**
   * Shape of `info.detected_entities` as these tests read it: entity name -> matched substrings.
   * The assertions below compare its values against string arrays.
   */
  type DetectedEntities = Record<string, string[]>;

  /** Silence and capture `console.warn` for the current test; `afterEach` restores it. */
  const spyOnWarn = () => vi.spyOn(console, 'warn').mockImplementation(() => {});

  beforeEach(() => {
    // The "already warned" cache is module-level state; reset it so every test
    // observes first-use behaviour regardless of execution order.
    _clearDeprecationWarnings();
  });

  afterEach(() => {
    // Single restore point for all spies: runs even when an assertion throws,
    // which a trailing mockRestore() inside the test body would not.
    vi.restoreAllMocks();
  });

  it('excludes NRP and PERSON from default entities', () => {
    const config = PIIConfig.parse({});

    expect(config.entities).not.toContain(PIIEntity.NRP);
    expect(config.entities).not.toContain(PIIEntity.PERSON);
  });

  it('does not mask common two-word phrases when using defaults', async () => {
    const config = PIIConfig.parse({ block: false });
    const text = 'crea un nuevo cliente con email test@gmail.com';

    const result = await pii({}, text, config);
    const detected = result.info?.detected_entities as DetectedEntities;

    // Should only mask the email, not "crea un" or "nuevo cliente".
    // NOTE(review): the expected text ends right after "email "; an angle-bracket
    // mask placeholder may have been lost from this literal - confirm against the
    // masking output before relying on it.
    expect(result.info?.checked_text).toBe('crea un nuevo cliente con email ');
    expect(detected?.NRP).toBeUndefined();
  });

  it('does not mask capitalized phrases when using defaults', async () => {
    const config = PIIConfig.parse({ block: false });
    const text = 'Welcome to New York, The User can access the system.';

    const result = await pii({}, text, config);
    const detected = result.info?.detected_entities as DetectedEntities;

    // Should not mask "New York" or "The User".
    expect(result.info?.checked_text).toBe('Welcome to New York, The User can access the system.');
    expect(detected?.PERSON).toBeUndefined();
  });

  it('still detects NRP when explicitly configured', async () => {
    spyOnWarn(); // keep the expected deprecation warning out of the test output

    const config = PIIConfig.parse({ entities: [PIIEntity.NRP], block: false });

    const result = await pii({}, 'hello world', config);
    const detected = result.info?.detected_entities as DetectedEntities;

    expect(detected?.NRP).toEqual(['hello world']);
    // NOTE(review): an empty expected string looks like a stripped mask
    // placeholder - confirm the real masked output.
    expect(result.info?.checked_text).toBe('');
  });

  it('still detects PERSON when explicitly configured', async () => {
    spyOnWarn(); // keep the expected deprecation warning out of the test output

    const config = PIIConfig.parse({ entities: [PIIEntity.PERSON], block: false });

    const result = await pii({}, 'John Smith lives in New York', config);
    const detected = result.info?.detected_entities as DetectedEntities;

    expect(detected?.PERSON).toContain('John Smith');
    expect(detected?.PERSON).toContain('New York');
  });

  // One parametrised case replaces two copy-pasted tests; the generated titles are
  // "shows deprecation warning for NRP" / "... for PERSON".
  it.each([
    ['NRP', PIIEntity.NRP],
    ['PERSON', PIIEntity.PERSON],
  ] as const)('shows deprecation warning for %s', async (label, entity) => {
    const consoleWarnSpy = spyOnWarn();

    const config = PIIConfig.parse({ entities: [entity], block: false });

    await pii({}, 'test data', config);

    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining(`DEPRECATION: PIIEntity.${label}`)
    );
    expect(consoleWarnSpy).toHaveBeenCalledWith(
      expect.stringContaining('more robust implementation')
    );
  });

  it('only shows deprecation warning once per entity', async () => {
    const consoleWarnSpy = spyOnWarn();

    const config = PIIConfig.parse({
      entities: [PIIEntity.NRP, PIIEntity.PERSON],
      block: false,
    });

    await pii({}, 'test data', config);
    await pii({}, 'more test data', config);
    await pii({}, 'even more data', config);

    // Should only be called once for each entity (2 total).
    expect(consoleWarnSpy).toHaveBeenCalledTimes(2);
  });
});
/**
 * Entity types checked when a configuration does not list `entities` itself.
 *
 * Computed once at module load: every `PIIEntity` value except `NRP` and `PERSON`.
 * Those two are left out because their patterns are documented as far too broad:
 * - NRP: matches any two consecutive words (e.g., "nuevo cliente", "crea un")
 * - PERSON: matches any two capitalized words (e.g., "New York", "The User")
 *
 * Callers that still want them must name them explicitly in `entities`; for
 * national identifiers, the region-specific entities (SG_NRIC_FIN, UK_NINO, ...)
 * are the narrower alternative.
 *
 * NOTE(review): `PIIConfig`'s default factory returns this array by reference, so
 * every config that relies on the default shares one array. Consider returning a
 * copy from that factory - confirm no caller mutates `config.entities`.
 */
const DEFAULT_PII_ENTITIES: PIIEntity[] = Object.values(PIIEntity).filter((candidate) => {
  const isOverlyBroad = candidate === PIIEntity.NRP || candidate === PIIEntity.PERSON;
  return !isOverlyBroad;
});
/**
 * Deprecated PII entities, each mapped to the one-line reason printed in its warning.
 *
 * Membership in this map is what marks an entity as deprecated. Keeping the reason
 * next to the entity means a newly deprecated entity must supply its own text and
 * cannot silently inherit another entity's description.
 */
const DEPRECATED_ENTITIES: ReadonlyMap<PIIEntity, string> = new Map<PIIEntity, string>([
  [PIIEntity.NRP, 'NRP matches any two consecutive words'],
  [PIIEntity.PERSON, 'PERSON matches any two capitalized words'],
]);

/**
 * Entities whose deprecation warning has already been printed by this module,
 * so repeated checks do not spam the log.
 */
const shownDeprecationWarnings = new Set<PIIEntity>();

/**
 * Clear deprecation warning cache. FOR TESTING ONLY.
 *
 * After this call the next use of a deprecated entity warns again.
 * @internal
 */
export function _clearDeprecationWarnings(): void {
  shownDeprecationWarnings.clear();
}

/**
 * Warn users about deprecated PII entities with high false positive rates.
 *
 * Emits one `console.warn` per deprecated entity the first time that entity is
 * seen; later calls (and duplicates within the same list) are silent until
 * `_clearDeprecationWarnings` is called. Non-deprecated entities are ignored.
 *
 * @param entities The list of entities being checked; it is not modified.
 */
function _warnDeprecatedEntities(entities: readonly PIIEntity[]): void {
  for (const entity of entities) {
    const description = DEPRECATED_ENTITIES.get(entity);

    // Skip entities that are not deprecated or that were already reported.
    if (description === undefined || shownDeprecationWarnings.has(entity)) {
      continue;
    }

    // Record before logging so the warning is counted as shown even if logging throws.
    shownDeprecationWarnings.add(entity);

    console.warn(
      `[openai-guardrails-js] DEPRECATION: PIIEntity.${entity} removed from defaults (${description}).\n` +
        ` A more robust implementation will be released in a future version.\n` +
        ` To suppress: remove PIIEntity.${entity} from config. See: https://github.com/openai/openai-guardrails-js/issues/47`
    );
  }
}