Skip to content

Commit ab24a3a

Browse files
marc-shade and claude committed
Add defense-grade compliance modules for federal document processing
Bump version to 0.2.0. Add five new compliance modules:

- CUI Detection (32 CFR Part 2002 / NIST 800-171): Detects CUI markings, classification banners (UNCLASSIFIED through TS//SCI), dissemination controls (NOFORN, REL TO, ORCON, PROPIN, FISA), validates CUI Registry categories, flags marking deficiencies, generates handling recommendations.
- Enhanced PII/PHI Detection: 30+ detection categories with confidence scoring. Covers HIPAA PHI (MRN, health plan IDs, patient IDs), defense PII (DoD ID, CAC, clearances, CAGE, DUNS), financial PII (routing numbers, SWIFT/BIC, EIN/TIN), and ITAR/EAR markers. Maps to HIPAA, Privacy Act, GLBA, PCI DSS, DFARS, and NIST 800-53 SI-4/SI-19.
- Document Sanitization: Metadata stripping, EXIF detection, hidden text (zero-width chars, CSS hiding), macro/script quarantine, embedded file scanning, hyperlink exfiltration analysis, font fingerprinting, tracked changes detection, hidden Excel sheets. SHA-256 integrity verification.
- Export Control Screening (ITAR/EAR): USML categories I-XXI, ECCN patterns with CCL mapping, 60+ controlled technology keywords (encryption, night vision, propulsion, guidance, stealth, DEW, nuclear, cyber), foreign person/entity detection for deemed exports, dual-use indicators.
- Audit Trail (FedRAMP-ready): Tamper-evident SHA-256 hash chain, CEF export for SIEM integration, chain-of-custody reporting, integrity verification. Maps to NIST 800-53 AU-2/3/6/8/9/11/12 and 800-171 3.3.x.

Pipeline integration via --cui-scan, --sanitize, --export-control, --defense-mode, --audit-log, and --compliance-report CLI flags.

Comprehensive test suites for all five modules (170+ test cases).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 796da84 commit ab24a3a

17 files changed

+6431
-321
lines changed

README.md

Lines changed: 308 additions & 220 deletions
Large diffs are not rendered by default.

docsingest/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@
22
from .ingest import ingest
33

44
__all__ = ["ingest", "main"]
5+
6+
__version__ = "0.2.0"

docsingest/cli.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import argparse
2+
import json
3+
import os
24
import sys
35
from typing import Optional
46

@@ -19,10 +21,10 @@ def main(argv: Optional[list[str]] = None) -> int:
1921

2022
parser.add_argument("directory", help="Path to the directory containing documents")
2123

22-
parser.add_argument("-o", "--output", default="document_context.md",
24+
parser.add_argument("-o", "--output", default="document_context.md",
2325
help="Output markdown file path (default: document_context.md)")
2426

25-
parser.add_argument("--agent", default=None,
27+
parser.add_argument("--agent", default=None,
2628
help="Initial AI agent prompt (default: Comprehensive Compliance Prompt)")
2729

2830
# Restore hidden arguments for visibility
@@ -32,12 +34,45 @@ def main(argv: Optional[list[str]] = None) -> int:
3234
parser.add_argument("--compress", action="store_true", help="Compress document content")
3335
parser.add_argument("--compression-level", type=float, default=0.5, help="Compression level (0-1)")
3436

37+
# Defense compliance flags
38+
compliance_group = parser.add_argument_group("Defense Compliance Options")
39+
compliance_group.add_argument(
40+
"--cui-scan", action="store_true",
41+
help="Enable CUI (Controlled Unclassified Information) detection per 32 CFR Part 2002"
42+
)
43+
compliance_group.add_argument(
44+
"--sanitize", action="store_true",
45+
help="Enable document sanitization (metadata stripping, hidden content detection)"
46+
)
47+
compliance_group.add_argument(
48+
"--export-control", action="store_true",
49+
help="Enable ITAR/EAR export control screening"
50+
)
51+
compliance_group.add_argument(
52+
"--defense-mode", action="store_true",
53+
help="Enable ALL compliance features (CUI, sanitization, export control, enhanced PII)"
54+
)
55+
compliance_group.add_argument(
56+
"--audit-log", type=str, default=None, metavar="PATH",
57+
help="Path for audit trail output (JSON lines with SHA-256 hash chain)"
58+
)
59+
compliance_group.add_argument(
60+
"--compliance-report", type=str, default=None, metavar="PATH",
61+
help="Path for separate compliance report output (Markdown)"
62+
)
63+
3564
args = parser.parse_args(argv)
3665

3766
try:
3867
# Use default compliance prompt if not specified
3968
agent_prompt = args.agent or args.prompt or DEFAULT_COMPLIANCE_PROMPT
4069

70+
# Determine which compliance features are enabled
71+
cui_scan = args.cui_scan or args.defense_mode
72+
sanitize = args.sanitize or args.defense_mode
73+
export_control = args.export_control or args.defense_mode
74+
enhanced_pii = args.defense_mode # Enhanced PII detection is enabled only by --defense-mode
75+
4176
# Perform document ingestion
4277
summary, tree, content, pii_reports = ingest(
4378
args.directory,
@@ -46,7 +81,13 @@ def main(argv: Optional[list[str]] = None) -> int:
4681
pii_analysis=not args.no_pii_analysis if hasattr(args, 'no_pii_analysis') else True,
4782
verbose=args.verbose if hasattr(args, 'verbose') else False,
4883
compress_content=args.compress if hasattr(args, 'compress') else False,
49-
compression_level=args.compression_level if hasattr(args, 'compression_level') else 0.5
84+
compression_level=args.compression_level if hasattr(args, 'compression_level') else 0.5,
85+
cui_scan=cui_scan,
86+
sanitize=sanitize,
87+
export_control=export_control,
88+
enhanced_pii=enhanced_pii,
89+
audit_log_path=args.audit_log,
90+
compliance_report_path=args.compliance_report,
5091
)
5192

5293
# Print summary to console
@@ -61,6 +102,11 @@ def main(argv: Optional[list[str]] = None) -> int:
61102
# Indicate successful completion
62103
print(f"\nDocument analysis complete. Output: {args.output}")
63104

105+
if args.compliance_report:
106+
print(f"Compliance report: {args.compliance_report}")
107+
if args.audit_log:
108+
print(f"Audit trail: {args.audit_log}")
109+
64110
return 0
65111

66112
except Exception as e:

docsingest/compliance/__init__.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""
2+
Defense-grade compliance modules for docsingest.
3+
4+
Provides CUI detection, enhanced PII/PHI detection, document sanitization,
5+
export control screening, and FedRAMP-ready audit trail capabilities.
6+
7+
Compliance Frameworks Supported:
8+
- NIST SP 800-171 (CUI Protection)
9+
- NIST SP 800-53 (Security Controls)
10+
- ITAR (22 CFR 120-130)
11+
- EAR (15 CFR 730-774)
12+
- HIPAA (Health Insurance Portability and Accountability Act)
13+
- 32 CFR Part 2002 (CUI Program)
14+
- FedRAMP (Federal Risk and Authorization Management Program)
15+
"""
16+
17+
from docsingest.compliance.cui_detector import CUIDetector
18+
from docsingest.compliance.enhanced_pii import EnhancedPIIDetector
19+
from docsingest.compliance.sanitizer import DocumentSanitizer
20+
from docsingest.compliance.export_control import ExportControlScreener
21+
from docsingest.compliance.audit_trail import AuditTrail
22+
23+
__all__ = [
24+
"CUIDetector",
25+
"EnhancedPIIDetector",
26+
"DocumentSanitizer",
27+
"ExportControlScreener",
28+
"AuditTrail",
29+
]

0 commit comments

Comments
 (0)