/** * Threat Detection Service * * Core detection logic for AI manipulation attempts. * Embedded implementation based on AIMDS patterns. * * Performance targets: * - Detection: <10ms * - Pattern matching: <5ms * - PII scan: <3ms */ import { createThreat } from '../entities/threat.js'; import { createHash } from 'crypto'; /** * Prompt injection patterns (50+ patterns from AIMDS) */ const PROMPT_INJECTION_PATTERNS = [ // Instruction override patterns { pattern: /ignore\s+(all\s+)?(previous\s+)?instructions/i, type: 'instruction_override', severity: 'critical', description: 'Attempt to override system instructions', baseConfidence: 0.95, }, { pattern: /forget\s+(everything|all|previous)/i, type: 'instruction_override', severity: 'critical', description: 'Attempt to reset system context', baseConfidence: 0.92, }, { pattern: /disregard\s+(all\s+)?(previous|prior|above)/i, type: 'instruction_override', severity: 'critical', description: 'Attempt to disregard instructions', baseConfidence: 0.93, }, { pattern: /do\s+not\s+follow\s+(the\s+)?(previous|above|prior)/i, type: 'instruction_override', severity: 'high', description: 'Attempt to bypass previous instructions', baseConfidence: 0.88, }, // Role switching patterns { pattern: /you\s+are\s+now\s+(?!going|about|ready)/i, type: 'role_switching', severity: 'high', description: 'Attempt to change AI identity', baseConfidence: 0.85, }, { pattern: /act\s+as\s+(if\s+you\s+are\s+)?a?\s*(different|new|another)/i, type: 'role_switching', severity: 'high', description: 'Attempt to assume different role', baseConfidence: 0.82, }, { pattern: /pretend\s+(to\s+be|you\s+are)/i, type: 'role_switching', severity: 'medium', description: 'Roleplay instruction that may be legitimate', baseConfidence: 0.65, }, // Jailbreak patterns { pattern: /\bDAN\b.*\bmode\b|\bmode\b.*\bDAN\b/i, type: 'jailbreak', severity: 'critical', description: 'DAN jailbreak attempt', baseConfidence: 0.98, }, { pattern: /jailbreak/i, type: 'jailbreak', severity: 'critical', description: 'Explicit jailbreak mention', baseConfidence: 0.95, }, { pattern: /bypass\s+(your\s+)?(restrictions|limitations|rules|filters)/i, type: 'jailbreak', severity: 'critical', description: 'Attempt to bypass restrictions', baseConfidence: 0.93, }, { pattern: /without\s+(any\s+)?(restrictions|limitations|rules)/i, type: 'jailbreak', severity: 'high', description: 'Request for unrestricted output', baseConfidence: 0.85, }, { pattern: /disable\s+(your\s+)?(safety|content\s+)?filter/i, type: 'jailbreak', severity: 'critical', description: 'Attempt to disable safety filters', baseConfidence: 0.96, }, // Context manipulation patterns { pattern: /system\s*:\s*|<\|system\|>|/i, type: 'context_manipulation', severity: 'critical', description: 'Fake system message injection', baseConfidence: 0.97, }, { pattern: /\[system\]|\{system\}|system\s+prompt/i, type: 'context_manipulation', severity: 'high', description: 'System prompt reference', baseConfidence: 0.88, }, { pattern: /reveal\s+(your\s+)?(system\s+)?prompt/i, type: 'context_manipulation', severity: 'high', description: 'Attempt to extract system prompt', baseConfidence: 0.90, }, { pattern: /what\s+(is|are)\s+your\s+(initial\s+)?instructions/i, type: 'context_manipulation', severity: 'medium', description: 'Query for system instructions', baseConfidence: 0.75, }, { pattern: /\[\[.*?\]\]|<<.*?>>|\{\{.*?\}\}/, type: 'context_manipulation', severity: 'medium', description: 'Special bracket injection attempt', baseConfidence: 0.70, }, // Encoding attack patterns { pattern: /base64|rot13|hex\s+encode|url\s+encode/i, type: 'encoding_attack', severity: 'medium', description: 'Potential encoding-based bypass', baseConfidence: 0.60, }, { pattern: /decode\s+this|encrypted\s+message/i, type: 'encoding_attack', severity: 'medium', description: 'Request to decode potentially malicious content', baseConfidence: 0.55, }, // Hypothetical bypass patterns { pattern: /hypothetically|in\s+theory|theoretically|if\s+you\s+could/i, type: 'prompt_injection', severity: 'low', description: 'Hypothetical framing (may be legitimate)', baseConfidence: 0.45, }, { pattern: /for\s+(educational|research|academic)\s+purposes/i, type: 'prompt_injection', severity: 'low', description: 'Educational framing (often legitimate)', baseConfidence: 0.35, }, // Developer mode patterns { pattern: /developer\s+mode|dev\s+mode|debug\s+mode/i, type: 'jailbreak', severity: 'high', description: 'Attempt to enable developer mode', baseConfidence: 0.85, }, { pattern: /enable\s+(hidden|secret|special)\s+(features|mode|commands)/i, type: 'jailbreak', severity: 'high', description: 'Attempt to enable hidden features', baseConfidence: 0.88, }, // Delimiter abuse patterns { pattern: /```system|```instruction|```prompt/i, type: 'context_manipulation', severity: 'high', description: 'Code block delimiter abuse', baseConfidence: 0.85, }, { pattern: /---\s*(system|instruction|prompt)/i, type: 'context_manipulation', severity: 'medium', description: 'Markdown delimiter abuse', baseConfidence: 0.70, }, ]; /** * PII detection patterns */ const PII_PATTERNS = [ { pattern: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, type: 'email', description: 'Email address', }, { pattern: /\b\d{3}-\d{2}-\d{4}\b/g, type: 'ssn', description: 'Social Security Number', }, { pattern: /\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b/g, type: 'credit_card', description: 'Credit card number', }, { pattern: /\b(sk-[a-zA-Z0-9]{48}|sk-ant-[a-zA-Z0-9-]{90,})\b/g, type: 'api_key', description: 'API key (OpenAI/Anthropic format)', }, { pattern: /\b(ghp_[a-zA-Z0-9]{36}|github_pat_[a-zA-Z0-9_]{82})\b/g, type: 'api_key', description: 'GitHub token', }, { pattern: /password\s*[:=]\s*["']?[^"'\s]{4,}["']?/gi, type: 'password', description: 'Hardcoded password', }, ]; /** * Threat Detection Service */ export class ThreatDetectionService { patterns; detectionCount = 0; totalDetectionTimeMs = 0; constructor(customPatterns) { this.patterns = customPatterns ?? PROMPT_INJECTION_PATTERNS; } /** * Detect threats in input text * Target: <10ms latency */ detect(input) { const startTime = performance.now(); const threats = []; // Normalize input const normalizedInput = this.normalizeInput(input); // Pattern matching for (const pattern of this.patterns) { const match = pattern.pattern.exec(normalizedInput); if (match) { // Calculate confidence with context const confidence = this.calculateConfidence(pattern, match, normalizedInput); threats.push(createThreat({ type: pattern.type, severity: this.adjustSeverity(pattern.severity, confidence), confidence, pattern: pattern.pattern.source, description: pattern.description, location: { start: match.index, end: match.index + match[0].length, }, })); } } // PII detection const piiFound = this.detectPII(input); const detectionTimeMs = performance.now() - startTime; this.detectionCount++; this.totalDetectionTimeMs += detectionTimeMs; return { safe: threats.length === 0, threats: this.deduplicateThreats(threats), detectionTimeMs, piiFound, inputHash: this.hashInput(input), }; } /** * Quick scan - pattern matching only * Target: <5ms latency */ quickScan(input) { const normalizedInput = this.normalizeInput(input); let maxConfidence = 0; let threatFound = false; for (const pattern of this.patterns) { if (pattern.pattern.test(normalizedInput)) { threatFound = true; maxConfidence = Math.max(maxConfidence, pattern.baseConfidence); // Early exit on critical threats if (pattern.severity === 'critical') { return { threat: true, confidence: maxConfidence }; } } } return { threat: threatFound, confidence: maxConfidence }; } /** * Detect PII in text */ detectPII(input) { for (const pii of PII_PATTERNS) { if (pii.pattern.test(input)) { return true; } } return false; } /** * Get detection statistics */ getStats() { return { detectionCount: this.detectionCount, avgDetectionTimeMs: this.detectionCount > 0 ? this.totalDetectionTimeMs / this.detectionCount : 0, }; } /** * Normalize input for consistent detection */ normalizeInput(input) { return input // Normalize unicode .normalize('NFKC') // Remove zero-width characters .replace(/[\u200B-\u200D\uFEFF]/g, '') // Normalize whitespace .replace(/\s+/g, ' ') .trim(); } /** * Calculate confidence with contextual factors */ calculateConfidence(pattern, match, input) { let confidence = pattern.baseConfidence; // Boost confidence if multiple threat indicators const threatIndicatorCount = this.patterns.filter(p => p.pattern.test(input)).length; if (threatIndicatorCount > 1) { confidence = Math.min(confidence + 0.05 * (threatIndicatorCount - 1), 0.99); } // Reduce confidence for very short inputs (less context) if (input.length < 50) { confidence *= 0.9; } // Boost confidence if at start of input (more likely intentional) if (match.index < 20) { confidence = Math.min(confidence + 0.05, 0.99); } return Math.round(confidence * 100) / 100; } /** * Adjust severity based on confidence */ adjustSeverity(baseSeverity, confidence) { if (confidence < 0.5 && baseSeverity === 'critical') { return 'high'; } if (confidence < 0.4 && baseSeverity === 'high') { return 'medium'; } return baseSeverity; } /** * Deduplicate threats by type */ deduplicateThreats(threats) { const seen = new Map(); for (const threat of threats) { const existing = seen.get(threat.type); if (!existing || threat.confidence > existing.confidence) { seen.set(threat.type, threat); } } return Array.from(seen.values()) .sort((a, b) => { // Sort by severity first, then confidence const severityOrder = { critical: 0, high: 1, medium: 2, low: 3 }; const severityDiff = severityOrder[a.severity] - severityOrder[b.severity]; return severityDiff !== 0 ? severityDiff : b.confidence - a.confidence; }); } /** * Hash input for caching/deduplication */ hashInput(input) { return createHash('sha256').update(input).digest('hex').slice(0, 16); } } /** * Create a new ThreatDetectionService instance */ export function createThreatDetectionService(customPatterns) { return new ThreatDetectionService(customPatterns); } //# sourceMappingURL=threat-detection-service.js.map