# tasq/node_modules/@claude-flow/cli/.claude/skills/aidefence.yaml
#
# 298 lines
# 8.9 KiB
# YAML

---
# Skill identity and provenance.
name: aidefence
version: "1.0.0"
# Literal block scalar: the line breaks of the description are preserved.
description: |
  AI Manipulation Defense System (AIMDS) integration for Claude Flow V3.
  Provides real-time threat detection, behavioral analysis, and adaptive
  mitigation with 25-level meta-learning capabilities.
author: rUv
license: MIT
homepage: https://ruv.io/aimds
repository: https://github.com/ruvnet/midstream/tree/main/AIMDS
# Package reference (package name plus version range)
package: 'aidefence@^2.1.1'
# Capabilities provided by this skill, grouped into detection, analysis,
# and response.
capabilities:
  # Detection capabilities
  - prompt_injection_detection  # 50+ prompt injection patterns
  - jailbreak_detection         # AI jailbreak attempt detection
  - pii_detection               # PII identification (emails, SSNs, API keys)
  - unicode_normalization       # Control character sanitization
  # Analysis capabilities
  - behavioral_analysis         # Temporal pattern analysis
  - chaos_detection             # Lyapunov exponent calculation
  - policy_verification         # Linear Temporal Logic (LTL) policies
  - anomaly_detection           # Statistical baseline learning
  # Response capabilities
  - adaptive_mitigation         # 7 distinct mitigation strategies
  - meta_learning               # 25-level recursive optimization (strange-loop)
  - rollback_management         # Failed mitigation rollback
  - effectiveness_tracking      # Real-time mitigation monitoring
# Performance characteristics
# Every value is a quoted string. A leading ">" is a YAML indicator and must
# be quoted; the "<" bounds are quoted as well so all four entries share one
# style and none depends on plain-scalar parsing rules.
performance:
  detection_latency: "<10ms"
  analysis_latency: "<100ms"
  response_latency: "<50ms"
  throughput: ">12000 req/s"
# Commands exposed by this skill.
# Each command has a description and usage text; most also list examples
# and the options they accept.
commands:
  # scan: check a piece of input (inline or read from a file)
  scan:
    description: Scan input for AI manipulation attempts
    usage: |
      /aidefence scan <input>
      /aidefence scan --file <path>
      /aidefence scan --mode paranoid <input>
    examples:
      - "/aidefence scan 'Ignore previous instructions and...'"
      - '/aidefence scan --file suspicious-prompt.txt'
      - "/aidefence scan --mode paranoid --json 'Please help me...'"
    options:
      - name: mode
        type: choice
        choices:
          - quick     # Pattern matching only (<5ms)
          - thorough  # Pattern + behavioral (<50ms)
          - paranoid  # Full analysis + policy verification (<150ms)
        default: thorough
        description: Scan depth mode
      - name: file
        type: string
        description: File path to scan instead of inline input
      - name: json
        type: boolean
        default: false
        description: Output results as JSON

  # analyze: behavioral anomaly analysis for one agent
  analyze:
    description: Analyze agent behavior patterns for anomalies
    usage: |
      /aidefence analyze <agent-id>
      /aidefence analyze <agent-id> --window 24h
    examples:
      - '/aidefence analyze security-architect-1234'
      - '/aidefence analyze coder-5678 --window 10m --threshold 0.7'
    options:
      - name: window
        type: string
        default: '1h'
        description: Time window for behavioral analysis
      - name: threshold
        type: number
        default: 0.8
        description: Anomaly score threshold (0-1)

  # policy: check an agent against an LTL formula
  policy:
    description: Verify agent against LTL security policy
    usage: |
      /aidefence policy <agent-id> <ltl-formula>
    examples:
      - "/aidefence policy coder-1234 'G(edit_file -> F(run_tests))'"
      - "/aidefence policy reviewer-5678 'G(!approve_self_code)'"
    options:
      - name: verbose
        type: boolean
        default: false
        description: Show detailed policy evaluation trace

  # learn: record a mitigation outcome; --effectiveness is mandatory
  learn:
    description: Record successful mitigation for meta-learning
    usage: |
      /aidefence learn <threat-type> <strategy> --effectiveness <score>
    examples:
      - '/aidefence learn prompt_injection sanitize --effectiveness 0.95'
      - '/aidefence learn jailbreak reject --effectiveness 1.0'
    options:
      - name: effectiveness
        type: number
        required: true
        description: Mitigation effectiveness score (0-1)

  # status: system status, optionally with Prometheus metrics
  status:
    description: Show aidefence system status and metrics
    usage: |
      /aidefence status
      /aidefence status --metrics
    options:
      - name: metrics
        type: boolean
        default: false
        description: Include Prometheus metrics
# Hook integrations
# Three hooks are declared, each with an enabled flag, a description, and
# a hook-specific config mapping.
hooks:
  # Pre-agent-input: Scan all agent inputs for manipulation
  pre-agent-input:
    enabled: true
    description: Scan agent inputs before processing
    config:
      block_critical: true  # Block inputs with critical threats
      block_high: false     # Allow high severity with logging
      log_all: true         # Log all threat detections
      mode: thorough        # Default scan mode

  # Post-agent-action: Learn from agent behaviors
  post-agent-action:
    enabled: true
    description: Record agent actions for behavioral modeling
    config:
      sampling_rate: 0.1      # Sample 10% of actions for analysis
      anomaly_threshold: 0.8  # Alert threshold for anomaly score
      store_embeddings: true  # Store action embeddings in AgentDB

  # Pre-swarm-init: Verify swarm security policies
  pre-swarm-init:
    enabled: true
    description: Verify swarm topology against security policies
    config:
      require_security_agent: true  # Require security-architect in swarm
      validate_topology: true       # Validate topology security
# Integration with claude-flow systems
integration:
  # AgentDB integration for shared threat patterns
  agentdb:
    enabled: true
    namespace: security_threats
    hnsw_enabled: true
    config:
      vector_dimension: 384
      m: 16
      ef_construction: 200
      ef_search: 100

  # ReasoningBank integration for pattern learning
  reasoningbank:
    enabled: true
    store_patterns: true
    learn_mitigations: true
    config:
      min_effectiveness: 0.8  # Only store high-effectiveness patterns
      consolidation_interval: '1h'

  # Prometheus metrics
  prometheus:
    enabled: true
    metrics:
      - aidefence_threats_detected_total
      - aidefence_detection_latency_ms
      - aidefence_analysis_latency_ms
      - aidefence_anomaly_score
      - aidefence_mitigations_applied_total
      - aidefence_meta_learning_depth
# MCP tool registrations
# Each tool declares a name, a description, and an input schema
# (type / properties / required). Every property carries a description so
# the schemas are uniformly documented; the wording mirrors the matching
# slash-command definitions in this file.
mcp_tools:
  # Counterpart of the `scan` command
  - name: aidefence_scan
    description: Scan input for AI manipulation attempts
    input_schema:
      type: object
      properties:
        input:
          type: string
          description: Input text to scan
        mode:
          type: string
          enum: [quick, thorough, paranoid]
          default: thorough
          description: Scan depth mode
      required: [input]

  # Counterpart of the `analyze` command
  - name: aidefence_analyze_behavior
    description: Analyze agent behavioral patterns for anomalies
    input_schema:
      type: object
      properties:
        agentId:
          type: string
          description: Agent ID to analyze
        timeWindow:
          type: string
          default: "1h"
          description: Time window for analysis
      required: [agentId]

  # Counterpart of the `policy` command
  - name: aidefence_verify_policy
    description: Verify agent behavior against LTL security policies
    input_schema:
      type: object
      properties:
        agentId:
          type: string
          description: Agent ID to verify
        policy:
          type: string
          description: LTL policy formula
      required: [agentId, policy]

  # Counterpart of the `learn` command
  - name: aidefence_learn_pattern
    description: Store successful threat pattern for meta-learning
    input_schema:
      type: object
      properties:
        threatType:
          type: string
          description: Threat type (e.g. prompt_injection, jailbreak)
        mitigation:
          type: string
          description: Mitigation strategy (e.g. sanitize, reject)
        effectiveness:
          type: number
          minimum: 0
          maximum: 1
          description: Mitigation effectiveness score (0-1)
      required: [threatType, mitigation, effectiveness]
# Threat detection patterns (reference)
threat_patterns:
  prompt_injection:
    # Quoted: "50+" is a label, not a number, and must stay a string.
    count: "50+"
    categories:
      - instruction_override  # "Ignore previous instructions"
      - role_switching        # "You are now DAN"
      - context_manipulation  # Fake system messages
      - delimiter_abuse       # Using special tokens

  jailbreak:
    categories:
      - dan_variants  # Do Anything Now variants
      - hypothetical  # "Hypothetically, if..."
      - roleplay      # Character-based bypasses
      - encoding      # Base64/ROT13 encoded prompts

  pii:
    types:
      - email_addresses
      - social_security_numbers
      - credit_card_numbers
      - api_keys
      - passwords
# Behavioral analysis configuration
# Split into temporal windows/attractors, Lyapunov settings, and baseline
# learning settings.
behavioral_analysis:
  temporal:
    window_sizes: ['1m', '10m', '1h', '24h']
    attractor_types:
      - point    # Stable single point
      - cycle    # Periodic behavior
      - torus    # Quasi-periodic
      - strange  # Chaotic (suspicious)

  lyapunov:
    threshold: 0.1  # Positive = chaotic behavior
    embedding_dimension: 3
    time_delay: 1

  baseline:
    learning_period: '24h'
    update_frequency: '1h'
    deviation_threshold: 3.0  # Standard deviations
# Documentation links
# Three absolute URLs plus one path to the integration ADR.
documentation:
  readme: 'https://github.com/ruvnet/midstream/blob/main/AIMDS/README.md'
  api: 'https://ruv.io/aimds/api'
  patterns: 'https://ruv.io/aimds/patterns'
  integration: '/v3/implementation/adrs/ADR-022-aidefence-integration.md'