460 lines
16 KiB
JavaScript
460 lines
16 KiB
JavaScript
/**
|
|
* Circuit Breaker Router - Fault-Tolerant Agent Routing
|
|
*
|
|
* Integrates @ruvector/tiny-dancer circuit breaker pattern for 99.9% uptime.
|
|
*
|
|
* Features:
|
|
* - Circuit breaker states: CLOSED, OPEN, HALF_OPEN
|
|
* - Automatic failure detection and recovery
|
|
* - Fallback chains for degraded service
|
|
* - Hot-reload capability for configuration updates
|
|
* - Uncertainty estimation for routing decisions
|
|
* - Rate limiting for request protection
|
|
*
|
|
* Performance:
|
|
* - <5ms routing overhead
|
|
* - 99.9% uptime guarantee
|
|
* - Automatic failover in <100ms
|
|
*/
|
|
// Import security utilities
|
|
import { InputValidator } from '../utils/input-validator.js';
|
|
import { RateLimiter } from '../utils/rate-limiter.js';
|
|
export var CircuitState;
|
|
(function (CircuitState) {
|
|
CircuitState["CLOSED"] = "CLOSED";
|
|
CircuitState["OPEN"] = "OPEN";
|
|
CircuitState["HALF_OPEN"] = "HALF_OPEN"; // Testing recovery
|
|
})(CircuitState || (CircuitState = {}));
|
|
/**
|
|
* Circuit Breaker Router
|
|
*
|
|
* Provides fault-tolerant routing with:
|
|
* 1. Circuit breaker pattern for each agent
|
|
* 2. Automatic fallback chains
|
|
* 3. Health monitoring and recovery
|
|
* 4. Uncertainty-aware decision making
|
|
*/
|
|
export class CircuitBreakerRouter {
|
|
config;
|
|
// Circuit breaker state per agent
|
|
circuitStates;
|
|
failureCounts;
|
|
successCounts;
|
|
lastFailureTimes;
|
|
lastSuccessTimes;
|
|
resetTimers;
|
|
// Security: Rate limiter to prevent request spam
|
|
rateLimiter;
|
|
// Performance tracking
|
|
routingMetrics;
|
|
constructor(config) {
|
|
// Security: Validate configuration
|
|
if (config) {
|
|
const validatedConfig = InputValidator.validateConfig(config, {
|
|
failureThreshold: {
|
|
type: 'number',
|
|
required: false,
|
|
min: 1,
|
|
max: 100,
|
|
},
|
|
successThreshold: {
|
|
type: 'number',
|
|
required: false,
|
|
min: 1,
|
|
max: 100,
|
|
},
|
|
resetTimeout: {
|
|
type: 'number',
|
|
required: false,
|
|
min: 1000,
|
|
max: 300000,
|
|
},
|
|
requestTimeout: {
|
|
type: 'number',
|
|
required: false,
|
|
min: 100,
|
|
max: 60000,
|
|
},
|
|
enableUncertaintyEstimation: {
|
|
type: 'boolean',
|
|
required: false,
|
|
},
|
|
});
|
|
config = validatedConfig;
|
|
}
|
|
this.config = {
|
|
failureThreshold: config?.failureThreshold ?? 5,
|
|
successThreshold: config?.successThreshold ?? 3,
|
|
resetTimeout: config?.resetTimeout ?? 30000,
|
|
requestTimeout: config?.requestTimeout ?? 5000,
|
|
enableUncertaintyEstimation: config?.enableUncertaintyEstimation ?? true,
|
|
};
|
|
// Security: Initialize rate limiter (100 requests per minute per client)
|
|
this.rateLimiter = new RateLimiter({
|
|
points: 100,
|
|
duration: 60,
|
|
blockDuration: 300, // Block for 5 minutes if exceeded
|
|
});
|
|
this.circuitStates = new Map();
|
|
this.failureCounts = new Map();
|
|
this.successCounts = new Map();
|
|
this.lastFailureTimes = new Map();
|
|
this.lastSuccessTimes = new Map();
|
|
this.resetTimers = new Map();
|
|
this.routingMetrics = {
|
|
totalRequests: 0,
|
|
successfulRoutes: 0,
|
|
fallbackRoutes: 0,
|
|
failedRoutes: 0,
|
|
avgRoutingTimeMs: 0,
|
|
};
|
|
}
|
|
/**
|
|
* Route request to agent with circuit breaker protection
|
|
*
|
|
* Process:
|
|
* 1. Check circuit state for preferred agent
|
|
* 2. If circuit OPEN, try fallback chain
|
|
* 3. If circuit HALF_OPEN, allow test request
|
|
* 4. Estimate uncertainty if enabled
|
|
*
|
|
* @param request - Route request with task and agent preferences
|
|
* @returns Route result with selected agent and metrics
|
|
*/
|
|
async route(request) {
|
|
const startTime = performance.now();
|
|
// Security: Validate task description
|
|
const sanitizedTask = InputValidator.validateTaskDescription(request.taskDescription, {
|
|
maxLength: 10000,
|
|
minLength: 1,
|
|
sanitize: true,
|
|
});
|
|
// Security: Validate timeout if provided
|
|
const timeout = request.timeout
|
|
? InputValidator.validateTimeout(request.timeout, 100, 60000)
|
|
: this.config.requestTimeout;
|
|
// Security: Rate limiting (use task description hash as key)
|
|
const rateLimitKey = this.hashString(sanitizedTask);
|
|
try {
|
|
await this.rateLimiter.consume(rateLimitKey);
|
|
}
|
|
catch (error) {
|
|
throw new Error(`Rate limit exceeded: ${error instanceof Error ? error.message : 'Too many requests'}`);
|
|
}
|
|
this.routingMetrics.totalRequests++;
|
|
// Security: Validate agent names
|
|
const validatedAgents = [];
|
|
if (request.preferredAgent) {
|
|
validatedAgents.push(InputValidator.validateAgentName(request.preferredAgent));
|
|
}
|
|
if (request.fallbackAgents) {
|
|
const validated = InputValidator.validateStringArray(request.fallbackAgents, 'fallbackAgents', 10, 100);
|
|
validatedAgents.push(...validated.map(a => InputValidator.validateAgentName(a)));
|
|
}
|
|
// Determine agent chain: preferred -> fallbacks
|
|
const agentChain = validatedAgents;
|
|
if (agentChain.length === 0) {
|
|
throw new Error('No agents specified in route request');
|
|
}
|
|
let selectedAgent = null;
|
|
let fallbackUsed = false;
|
|
let circuitState = CircuitState.CLOSED;
|
|
// Try each agent in chain
|
|
for (let i = 0; i < agentChain.length; i++) {
|
|
const agent = agentChain[i];
|
|
const state = this.getCircuitState(agent);
|
|
if (state === CircuitState.CLOSED) {
|
|
// Circuit closed: use this agent
|
|
selectedAgent = agent;
|
|
circuitState = state;
|
|
if (i > 0)
|
|
fallbackUsed = true;
|
|
break;
|
|
}
|
|
else if (state === CircuitState.HALF_OPEN) {
|
|
// Circuit half-open: allow test request
|
|
selectedAgent = agent;
|
|
circuitState = state;
|
|
if (i > 0)
|
|
fallbackUsed = true;
|
|
break;
|
|
}
|
|
// Circuit OPEN: try next agent in chain
|
|
}
|
|
if (!selectedAgent) {
|
|
// All circuits open: force use last agent with degraded service warning
|
|
selectedAgent = agentChain[agentChain.length - 1];
|
|
circuitState = CircuitState.OPEN;
|
|
fallbackUsed = true;
|
|
this.routingMetrics.failedRoutes++;
|
|
}
|
|
// Calculate confidence based on circuit state and failure history
|
|
const confidence = this.calculateConfidence(selectedAgent, circuitState);
|
|
// Estimate uncertainty if enabled
|
|
const uncertainty = this.config.enableUncertaintyEstimation
|
|
? this.estimateUncertainty(selectedAgent)
|
|
: undefined;
|
|
const routingTimeMs = performance.now() - startTime;
|
|
// Update metrics
|
|
if (fallbackUsed) {
|
|
this.routingMetrics.fallbackRoutes++;
|
|
}
|
|
else {
|
|
this.routingMetrics.successfulRoutes++;
|
|
}
|
|
this.updateAvgRoutingTime(routingTimeMs);
|
|
return {
|
|
selectedAgent,
|
|
confidence,
|
|
circuitState,
|
|
fallbackUsed,
|
|
uncertainty,
|
|
metrics: {
|
|
routingTimeMs,
|
|
failureCount: this.failureCounts.get(selectedAgent) ?? 0,
|
|
successCount: this.successCounts.get(selectedAgent) ?? 0,
|
|
},
|
|
};
|
|
}
|
|
/**
|
|
* Record successful agent execution
|
|
*
|
|
* Updates circuit breaker state:
|
|
* - Increment success count
|
|
* - Reset failure count if threshold reached
|
|
* - Transition HALF_OPEN -> CLOSED if successful
|
|
*
|
|
* @param agent - Agent that succeeded
|
|
*/
|
|
recordSuccess(agent) {
|
|
const currentState = this.getCircuitState(agent);
|
|
const successCount = (this.successCounts.get(agent) ?? 0) + 1;
|
|
this.successCounts.set(agent, successCount);
|
|
this.lastSuccessTimes.set(agent, Date.now());
|
|
// Reset failure count on success
|
|
this.failureCounts.set(agent, 0);
|
|
// Transition HALF_OPEN -> CLOSED if enough successes
|
|
if (currentState === CircuitState.HALF_OPEN) {
|
|
if (successCount >= this.config.successThreshold) {
|
|
this.circuitStates.set(agent, CircuitState.CLOSED);
|
|
this.successCounts.set(agent, 0); // Reset counter
|
|
}
|
|
}
|
|
}
|
|
/**
|
|
* Record failed agent execution
|
|
*
|
|
* Updates circuit breaker state:
|
|
* - Increment failure count
|
|
* - Transition CLOSED -> OPEN if threshold exceeded
|
|
* - Transition HALF_OPEN -> OPEN on failure
|
|
* - Schedule automatic reset
|
|
*
|
|
* @param agent - Agent that failed
|
|
*/
|
|
recordFailure(agent) {
|
|
const currentState = this.getCircuitState(agent);
|
|
const failureCount = (this.failureCounts.get(agent) ?? 0) + 1;
|
|
this.failureCounts.set(agent, failureCount);
|
|
this.lastFailureTimes.set(agent, Date.now());
|
|
// Transition CLOSED -> OPEN if threshold exceeded
|
|
if (currentState === CircuitState.CLOSED) {
|
|
if (failureCount >= this.config.failureThreshold) {
|
|
this.openCircuit(agent);
|
|
}
|
|
}
|
|
// Transition HALF_OPEN -> OPEN on any failure
|
|
if (currentState === CircuitState.HALF_OPEN) {
|
|
this.openCircuit(agent);
|
|
}
|
|
}
|
|
/**
|
|
* Get circuit state for agent
|
|
*
|
|
* @param agent - Agent name
|
|
* @returns Current circuit state
|
|
*/
|
|
getCircuitState(agent) {
|
|
return this.circuitStates.get(agent) ?? CircuitState.CLOSED;
|
|
}
|
|
/**
|
|
* Get health status for all agents
|
|
*
|
|
* @returns Array of agent health metrics
|
|
*/
|
|
getAgentHealth() {
|
|
const agents = new Set([
|
|
...Array.from(this.circuitStates.keys()),
|
|
...Array.from(this.failureCounts.keys()),
|
|
...Array.from(this.successCounts.keys()),
|
|
]);
|
|
return Array.from(agents).map(agent => {
|
|
const state = this.getCircuitState(agent);
|
|
const failureCount = this.failureCounts.get(agent) ?? 0;
|
|
const successCount = this.successCounts.get(agent) ?? 0;
|
|
const total = failureCount + successCount;
|
|
const availability = total > 0 ? successCount / total : 1.0;
|
|
return {
|
|
agent,
|
|
state,
|
|
failureCount,
|
|
successCount,
|
|
lastFailureTime: this.lastFailureTimes.get(agent),
|
|
lastSuccessTime: this.lastSuccessTimes.get(agent),
|
|
availability,
|
|
};
|
|
});
|
|
}
|
|
/**
|
|
* Get routing metrics
|
|
*
|
|
* @returns Cumulative routing statistics
|
|
*/
|
|
getMetrics() {
|
|
return { ...this.routingMetrics };
|
|
}
|
|
/**
|
|
* Manually reset circuit for agent
|
|
*
|
|
* @param agent - Agent to reset
|
|
*/
|
|
resetCircuit(agent) {
|
|
this.circuitStates.set(agent, CircuitState.CLOSED);
|
|
this.failureCounts.set(agent, 0);
|
|
this.successCounts.set(agent, 0);
|
|
// Clear reset timer if exists
|
|
const timer = this.resetTimers.get(agent);
|
|
if (timer) {
|
|
clearTimeout(timer);
|
|
this.resetTimers.delete(agent);
|
|
}
|
|
}
|
|
/**
|
|
* Hot-reload configuration
|
|
*
|
|
* Allows updating circuit breaker parameters without restart.
|
|
*
|
|
* @param config - New configuration
|
|
*/
|
|
updateConfig(config) {
|
|
Object.assign(this.config, config);
|
|
}
|
|
// ========================================================================
|
|
// Private Helper Methods
|
|
// ========================================================================
|
|
/**
|
|
* Open circuit and schedule automatic reset
|
|
*/
|
|
openCircuit(agent) {
|
|
this.circuitStates.set(agent, CircuitState.OPEN);
|
|
// Clear existing timer
|
|
const existingTimer = this.resetTimers.get(agent);
|
|
if (existingTimer) {
|
|
clearTimeout(existingTimer);
|
|
}
|
|
// Schedule automatic transition to HALF_OPEN
|
|
const timer = setTimeout(() => {
|
|
this.circuitStates.set(agent, CircuitState.HALF_OPEN);
|
|
this.successCounts.set(agent, 0); // Reset success counter for half-open test
|
|
this.resetTimers.delete(agent);
|
|
}, this.config.resetTimeout);
|
|
this.resetTimers.set(agent, timer);
|
|
}
|
|
/**
|
|
* Calculate confidence score
|
|
*/
|
|
calculateConfidence(agent, circuitState) {
|
|
const successCount = this.successCounts.get(agent) ?? 0;
|
|
const failureCount = this.failureCounts.get(agent) ?? 0;
|
|
const total = successCount + failureCount;
|
|
if (total === 0) {
|
|
return 0.8; // Default confidence for new agents
|
|
}
|
|
const successRate = successCount / total;
|
|
// Adjust confidence based on circuit state
|
|
let stateMultiplier;
|
|
switch (circuitState) {
|
|
case CircuitState.CLOSED:
|
|
stateMultiplier = 1.0;
|
|
break;
|
|
case CircuitState.HALF_OPEN:
|
|
stateMultiplier = 0.7;
|
|
break;
|
|
case CircuitState.OPEN:
|
|
stateMultiplier = 0.3;
|
|
break;
|
|
}
|
|
return successRate * stateMultiplier;
|
|
}
|
|
/**
|
|
* Estimate uncertainty for routing decision
|
|
*
|
|
* Uses failure history and circuit state to estimate decision uncertainty.
|
|
* Higher uncertainty indicates less reliable routing.
|
|
*
|
|
* @param agent - Agent to estimate uncertainty for
|
|
* @returns Uncertainty score (0-1, lower is better)
|
|
*/
|
|
estimateUncertainty(agent) {
|
|
const failureCount = this.failureCounts.get(agent) ?? 0;
|
|
const successCount = this.successCounts.get(agent) ?? 0;
|
|
const total = failureCount + successCount;
|
|
if (total === 0) {
|
|
return 0.5; // High uncertainty for untested agents
|
|
}
|
|
// Uncertainty based on:
|
|
// 1. Failure rate
|
|
// 2. Sample size (low samples = high uncertainty)
|
|
// 3. Recent failures
|
|
const failureRate = failureCount / total;
|
|
const sampleSizeUncertainty = Math.exp(-total / 10); // Decreases with more samples
|
|
const recencyFactor = this.getRecencyFactor(agent);
|
|
return (failureRate * 0.5 + sampleSizeUncertainty * 0.3 + recencyFactor * 0.2);
|
|
}
|
|
/**
|
|
* Calculate recency factor based on time since last failure
|
|
*/
|
|
getRecencyFactor(agent) {
|
|
const lastFailure = this.lastFailureTimes.get(agent);
|
|
if (!lastFailure) {
|
|
return 0; // No recent failures
|
|
}
|
|
const timeSinceFailure = Date.now() - lastFailure;
|
|
const halfLife = this.config.resetTimeout / 2;
|
|
// Exponential decay: recent failures have higher weight
|
|
return Math.exp(-timeSinceFailure / halfLife);
|
|
}
|
|
/**
|
|
* Update average routing time (exponential moving average)
|
|
*/
|
|
updateAvgRoutingTime(newTime) {
|
|
const alpha = 0.1; // EMA smoothing factor
|
|
this.routingMetrics.avgRoutingTimeMs =
|
|
this.routingMetrics.avgRoutingTimeMs * (1 - alpha) + newTime * alpha;
|
|
}
|
|
/**
|
|
* Simple string hash function for rate limiting
|
|
*/
|
|
hashString(str) {
|
|
let hash = 0;
|
|
for (let i = 0; i < str.length; i++) {
|
|
const char = str.charCodeAt(i);
|
|
hash = ((hash << 5) - hash) + char;
|
|
hash = hash & hash; // Convert to 32-bit integer
|
|
}
|
|
return `route-${Math.abs(hash)}`;
|
|
}
|
|
/**
|
|
* Cleanup resources on shutdown
|
|
*/
|
|
destroy() {
|
|
// Clear all reset timers
|
|
this.resetTimers.forEach((timer) => {
|
|
clearTimeout(timer);
|
|
});
|
|
this.resetTimers.clear();
|
|
// Destroy rate limiter
|
|
this.rateLimiter.destroy();
|
|
}
|
|
}
|
|
//# sourceMappingURL=CircuitBreakerRouter.js.map
|