tasq/node_modules/agentic-flow/docs/plans/agent-booster/03-BENCHMARKS.md

16 KiB
Raw Permalink Blame History

Agent Booster: Benchmark Methodology

🎯 Benchmark Goals

  1. Establish baseline - Measure Morph LLM performance with Anthropic models
  2. Measure speedup - Quantify Agent Booster performance improvements
  3. Validate accuracy - Ensure quality is maintained or improved
  4. Calculate savings - Demonstrate cost reduction
  5. Identify limitations - Understand where Agent Booster excels vs struggles

📊 Benchmark Suite Structure

benchmarks/
├── datasets/                    # Test code samples
│   ├── javascript/
│   │   ├── simple/             # 40 samples
│   │   ├── medium/             # 40 samples
│   │   └── complex/            # 20 samples
│   ├── typescript/
│   ├── python/
│   └── rust/
│
├── baselines/                   # Morph LLM baselines
│   ├── morph-claude-sonnet-4.ts
│   ├── morph-claude-opus-4.ts
│   └── morph-claude-haiku-4.ts
│
├── agent-booster/               # Agent Booster tests
│   ├── native-addon.ts
│   ├── wasm.ts
│   └── typescript-fallback.ts
│
├── results/                     # Benchmark outputs
│   ├── raw/                    # Raw JSON results
│   ├── analysis/               # Processed results
│   └── reports/                # HTML/PDF reports
│
└── scripts/
    ├── run-all.sh              # Run full suite
    ├── run-baseline.sh         # Morph LLM only
    ├── run-agent-booster.sh    # Agent Booster only
    ├── compare.ts              # Generate comparison
    └── visualize.ts            # Create charts

📝 Test Datasets

Simple Edits (40 samples per language)

Characteristics:

  • Single function/method modifications
  • Clear, unambiguous edit descriptions
  • < 50 lines of code
  • Expected accuracy: 99%+

Examples:

  1. Add parameter

    // Original
    function greet(name: string) {
      return `Hello, ${name}!`;
    }
    
    // Edit: "add optional greeting parameter with default 'Hello'"
    
    // Expected
    function greet(name: string, greeting: string = 'Hello') {
      return `${greeting}, ${name}!`;
    }
    
  2. Add error handling

    // Original
    function parseJSON(text: string) {
      return JSON.parse(text);
    }
    
    // Edit: "add try-catch error handling"
    
    // Expected
    function parseJSON(text: string) {
      try {
        return JSON.parse(text);
      } catch (error) {
        console.error('Failed to parse JSON:', error);
        return null;
      }
    }
    
  3. Rename variable

    // Edit: "rename 'data' to 'userData'"
    
  4. Add return type

    // Edit: "add explicit return type annotation"
    
  5. Add JSDoc comment

    // Edit: "add JSDoc documentation"
    

Medium Edits (40 samples per language)

Characteristics:

  • Multi-line function bodies
  • Some ambiguity in edit description
  • 50-200 lines of code
  • Expected accuracy: 95%+

Examples:

  1. Convert to async/await

    // Edit: "convert promises to async/await"
    
  2. Add input validation

    // Edit: "add parameter validation for email format"
    
  3. Extract helper function

    // Edit: "extract password hashing logic into separate function"
    
  4. Add type safety

    // Edit: "replace 'any' types with proper types"
    

Complex Edits (20 samples per language)

Characteristics:

  • Architectural changes
  • Multiple functions affected
  • 200+ lines of code
  • Expected accuracy: 85%+

Examples:

  1. Refactor to design pattern

    // Edit: "refactor to use Strategy pattern for authentication"
    
  2. Add dependency injection

    // Edit: "convert to use dependency injection for database"
    
  3. Extract class

    // Edit: "extract user validation into separate class"
    

Baseline: Morph LLM Performance

Test Configuration

// benchmarks/baselines/morph-claude-sonnet-4.ts

import Anthropic from '@anthropic-ai/sdk';

const MORPH_API_KEY = process.env.MORPH_API_KEY;
const MORPH_BASE_URL = 'https://api.morphllm.com/v1';

/** Configuration for one Morph LLM baseline run: which Anthropic model and Morph variant to exercise, over which dataset, and how many repetitions per sample. */
interface MorphBenchmarkConfig {
  model: 'claude-sonnet-4' | 'claude-opus-4' | 'claude-haiku-4'; // Anthropic model under test (used for labeling results)
  morphModel: 'morph-v3-fast' | 'morph-v3-large'; // Morph apply-model variant actually sent to the API
  dataset: string; // dataset identifier passed to loadDataset()
  iterations: number; // repetitions per sample; >1 stabilizes latency statistics
}

/**
 * Runs the Morph LLM baseline over a dataset.
 *
 * Each sample is submitted `config.iterations` times; every request is
 * timed, validated against the expected merge, syntax-checked, and costed
 * from the returned token usage. A 1s sleep between requests keeps the
 * run under the API rate limit.
 *
 * @param config - Model, Morph variant, dataset name, and iteration count.
 * @returns Aggregated statistics from `aggregateResults`.
 */
async function benchmarkMorph(config: MorphBenchmarkConfig) {
  const client = new Anthropic({
    apiKey: MORPH_API_KEY,
    baseURL: MORPH_BASE_URL,
  });

  const results = [];
  const dataset = loadDataset(config.dataset);

  for (const sample of dataset) {
    for (let i = 0; i < config.iterations; i++) {
      const startTime = performance.now();

      const response = await client.messages.create({
        model: config.morphModel,
        max_tokens: 4096,
        messages: [{
          role: 'user',
          content: formatMorphPrompt(sample.original, sample.edit),
        }],
      });

      const latency = performance.now() - startTime;

      // `response.content` is a union of block types and only text blocks
      // carry `.text`. Reading `content[0].text` unconditionally (as before)
      // is a strict-TS type error and crashes if the first block is not a
      // text block (e.g. tool use). Fall back to '' so validation still runs.
      const firstBlock = response.content[0];
      const mergedCode = firstBlock?.type === 'text' ? firstBlock.text : '';

      // Validate result
      const isCorrect = validateResult(mergedCode, sample.expected);
      const syntaxValid = checkSyntax(mergedCode, sample.language);

      // Calculate cost
      const cost = calculateCost(response.usage);

      results.push({
        sample_id: sample.id,
        iteration: i,
        model: config.model,
        morph_model: config.morphModel,
        latency_ms: latency,
        correct: isCorrect,
        syntax_valid: syntaxValid,
        cost_usd: cost,
        tokens_input: response.usage.input_tokens,
        tokens_output: response.usage.output_tokens,
        timestamp: new Date().toISOString(),
      });

      // Rate limiting
      await sleep(1000); // 1 req/sec to be safe
    }
  }

  return aggregateResults(results);
}

/**
 * Builds the Morph apply prompt: the edit instruction, the original code,
 * and a fixed update directive, each wrapped in its XML-style tag on its
 * own line.
 *
 * @param original - The unmodified source code to be edited.
 * @param edit - Natural-language description of the edit to apply.
 * @returns The newline-joined prompt string expected by the Morph API.
 */
function formatMorphPrompt(original: string, edit: string): string {
  const sections = [
    `<instruction>${edit}</instruction>`,
    `<code>${original}</code>`,
    `<update>Apply the edit</update>`,
  ];
  return sections.join('\n');
}

/**
 * Converts token usage into an estimated USD cost.
 *
 * The original hard-coded Claude Sonnet 4 pricing ("example"); the rates
 * are now a defaulted parameter so Opus/Haiku baselines can reuse this
 * function. Calling with one argument is unchanged in behavior.
 *
 * @param usage - Input/output token counts from the API response.
 * @param rates - Per-1K-token prices in USD; defaults to Claude Sonnet 4
 *                pricing ($0.003 input / $0.015 output per 1K tokens).
 * @returns Estimated cost in USD for the single request.
 */
function calculateCost(
  usage: { input_tokens: number; output_tokens: number },
  rates: { inputPer1k: number; outputPer1k: number } = { inputPer1k: 0.003, outputPer1k: 0.015 },
): number {
  const inputCost = (usage.input_tokens / 1000) * rates.inputPer1k;
  const outputCost = (usage.output_tokens / 1000) * rates.outputPer1k;
  return inputCost + outputCost;
}

Anthropic Models to Test

1. Claude Sonnet 4 (claude-sonnet-4-20250514)

  • Use Case: Production default (best balance)
  • Expected Performance: 6000ms latency, 98% accuracy
  • Cost: ~$0.01 per edit

2. Claude Opus 4 (claude-opus-4-20250514)

  • Use Case: Maximum accuracy
  • Expected Performance: 8000ms latency, 99% accuracy
  • Cost: ~$0.02 per edit

3. Claude Haiku 4 (claude-haiku-4-20250320)

  • Use Case: Speed-optimized
  • Expected Performance: 3000ms latency, 96% accuracy
  • Cost: ~$0.005 per edit

Morph Model Variants

1. morph-v3-large

  • Use Case: Best accuracy
  • Expected: Slower but more accurate

2. morph-v3-fast

  • Use Case: Speed-optimized
  • Expected: Faster but slightly less accurate

Agent Booster Benchmarks

Test Configuration

// benchmarks/agent-booster/native-addon.ts

import { AgentBooster } from 'agent-booster';

/** Configuration for one Agent Booster benchmark run: embedding model, dataset, repetitions per sample, and which runtime variant is being measured. */
interface AgentBoosterBenchmarkConfig {
  model: 'jina-code-v2' | 'all-MiniLM-L6-v2'; // embedding model used for edit matching
  dataset: string; // dataset identifier passed to loadDataset()
  iterations: number; // repetitions per sample; >1 stabilizes latency statistics
  variant: 'native' | 'wasm' | 'typescript'; // runtime under test (used for labeling results)
}

/**
 * Runs the Agent Booster benchmark over a dataset.
 *
 * Each sample is applied `config.iterations` times; every attempt is timed
 * and validated. Failures are recorded as result rows (correct=false) with
 * the error message so error rates can be analyzed alongside successes.
 *
 * @param config - Embedding model, dataset, iteration count, and variant.
 * @returns Aggregated statistics from `aggregateResults`.
 */
async function benchmarkAgentBooster(config: AgentBoosterBenchmarkConfig) {
  const booster = new AgentBooster({
    model: config.model,
    confidenceThreshold: 0.0, // Disable fallback for pure benchmark
  });

  const results = [];
  const dataset = loadDataset(config.dataset);

  for (const sample of dataset) {
    for (let i = 0; i < config.iterations; i++) {
      const startTime = performance.now();

      try {
        const result = await booster.applyEdit({
          originalCode: sample.original,
          editSnippet: sample.edit,
          language: sample.language,
        });

        const latency = performance.now() - startTime;

        // Validate result
        const isCorrect = validateResult(result.mergedCode, sample.expected);
        const syntaxValid = checkSyntax(result.mergedCode, sample.language);

        results.push({
          sample_id: sample.id,
          iteration: i,
          variant: config.variant,
          model: config.model,
          latency_ms: latency,
          correct: isCorrect,
          syntax_valid: syntaxValid,
          confidence: result.confidence,
          strategy: result.strategy,
          cost_usd: 0, // Always $0
          timestamp: new Date().toISOString(),
        });
      } catch (error) {
        // `catch` bindings are `unknown` under strict TS; the original read
        // `error.message` directly, which fails type-checking and throws at
        // runtime for non-Error throwables. Narrow before reading.
        const message = error instanceof Error ? error.message : String(error);

        results.push({
          sample_id: sample.id,
          iteration: i,
          variant: config.variant,
          model: config.model, // keep error rows comparable with success rows
          error: message,
          latency_ms: performance.now() - startTime,
          correct: false,
          syntax_valid: false,
          cost_usd: 0,
          timestamp: new Date().toISOString(),
        });
      }
    }
  }

  return aggregateResults(results);
}

Variants to Test

1. Native Addon (napi-rs)

  • Platform: Node.js on native hardware
  • Expected: Fastest (30-50ms)

2. WASM

  • Platform: Node.js with WASM
  • Expected: Medium (50-100ms)

3. TypeScript Fallback

  • Platform: Pure TypeScript (no Rust)
  • Expected: Slower (100-200ms)

📊 Metrics to Collect

Performance Metrics

/** Latency, throughput, memory, and startup statistics aggregated over one benchmark run. */
interface PerformanceMetrics {
  // Latency
  latency_p50: number;      // Median
  latency_p95: number;      // 95th percentile
  latency_p99: number;      // 99th percentile
  latency_max: number;      // Maximum
  latency_min: number;      // Minimum
  latency_mean: number;     // Average
  latency_stddev: number;   // Standard deviation

  // Throughput
  throughput_edits_per_sec: number;   // completed edits per second
  throughput_tokens_per_sec: number;  // processed tokens per second

  // Memory
  memory_peak_mb: number;   // peak resident memory during the run
  memory_avg_mb: number;    // average resident memory during the run

  // Startup
  cold_start_ms: number;    // first-call latency (model load included)
  warm_start_ms: number;    // latency once caches/models are loaded
}

Accuracy Metrics

/** Correctness statistics for one benchmark run, overall and broken down by edit complexity. */
interface AccuracyMetrics {
  // Overall
  accuracy_exact_match: number;      // Exact code match
  accuracy_semantic_match: number;   // Semantically equivalent
  accuracy_syntax_valid: number;     // Valid syntax

  // By complexity
  accuracy_simple: number;   // Simple edits
  accuracy_medium: number;   // Medium edits
  accuracy_complex: number;  // Complex edits

  // Confidence correlation
  confidence_avg: number;                   // mean reported confidence
  confidence_accuracy_correlation: number;  // correlation between confidence and correctness

  // Error rates
  false_positive_rate: number;  // reported success but incorrect output
  false_negative_rate: number;  // reported failure but output was correct
  syntax_error_rate: number;    // fraction of outputs with invalid syntax
}

Cost Metrics

/** Dollar-cost and token-usage statistics for one benchmark run, including savings relative to the baseline. */
interface CostMetrics {
  cost_per_edit: number;            // average USD per edit
  cost_total: number;               // total USD for the run
  cost_saved_vs_baseline: number;   // absolute USD saved vs the Morph baseline
  cost_saved_percentage: number;    // savings as a percentage of baseline cost

  // Token usage (for LLM baselines)
  tokens_per_edit_avg: number;   // total tokens per edit (input + output)
  tokens_input_avg: number;      // average input tokens per edit
  tokens_output_avg: number;     // average output tokens per edit
}

📈 Comparison Analysis

Statistical Tests

/** Head-to-head comparison of Agent Booster against the Morph baseline: speed, accuracy, cost, and an overall quality verdict. */
interface ComparisonAnalysis {
  // Speed comparison
  speedup_factor: number;          // Agent Booster vs Morph
  speedup_confidence_interval: [number, number];
  speedup_p_value: number;         // T-test significance

  // Accuracy comparison
  accuracy_difference: number;     // Percentage points
  accuracy_significance: boolean;  // Statistically significant?

  // Cost savings
  cost_savings_per_edit: number;
  cost_savings_per_1000_edits: number;
  break_even_point: number;        // Number of edits to break even

  // Quality metrics
  quality_score: number;           // Weighted score (accuracy + speed)
  recommended_use_cases: string[];
}

Visualization

// Generate comparison charts
async function generateCharts(results: BenchmarkResults) {
  await generateLatencyChart(results);
  await generateAccuracyChart(results);
  await generateCostChart(results);
  await generateConfidenceDistribution(results);
  await generateComplexityBreakdown(results);
}

🎯 Benchmark Execution Plan

Phase 1: Baseline (Week 1)

# 1. Setup Morph LLM account and get API key
export MORPH_API_KEY=sk-morph-xxx

# 2. Prepare datasets
npm run benchmark:prepare-datasets

# 3. Run Morph + Claude Sonnet 4 baseline
npm run benchmark:baseline -- --model claude-sonnet-4 --iterations 3

# 4. Run Morph + Claude Opus 4 baseline
npm run benchmark:baseline -- --model claude-opus-4 --iterations 3

# 5. Run Morph + Claude Haiku 4 baseline
npm run benchmark:baseline -- --model claude-haiku-4 --iterations 3

# 6. Analyze baseline results
npm run benchmark:analyze-baseline

Expected Duration: ~2 hours (100 samples × 3 iterations × 3 models = 900 requests × ~7s each, including the 1s rate-limit sleep)

Expected Cost: ~$9-18 (900 edits × $0.01-0.02 per edit)

Phase 2: Agent Booster (Week 2)

# 1. Build Agent Booster
cargo build --release
npm run build

# 2. Download embedding models
npm run download-models

# 3. Run native addon benchmarks
npm run benchmark:agent-booster -- --variant native --iterations 10

# 4. Run WASM benchmarks
npm run benchmark:agent-booster -- --variant wasm --iterations 10

# 5. Run TypeScript fallback benchmarks
npm run benchmark:agent-booster -- --variant typescript --iterations 10

# 6. Analyze Agent Booster results
npm run benchmark:analyze-agent-booster

Expected Duration: <10 minutes (100 samples × 10 iterations × 3 variants = 3,000 edits × ~50ms ≈ 2.5 minutes of compute, plus build and model-download setup)

Expected Cost: $0

Phase 3: Comparison (Week 3)

# 1. Generate comparison analysis
npm run benchmark:compare

# 2. Generate charts and visualizations
npm run benchmark:visualize

# 3. Generate HTML report
npm run benchmark:report

# 4. Publish results
npm run benchmark:publish

📋 Expected Results

Latency Comparison

Metric Morph + Sonnet 4 Agent Booster (Native) Improvement
p50 5,800ms 35ms 166x faster
p95 8,200ms 52ms 158x faster
p99 12,000ms 85ms 141x faster
Max 18,000ms 150ms 120x faster

Accuracy Comparison

Complexity Morph + Sonnet 4 Agent Booster Difference
Simple 99.2% 98.5% -0.7%
Medium 97.8% 96.2% -1.6%
Complex 96.1% 93.8% -2.3%
Overall 98.0% 96.8% -1.2%

Cost Comparison (1000 edits)

Solution Total Cost Cost per Edit Savings
Morph + Sonnet 4 $10.00 $0.010 -
Morph + Opus 4 $20.00 $0.020 -
Agent Booster $0.00 $0.000 100%

Recommended configuration, based on the benchmark results above:

// For maximum performance
const config = {
  primaryMethod: 'agent-booster',
  model: 'jina-code-v2',
  confidenceThreshold: 0.65,
  fallbackToMorph: true,
  morphModel: 'claude-sonnet-4'
};

// Expected results with 1000 edits:
// - 850 edits via Agent Booster (85%, avg 40ms, $0)
// - 150 edits via Morph fallback (15%, avg 6000ms, $1.50)
// - Overall avg latency: 934ms (vs 6000ms pure Morph)
// - Overall cost: $1.50 (vs $10 pure Morph)
// - 6.4x faster, 85% cost savings

📊 Benchmark Report Template

# Agent Booster Benchmark Report

**Date**: YYYY-MM-DD
**Version**: agent-booster@0.1.0
**Dataset**: 100 samples (40 simple, 40 medium, 20 complex)
**Iterations**: 3 per sample (baseline), 10 per sample (Agent Booster)

## Executive Summary

- **Speed**: Agent Booster is **166x faster** than Morph + Claude Sonnet 4
- **Accuracy**: 96.8% vs 98.0% (-1.2 percentage points)
- **Cost**: **100% savings** ($0 vs $0.01 per edit)
- **Recommendation**: Use Agent Booster with fallback for best ROI

## Detailed Results

[Charts and tables here]

## Conclusions

[Analysis and recommendations]