#!/usr/bin/env node
/**
 * Comprehensive benchmark for ONNX local inference.
 * Tests CPU performance against the 15-25 tokens/sec target.
 */
import { ONNXLocalProvider } from './providers/onnx-local.js';
async function runBenchmark() {
    console.log('🏁 ONNX Local Inference Benchmark (Phi-4 CPU)\n');
    console.log('Target: 15-25 tokens/sec on CPU');
    console.log('================================================\n');
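    // Per the path name: int4 round-to-nearest (RTN) quantized Phi-4, block size 32,
    // accuracy level 4 -- the CPU/mobile ONNX build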
    const provider = new ONNXLocalProvider({
        modelPath: './models/phi-4/cpu_and_mobile/cpu-int4-rtn-block-32-acc-level-4/model.onnx',
        executionProviders: ['cpu'],
        maxTokens: 50,
        temperature: 0.7
    });
    const results = [];
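    // Each entry: { test, tokens, latency (ms), tokensPerSecond }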
    // Test 1: Short response
    console.log('Test 1: Short Response (Math)');
    console.log('==============================');
    const test1Start = Date.now();
    const response1 = await provider.chat({
        model: 'phi-4',
        messages: [
            { role: 'user', content: 'What is 2+2?' }
        ],
        maxTokens: 20
    });
    const test1Latency = Date.now() - test1Start;
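    // Wall-clock throughput: latency includes prompt prefill, so very short runs
    // understate steady-state decode speed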
    const test1TPS = (response1.usage?.outputTokens || 0) / (test1Latency / 1000);
    console.log(`✅ Response: ${response1.content[0]?.type === 'text' ? response1.content[0].text : ''}`);
    console.log(`⏱️ ${test1Latency}ms | ${test1TPS.toFixed(1)} tokens/sec\n`);
    results.push({
        test: 'Short Math',
        tokens: response1.usage?.outputTokens || 0,
        latency: test1Latency,
        tokensPerSecond: test1TPS
    });
    // Test 2: Medium response
    console.log('Test 2: Medium Response (Reasoning)');
    console.log('=====================================');
    const test2Start = Date.now();
    const response2 = await provider.chat({
        model: 'phi-4',
        messages: [
            { role: 'user', content: 'Explain why the sky is blue in one sentence.' }
        ],
        maxTokens: 30
    });
    const test2Latency = Date.now() - test2Start;
    const test2TPS = (response2.usage?.outputTokens || 0) / (test2Latency / 1000);
    console.log(`✅ Response: ${response2.content[0]?.type === 'text' ? response2.content[0].text : ''}`);
    console.log(`⏱️ ${test2Latency}ms | ${test2TPS.toFixed(1)} tokens/sec\n`);
    results.push({
        test: 'Medium Reasoning',
        tokens: response2.usage?.outputTokens || 0,
        latency: test2Latency,
        tokensPerSecond: test2TPS
    });
    // Test 3: Longer response
    console.log('Test 3: Longer Response (Creative)');
    console.log('====================================');
    const test3Start = Date.now();
    const response3 = await provider.chat({
        model: 'phi-4',
        messages: [
            { role: 'user', content: 'List 5 programming languages.' }
        ],
        maxTokens: 50
    });
    const test3Latency = Date.now() - test3Start;
    const test3TPS = (response3.usage?.outputTokens || 0) / (test3Latency / 1000);
    console.log(`✅ Response: ${response3.content[0]?.type === 'text' ? response3.content[0].text : ''}`);
    console.log(`⏱️ ${test3Latency}ms | ${test3TPS.toFixed(1)} tokens/sec\n`);
    results.push({
        test: 'Longer Creative',
        tokens: response3.usage?.outputTokens || 0,
        latency: test3Latency,
        tokensPerSecond: test3TPS
    });
    // Test 4: Multi-turn conversation
    console.log('Test 4: Multi-Turn Conversation');
    console.log('================================');
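    // The full conversation is re-encoded as the prompt, so prefill cost grows
    // with history length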
    const test4Start = Date.now();
    const response4 = await provider.chat({
        model: 'phi-4',
        messages: [
            { role: 'user', content: 'Hello!' },
            { role: 'assistant', content: 'Hello! How can I help you today?' },
            { role: 'user', content: 'Tell me a fun fact about computers.' }
        ],
        maxTokens: 40
    });
    const test4Latency = Date.now() - test4Start;
    const test4TPS = (response4.usage?.outputTokens || 0) / (test4Latency / 1000);
    console.log(`✅ Response: ${response4.content[0]?.type === 'text' ? response4.content[0].text : ''}`);
    console.log(`⏱️ ${test4Latency}ms | ${test4TPS.toFixed(1)} tokens/sec\n`);
    results.push({
        test: 'Multi-Turn',
        tokens: response4.usage?.outputTokens || 0,
        latency: test4Latency,
        tokensPerSecond: test4TPS
    });
    // Summary
    console.log('\n📊 Benchmark Summary');
    console.log('====================\n');
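    // Unweighted mean of per-test rates (not total tokens / total time),
    // so each test counts equally regardless of length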
    const avgTPS = results.reduce((sum, r) => sum + r.tokensPerSecond, 0) / results.length;
    const avgLatency = results.reduce((sum, r) => sum + r.latency, 0) / results.length;
    const totalTokens = results.reduce((sum, r) => sum + r.tokens, 0);
    console.table(results.map(r => ({
        Test: r.test,
        Tokens: r.tokens,
        'Latency (ms)': r.latency,
        'Tokens/Sec': r.tokensPerSecond.toFixed(1)
    })));
    console.log(`\n📈 Performance Metrics:`);
    console.log(`   Average Tokens/Sec: ${avgTPS.toFixed(1)}`);
    console.log(`   Average Latency: ${avgLatency.toFixed(0)}ms`);
    console.log(`   Total Tokens Generated: ${totalTokens}`);
    console.log(`\n🎯 Target Validation:`);
    const targetMin = 15;
    const targetMax = 25;
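    // PASS window allows up to 1.5x the target max before reporting FAST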
    if (avgTPS >= targetMin && avgTPS <= targetMax * 1.5) {
        console.log(`   ✅ PASS: ${avgTPS.toFixed(1)} tokens/sec is within acceptable range`);
    }
    else if (avgTPS < targetMin) {
        console.log(`   ⚠️ SLOW: ${avgTPS.toFixed(1)} tokens/sec is below target (${targetMin}+)`);
    }
    else {
        console.log(`   🚀 FAST: ${avgTPS.toFixed(1)} tokens/sec exceeds target!`);
    }
    console.log(`\n💰 Cost Savings:`);
    console.log(`   Local Inference: $0.00`);
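    // Assumes a rough ~$0.00003/token blended API rate; adjust to current provider pricing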
    console.log(`   Anthropic Equivalent: ~$${(totalTokens * 0.00003).toFixed(4)}`);
    console.log(`   Savings: 100%`);
    console.log(`\n🔒 Privacy:`);
    console.log(`   ✅ All processing local`);
    console.log(`   ✅ No data sent to cloud`);
    console.log(`   ✅ GDPR/HIPAA friendly: no data leaves the machine`);
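    // Free the underlying ONNX Runtime session and its native resources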
    await provider.dispose();
}
runBenchmark().catch(error => {
    console.error('❌ Benchmark failed:', error);
    process.exit(1);
});
//# sourceMappingURL=test-onnx-benchmark.js.map