|
/**
 * Metrics Collection and Analysis for ReasoningBank Benchmarks
 */

/**
 * Two-sided standard-normal critical values for the supported confidence levels.
 * Levels not listed here fall back to the 99% value (see calculateConfidenceInterval).
 */
const Z_SCORES = new Map([
    [0.9, 1.645],
    [0.95, 1.96],
    [0.99, 2.576],
]);

/**
 * Accumulates per-task benchmark results for a baseline agent and a
 * ReasoningBank agent, and derives aggregate metrics, relative improvements
 * and human-readable insights from them.
 */
export class MetricsCollector {
    baselineResults = [];
    reasoningbankResults = [];
    learningCurve = [];

    /**
     * Stores a task result in the bucket matching its agent type.
     * Any `agentType` other than 'baseline' is treated as a ReasoningBank result.
     *
     * @param {object} result - Task result; fields read elsewhere in this class are
     *   `agentType`, `success`, `tokens`, `latency`, `error`, `memoriesUsed`,
     *   `memoriesCreated` and `confidence`.
     */
    addResult(result) {
        if (result.agentType === 'baseline') {
            this.baselineResults.push(result);
        }
        else {
            this.reasoningbankResults.push(result);
        }
    }

    /**
     * Appends one point to the learning curve.
     *
     * @param {object} point - Learning-curve point; `baselineSuccess` and
     *   `reasoningbankSuccess` are read by calculateImprovement.
     */
    addLearningPoint(point) {
        this.learningCurve.push(point);
    }

    /**
     * Aggregates a list of task results into summary metrics.
     *
     * @param {object[]} results - Task results for a single agent.
     * @returns {object} Success rate, task counts, token/latency totals and averages,
     *   collected errors and, when at least one result defines `memoriesUsed`,
     *   the memory totals and average confidence. All rates and averages are 0
     *   for an empty list.
     */
    calculateAgentMetrics(results) {
        const taskCount = results.length;
        const successful = results.filter(r => r.success);
        // A missing tokens/latency value counts as 0 so that one incomplete
        // result cannot turn every total and average into NaN.
        const totalTokens = results.reduce((sum, r) => sum + (r.tokens ?? 0), 0);
        const totalLatency = results.reduce((sum, r) => sum + (r.latency ?? 0), 0);
        const errors = results.filter(r => r.error).map(r => r.error);
        const metrics = {
            successRate: taskCount > 0 ? successful.length / taskCount : 0,
            totalTasks: taskCount,
            successfulTasks: successful.length,
            avgTokens: taskCount > 0 ? totalTokens / taskCount : 0,
            totalTokens,
            avgLatency: taskCount > 0 ? totalLatency / taskCount : 0,
            totalLatency,
            errors
        };
        // Add ReasoningBank-specific metrics
        const rbResults = results.filter(r => r.memoriesUsed !== undefined);
        if (rbResults.length > 0) {
            const totalConfidence = rbResults.reduce((sum, r) => sum + (r.confidence || 0), 0);
            metrics.memoriesUsed = rbResults.reduce((sum, r) => sum + (r.memoriesUsed || 0), 0);
            metrics.memoriesCreated = rbResults.reduce((sum, r) => sum + (r.memoriesCreated || 0), 0);
            metrics.avgConfidence = totalConfidence / rbResults.length;
        }
        return metrics;
    }

    /**
     * Compares ReasoningBank metrics against baseline metrics.
     *
     * @param {object} baseline - Metrics as returned by calculateAgentMetrics.
     * @param {object} reasoningbank - Metrics as returned by calculateAgentMetrics.
     * @returns {object} Formatted and raw deltas for success rate, tokens and latency,
     *   plus `learningVelocity`, which stays undefined when the learning curve is
     *   empty or ReasoningBank never reaches a success value of 1.0.
     */
    calculateImprovement(baseline, reasoningbank) {
        const successRateDelta = reasoningbank.successRate - baseline.successRate;
        const successRatePercent = baseline.successRate > 0
            ? (successRateDelta / baseline.successRate) * 100
            : (reasoningbank.successRate > 0 ? 100 : 0);
        const tokenDelta = baseline.avgTokens - reasoningbank.avgTokens;
        const tokenSavings = baseline.avgTokens > 0
            ? (tokenDelta / baseline.avgTokens) * 100
            : 0;
        const latencyDelta = reasoningbank.avgLatency - baseline.avgLatency;
        const latencyOverhead = baseline.avgLatency > 0
            ? (latencyDelta / baseline.avgLatency) * 100
            : 0;
        // Calculate learning velocity (iterations to reach 100% success).
        // findIndex is zero-based, so convert to 1-based iteration counts before
        // dividing; otherwise success at the very first point divides by zero.
        let learningVelocity;
        if (this.learningCurve.length > 0) {
            const firstSuccess = this.learningCurve.findIndex(p => p.reasoningbankSuccess === 1.0);
            const baselineFirstSuccess = this.learningCurve.findIndex(p => p.baselineSuccess === 1.0);
            if (firstSuccess !== -1) {
                const reasoningbankIterations = firstSuccess + 1;
                // When the baseline never reaches 1.0, the full curve length
                // stands in for its iteration count.
                const baselineIterations = baselineFirstSuccess !== -1
                    ? baselineFirstSuccess + 1
                    : this.learningCurve.length;
                learningVelocity = baselineIterations / reasoningbankIterations;
            }
        }
        return {
            successRateDelta: this.formatPercent(successRateDelta),
            successRatePercent,
            tokenEfficiency: this.formatPercent(tokenSavings / 100),
            tokenSavings,
            latencyOverhead: this.formatPercent(latencyOverhead / 100),
            latencyDelta,
            learningVelocity
        };
    }

    /**
     * Formats a fraction as a signed percentage with one decimal place.
     *
     * @param {number} value - Fraction, e.g. 0.25 for 25%.
     * @returns {string} For example "+25.0%" or "-5.0%".
     */
    formatPercent(value) {
        const sign = value >= 0 ? '+' : '';
        return `${sign}${(value * 100).toFixed(1)}%`;
    }

    /**
     * Builds the complete result record for one scenario from the collected data.
     *
     * @param {string} scenarioName - Label stored on the returned record.
     * @returns {object} Baseline metrics, ReasoningBank metrics, improvement,
     *   a shallow copy of the learning curve and an ISO-8601 timestamp.
     */
    generateScenarioResults(scenarioName) {
        const baseline = this.calculateAgentMetrics(this.baselineResults);
        const reasoningbank = this.calculateAgentMetrics(this.reasoningbankResults);
        const improvement = this.calculateImprovement(baseline, reasoningbank);
        return {
            scenarioName,
            baseline,
            reasoningbank,
            improvement,
            learningCurve: [...this.learningCurve],
            timestamp: new Date().toISOString()
        };
    }

    /** Discards all collected results and learning-curve points. */
    reset() {
        this.baselineResults = [];
        this.reasoningbankResults = [];
        this.learningCurve = [];
    }

    // Statistical analysis methods

    /**
     * Population standard deviation (divides by N, not N - 1).
     *
     * @param {number[]} values
     * @returns {number} 0 for an empty list.
     */
    calculateStandardDeviation(values) {
        if (values.length === 0) {
            return 0;
        }
        const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
        const variance = values.reduce((sum, v) => sum + Math.pow(v - mean, 2), 0) / values.length;
        return Math.sqrt(variance);
    }

    /**
     * Normal-approximation confidence interval around the mean.
     *
     * @param {number[]} values
     * @param {number} [confidenceLevel=0.95] - 0.90, 0.95 and 0.99 are supported;
     *   any other level falls back to the 99% critical value.
     * @returns {{lower: number, upper: number, mean: number}} All zeros for an empty list.
     */
    calculateConfidenceInterval(values, confidenceLevel = 0.95) {
        if (values.length === 0) {
            return { lower: 0, upper: 0, mean: 0 };
        }
        const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
        const std = this.calculateStandardDeviation(values);
        const z = Z_SCORES.get(confidenceLevel) ?? 2.576;
        const margin = z * (std / Math.sqrt(values.length));
        return {
            lower: mean - margin,
            upper: mean + margin,
            mean
        };
    }

    // Performance insights

    /**
     * Turns a scenario result into a list of human-readable insight strings.
     *
     * @param {object} results - Record as returned by generateScenarioResults;
     *   only `reasoningbank` and `improvement` are read.
     * @returns {string[]} Zero or more insight messages.
     */
    generateInsights(results) {
        const insights = [];
        const { reasoningbank, improvement } = results;
        // Success rate insights
        if (improvement.successRatePercent > 50) {
            insights.push(`🎯 Excellent improvement: ${improvement.successRateDelta} success rate increase`);
        }
        else if (improvement.successRatePercent < 0) {
            insights.push(`⚠️ Warning: Baseline outperformed ReasoningBank (${improvement.successRateDelta})`);
        }
        // Token efficiency insights
        if (improvement.tokenSavings > 30) {
            insights.push(`💰 Significant token savings: ${improvement.tokenEfficiency} reduction`);
        }
        else if (improvement.tokenSavings < 0) {
            insights.push(`⚠️ Token overhead: ${Math.abs(improvement.tokenSavings).toFixed(1)}% increase`);
        }
        // Latency insights
        if (Math.abs(improvement.latencyDelta) < 500) {
            insights.push(`⚡ Minimal latency overhead: ${Math.abs(improvement.latencyDelta).toFixed(0)}ms`);
        }
        else if (improvement.latencyDelta > 1000) {
            insights.push(`🐌 High latency overhead: ${improvement.latencyDelta.toFixed(0)}ms - consider optimization`);
        }
        // Memory insights
        if (reasoningbank.memoriesUsed && reasoningbank.memoriesUsed > 0) {
            const avgMemoriesPerTask = reasoningbank.memoriesUsed / reasoningbank.totalTasks;
            insights.push(`🧠 Memory utilization: ${avgMemoriesPerTask.toFixed(1)} memories per task`);
        }
        // Learning velocity insights
        if (improvement.learningVelocity && improvement.learningVelocity > 2) {
            insights.push(`🚀 Fast learner: ${improvement.learningVelocity.toFixed(1)}x faster than baseline`);
        }
        return insights;
    }
}
// Export singleton instance
export const metrics = new MetricsCollector();
0 commit comments