perf: Add IQR outlier removal and median to benchmark statistics

krodak · krodak · commit 67555812ef76 · 2026-04-21T11:55:19.000+02:00
Benchmark results can be noisy due to GC timing and V8 JIT compilation.
IQR filtering discards values outside the range
[Q1 - 1.5*IQR, Q3 + 1.5*IQR] before computing statistics, and the median
of the retained samples is reported alongside the average. The Samples
column shows the retained count (e.g. '4 (-1)' means 4 kept, 1
discarded). Falls back to the full dataset if there are fewer than 4
samples.

Applies to all benchmarks, not just identity mode.
diff --git a/Benchmarks/run.js b/Benchmarks/run.js
@@ -2,7 +2,7 @@ import { instantiate } from "./.build/plugins/PackageToJS/outputs/Package/instan
 import { defaultNodeSetup } from "./.build/plugins/PackageToJS/outputs/Package/platforms/node.js"
 import fs from 'fs';
 import path from 'path';
-import { parseArgs } from "util"
+import { parseArgs } from "util";
 import { parseIdentityModes, parseIdentityReusePools, runIdentityModeBenchmarks, summarizeIdentityMemory } from "./lib/identity-benchmarks.js"
 import { APIResultValues as APIResult, ComplexResultValues as ComplexResult } from "./.build/plugins/PackageToJS/outputs/Package/bridge-js.js";
 
@@ -63,17 +63,44 @@ function createNameFilter(arg) {
  * @returns {number} Coefficient of variation as a percentage
  */
 function calculateCV(values) {
-    if (values.length < 2) return 0;
-
-    const sum = values.reduce((a, b) => a + b, 0);
-    const mean = sum / values.length;
-
-    if (mean === 0) return 0;
+    if (values.length < 2) return 0
+    const filtered = removeOutliers(values)
+    const sum = filtered.reduce((a, b) => a + b, 0)
+    const mean = sum / filtered.length
+    if (mean === 0) return 0
+    const variance = filtered.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / filtered.length
+    const stdDev = Math.sqrt(variance)
+    return (stdDev / mean) * 100
+}
 
-    const variance = values.reduce((a, b) => a + Math.pow(b - mean, 2), 0) / values.length;
-    const stdDev = Math.sqrt(variance);
+/**
+ * Remove outliers using the IQR (interquartile range) method.
+ * Discards values below Q1-1.5*IQR or above Q3+1.5*IQR.
+ * Returns the filtered array (or the original if too few samples).
+ * @param {Array<number>} values - Array of measurement values
+ * @returns {Array<number>} Values with outliers removed
+ */
+function removeOutliers(values) {
+    if (values.length < 4) return values
+    const sorted = [...values].sort((a, b) => a - b)
+    const q1 = sorted[Math.floor(sorted.length * 0.25)]
+    const q3 = sorted[Math.floor(sorted.length * 0.75)]
+    const iqr = q3 - q1
+    const lower = q1 - 1.5 * iqr
+    const upper = q3 + 1.5 * iqr
+    const filtered = values.filter(v => v >= lower && v <= upper)
+    return filtered.length > 0 ? filtered : values
+}
 
-    return (stdDev / mean) * 100; // Return as percentage
+/**
+ * Calculate the median of an array of numbers
+ * @param {Array<number>} values - Array of measurement values
+ * @returns {number} Median value
+ */
+function median(values) {
+    const sorted = [...values].sort((a, b) => a - b)
+    const mid = Math.floor(sorted.length / 2)
+    return sorted.length % 2 !== 0 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2
 }
 
 /**
@@ -86,33 +113,38 @@ function calculateStatistics(results) {
     const consoleTable = [];
 
     for (const [name, times] of Object.entries(results)) {
-        const sum = times.reduce((a, b) => a + b, 0);
-        const avg = sum / times.length;
-        const min = Math.min(...times);
-        const max = Math.max(...times);
-        const variance = times.reduce((a, b) => a + Math.pow(b - avg, 2), 0) / times.length;
-        const stdDev = Math.sqrt(variance);
-        const cv = (stdDev / avg) * 100; // Coefficient of variation as percentage
+        const filtered = removeOutliers(times)
+        const sum = filtered.reduce((a, b) => a + b, 0)
+        const avg = sum / filtered.length
+        const med = median(filtered)
+        const min = Math.min(...filtered)
+        const max = Math.max(...filtered)
+        const variance = filtered.reduce((a, b) => a + Math.pow(b - avg, 2), 0) / filtered.length
+        const stdDev = Math.sqrt(variance)
+        const cv = (stdDev / avg) * 100
+        const outliers = times.length - filtered.length
 
         formattedResults[name] = {
             "avg_ms": parseFloat(avg.toFixed(2)),
+            "median_ms": parseFloat(med.toFixed(2)),
             "min_ms": parseFloat(min.toFixed(2)),
             "max_ms": parseFloat(max.toFixed(2)),
             "stdDev_ms": parseFloat(stdDev.toFixed(2)),
             "cv_percent": parseFloat(cv.toFixed(2)),
-            "samples": times.length,
+            "samples": filtered.length,
+            "outliers_removed": outliers,
             "rawTimes_ms": times.map(t => parseFloat(t.toFixed(2)))
-        };
+        }
 
         consoleTable.push({
             Test: name,
+            'Median (ms)': med.toFixed(2),
             'Avg (ms)': avg.toFixed(2),
             'Min (ms)': min.toFixed(2),
             'Max (ms)': max.toFixed(2),
-            'StdDev (ms)': stdDev.toFixed(2),
             'CV (%)': cv.toFixed(2),
-            'Samples': times.length
-        });
+            'Samples': filtered.length + (outliers > 0 ? ` (-${outliers})` : '')
+        })
     }
 
     return { formattedResults, consoleTable };
@@ -283,7 +315,7 @@ async function singleRun(results, nameFilter, iterations, identityConfig) {
             return;
         }
         // Warmup to reduce JIT/IC noise.
-        body();
+        body()
         if (typeof globalThis.gc === "function") {
             globalThis.gc();
         }
@@ -900,7 +932,7 @@ async function runUntilStable(results, options, width, nameFilter, filterArg, it
         // Update progress with estimated completion
         updateProgress(runs, maxRuns, "Benchmark Progress:", width);
 
-        await singleRun(results, nameFilter, iterations, identityConfig)
+        await singleRun(results, nameFilter, iterations, identityConfig);
         runs++;
 
         if (runs === 1 && Object.keys(results).length === 0) {
@@ -973,7 +1005,7 @@ Options:
   --filter=PATTERN      Filter benchmarks by name (substring or /regex/flags)
   --identity-mode=MODE  Identity benchmarks: off, none, pointer, both (default: off)
   --identity-iterations=N  Iterations for identity benchmarks (default: 1000000)
-  --identity-reuse-pools=N,N  Pool sizes for reuse scenarios (default: 1)
+  --identity-reuse-pools=N,N  Pool sizes for reuse scenarios (default: 1,8,64)
   --identity-memory     Enable memory profiling for identity benchmarks
   --help                Show this help message
 `);