Benchmarking¶
Write and run reliable benchmarks.
Python Benchmarking¶
timeit Module¶
import timeit
# Simple timing: timeit returns the TOTAL seconds for `number` executions.
# Use one constant so the repetition count and the per-call math stay in sync,
# and avoid naming the result `time` (shadows the stdlib module name).
NUMBER = 10000

total = timeit.timeit(
    'sum(range(1000))',
    number=NUMBER,
)
# Per-call average, converted to milliseconds.
print(f"Average: {total / NUMBER * 1000:.3f}ms")

# Compare approaches: identical workload, two expressions of it.
setup = "data = list(range(1000))"
approach_1 = timeit.timeit(
    'sum(data)',
    setup=setup,
    number=NUMBER,
)
approach_2 = timeit.timeit(
    'result = 0\nfor x in data:\n result += x',
    setup=setup,
    number=NUMBER,
)
print(f"sum(): {approach_1:.3f}s")
print(f"loop: {approach_2:.3f}s")
pytest-benchmark¶
# test_benchmark.py
def fibonacci(n):
    """Naive doubly-recursive Fibonacci — the intentionally slow baseline."""
    return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)
def fibonacci_memo(n, memo=None):
    """Memoized Fibonacci.

    The original used a mutable default argument (`memo={}`), which silently
    shares one cache across every call; use None as the sentinel and create a
    fresh dict per top-level call instead. Callers that pass their own `memo`
    are unaffected.
    """
    if memo is None:
        memo = {}
    if n in memo:
        return memo[n]
    if n < 2:
        return n
    memo[n] = fibonacci_memo(n - 1, memo) + fibonacci_memo(n - 2, memo)
    return memo[n]
def test_fibonacci(benchmark):
    """Time the naive implementation via pytest-benchmark's fixture."""
    # benchmark() runs the callable repeatedly and returns one result.
    assert benchmark(fibonacci, 20) == 6765
def test_fibonacci_memo(benchmark):
    """Time the memoized implementation for comparison with the naive one."""
    assert benchmark(fibonacci_memo, 20) == 6765
Output:
Name Min Max Mean StdDev Rounds
test_fibonacci 2.5ms 3.1ms 2.7ms 0.2ms 100
test_fibonacci_memo 1.2us 2.0us 1.5us 0.3us 10000
Rich Benchmark Output¶
import time
from rich.console import Console
from rich.table import Table
def benchmark(name, func, iterations=1000):
    """Run `func` `iterations` times and report min/max/mean in milliseconds."""
    samples = []
    for _ in range(iterations):
        started = time.perf_counter()
        func()
        samples.append(time.perf_counter() - started)
    mean_seconds = sum(samples) / len(samples)
    return {
        "name": name,
        "min": min(samples) * 1000,
        "max": max(samples) * 1000,
        "mean": mean_seconds * 1000,
    }
# Run benchmarks.
# NOTE: functools is never imported in this example, so the reduce-based
# approach would raise NameError as written — import it here.
import functools

results = [
    benchmark("approach_1", lambda: sum(range(1000))),
    benchmark("approach_2", lambda: functools.reduce(lambda a, b: a + b, range(1000))),
]
# Display the collected stats as a Rich table.
console = Console()
table = Table(title="Benchmark Results")
for heading in ("Name", "Min (ms)", "Max (ms)", "Mean (ms)"):
    table.add_column(heading)
for entry in results:
    table.add_row(
        entry["name"],
        f"{entry['min']:.3f}",
        f"{entry['max']:.3f}",
        f"{entry['mean']:.3f}",
    )
console.print(table)
JavaScript Benchmarking¶
Performance API¶
/**
 * Time `fn` `iterations` times with performance.now() and report
 * distribution stats in milliseconds: { name, min, max, mean, p50, p95 }.
 */
function benchmark(name, fn, iterations = 1000) {
  const samples = [];
  for (let run = 0; run < iterations; run++) {
    const startedAt = performance.now();
    fn();
    samples.push(performance.now() - startedAt);
  }
  // Sort ascending so quantiles are positional lookups.
  samples.sort((x, y) => x - y);
  const total = samples.reduce((acc, t) => acc + t, 0);
  const quantile = (q) => samples[Math.floor(samples.length * q)];
  return {
    name,
    min: samples[0],
    max: samples[samples.length - 1],
    mean: total / samples.length,
    p50: quantile(0.5),
    p95: quantile(0.95),
  };
}
// Usage: each iteration builds and sorts 1000 random numbers.
const result = benchmark('array sort', () => {
  const values = Array.from({ length: 1000 }, () => Math.random());
  values.sort((x, y) => x - y);
});
console.table([result]);
Vitest Benchmarking¶
// bench/sort.bench.ts
import { bench, describe } from 'vitest';

describe('sorting', () => {
  // Shared fixture; each bench sorts its own spread-copy so the input
  // stays unsorted between runs.
  const data = Array.from({ length: 1000 }, () => Math.random());

  bench('native sort', () => {
    [...data].sort((a, b) => a - b);
  });

  bench('custom quicksort', () => {
    quicksort([...data]);
  });
});
Benchmarking Best Practices¶
Warm-up¶
def benchmark_with_warmup(func, warmup=100, iterations=1000):
    """Run `func` `warmup` times untimed, then time `iterations` runs.

    The warm-up phase lets JIT compilation, caches, and lazy initialization
    settle before anything is measured. Returns the list of per-call
    durations in seconds.
    """
    for _ in range(warmup):
        func()
    timings = []
    for _ in range(iterations):
        began = time.perf_counter()
        func()
        timings.append(time.perf_counter() - began)
    return timings
Statistical Significance¶
import statistics
def analyze_results(times):
    """Summarize a list of timing samples.

    Returns mean/median/stdev plus the p95/p99 tail latencies. Requires at
    least two samples (statistics.stdev raises StatisticsError otherwise).
    """
    # Sort once and reuse — the original called sorted(times) twice.
    ordered = sorted(times)
    return {
        "mean": statistics.mean(times),
        "median": statistics.median(times),
        "stdev": statistics.stdev(times),
        "p95": ordered[int(len(ordered) * 0.95)],
        "p99": ordered[int(len(ordered) * 0.99)],
    }
def compare_approaches(times_a, times_b):
"""Check if difference is significant."""
from scipy import stats
t_stat, p_value = stats.ttest_ind(times_a, times_b)
return {
"t_statistic": t_stat,
"p_value": p_value,
"significant": p_value < 0.05,
}
Isolate External Factors¶
import gc
def isolated_benchmark(func, iterations=1000):
    """Time `func` with the garbage collector paused.

    Pausing GC keeps collection pauses out of the samples. The previous GC
    state is restored afterwards — the original unconditionally called
    gc.enable() in the finally block, which would turn GC back on even if
    the caller had deliberately disabled it.
    """
    was_enabled = gc.isenabled()
    gc.disable()
    try:
        timings = []
        for _ in range(iterations):
            start = time.perf_counter()
            func()
            timings.append(time.perf_counter() - start)
        return timings
    finally:
        if was_enabled:
            gc.enable()
Avoid Common Pitfalls¶
# Bad: dead code elimination — the result is never consumed, so an
# optimizer or JIT may skip the work and the timing measures nothing.
def benchmark_bad():
    result = expensive_computation()

# Good: consume the result (return it, or use a side effect) so the
# measured work cannot be optimized away.
def benchmark_good():
    return expensive_computation()
# Bad: microbenchmark that doesn't reflect real usage — the body is so
# trivial that loop overhead and cache effects dominate the measurement.
def benchmark_unrealistic():
    for i in range(1000000):
        x = i + 1

# Good: measure the realistic workload end to end.
def benchmark_realistic():
    return process_data(load_test_data())
Continuous Benchmarking¶
GitHub Actions¶
# .github/workflows/benchmark.yml
# Run the benchmark suite on every push to main and on PRs, publish the
# results, and alert when a benchmark regresses past the threshold.
name: Benchmark

on:
  push:
    branches: [main]
  pull_request:

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run benchmarks
        run: |
          pytest tests/benchmark/ --benchmark-json=benchmark.json
      - name: Store benchmark result
        uses: benchmark-action/github-action-benchmark@v1
        with:
          tool: pytest
          output-file-path: benchmark.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          auto-push: true
          # Alert when a benchmark is 1.5x slower than the stored baseline.
          alert-threshold: '150%'
          comment-on-alert: true
Tracking Over Time¶
import json
from datetime import datetime
def save_benchmark_result(name, result):
    """Append one benchmark record to benchmark_history.jsonl.

    Each line is a standalone JSON object tagged with a timestamp and the
    current commit (from the GIT_COMMIT environment variable, "unknown"
    when unset), so history accumulates across runs and can be compared
    per commit.
    """
    # The snippet only imported json/datetime; os.environ needs os too.
    import os

    record = {
        "timestamp": datetime.now().isoformat(),
        "name": name,
        "commit": os.environ.get("GIT_COMMIT", "unknown"),
        **result,
    }
    with open("benchmark_history.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")
When to Benchmark¶
| Scenario | Approach |
|---|---|
| Comparing algorithms | Microbenchmark |
| Optimizing hot path | Profile first, then benchmark |
| Preventing regressions | CI benchmarks |
| Capacity planning | Load testing |
| Real-world performance | APM in production |
Reporting Results¶
## Benchmark Results
### Environment
- CPU: Intel i7-10700 @ 2.90GHz
- RAM: 32GB
- Python: 3.12.0
- OS: Ubuntu 22.04
### Results
| Approach | Mean (ms) | P95 (ms) | P99 (ms) |
|----------|-----------|----------|----------|
| Baseline | 45.2 | 52.1 | 58.3 |
| Optimized | 12.3 | 14.8 | 16.2 |
**Improvement: 3.7x faster**
### Methodology
- 1000 iterations
- 100 warmup iterations
- GC disabled during measurement