Benchmarking¶
Write and run reliable benchmarks.
Python Benchmarking¶
timeit Module¶
import timeit
# Simple timing: timeit returns the TOTAL seconds for `number` executions.
# Use one constant so the repetition count and the per-call math stay in sync,
# and avoid naming the result `time` (shadows the stdlib module name).
NUMBER = 10000

total = timeit.timeit(
    'sum(range(1000))',
    number=NUMBER,
)
# Per-call average, converted to milliseconds.
print(f"Average: {total / NUMBER * 1000:.3f}ms")

# Compare approaches: identical workload, two expressions of it.
setup = "data = list(range(1000))"
approach_1 = timeit.timeit(
    'sum(data)',
    setup=setup,
    number=NUMBER,
)
approach_2 = timeit.timeit(
    'result = 0\nfor x in data:\n result += x',
    setup=setup,
    number=NUMBER,
)
print(f"sum(): {approach_1:.3f}s")
print(f"loop: {approach_2:.3f}s")
pytest-benchmark¶
# test_benchmark.py
def fibonacci(n):
    """Naive doubly-recursive Fibonacci — the intentionally slow baseline."""
    return n if n < 2 else fibonacci(n - 1) + fibonacci(n - 2)
def fibonacci_memo(n, memo=None):
    """Memoized Fibonacci.

    The original used a mutable default argument (`memo={}`), which silently
    shares one cache across every call; use None as the sentinel and create a
    fresh dict per top-level call instead. Callers that pass their own `memo`
    are unaffected.
    """
    if memo is None:
        memo = {}
    if n in memo:
        return memo[n]
    if n < 2:
        return n
    memo[n] = fibonacci_memo(n - 1, memo) + fibonacci_memo(n - 2, memo)
    return memo[n]
def test_fibonacci(benchmark):
    """Time the naive implementation via pytest-benchmark's fixture."""
    # benchmark() runs the callable repeatedly and returns one result.
    assert benchmark(fibonacci, 20) == 6765
def test_fibonacci_memo(benchmark):
    """Time the memoized implementation for comparison with the naive one."""
    assert benchmark(fibonacci_memo, 20) == 6765
Output:
Name Min Max Mean StdDev Rounds
test_fibonacci 2.5ms 3.1ms 2.7ms 0.2ms 100
test_fibonacci_memo 1.2us 2.0us 1.5us 0.3us 10000
Rich Benchmark Output¶
import time
from rich.console import Console
from rich.table import Table
def benchmark(name, func, iterations=1000):
    """Run `func` `iterations` times and report min/max/mean in milliseconds."""
    samples = []
    for _ in range(iterations):
        started = time.perf_counter()
        func()
        samples.append(time.perf_counter() - started)
    mean_seconds = sum(samples) / len(samples)
    return {
        "name": name,
        "min": min(samples) * 1000,
        "max": max(samples) * 1000,
        "mean": mean_seconds * 1000,
    }
# Run benchmarks.
# NOTE: functools is never imported in this example, so the reduce-based
# approach would raise NameError as written — import it here.
import functools

results = [
    benchmark("approach_1", lambda: sum(range(1000))),
    benchmark("approach_2", lambda: functools.reduce(lambda a, b: a + b, range(1000))),
]
# Display the collected stats as a Rich table.
console = Console()
table = Table(title="Benchmark Results")
for heading in ("Name", "Min (ms)", "Max (ms)", "Mean (ms)"):
    table.add_column(heading)
for entry in results:
    table.add_row(
        entry["name"],
        f"{entry['min']:.3f}",
        f"{entry['max']:.3f}",
        f"{entry['mean']:.3f}",
    )
console.print(table)
JavaScript Benchmarking¶
Performance API¶
/**
 * Time `fn` `iterations` times with performance.now() and report
 * distribution stats in milliseconds: { name, min, max, mean, p50, p95 }.
 */
function benchmark(name, fn, iterations = 1000) {
  const samples = [];
  for (let run = 0; run < iterations; run++) {
    const startedAt = performance.now();
    fn();
    samples.push(performance.now() - startedAt);
  }
  // Sort ascending so quantiles are positional lookups.
  samples.sort((x, y) => x - y);
  const total = samples.reduce((acc, t) => acc + t, 0);
  const quantile = (q) => samples[Math.floor(samples.length * q)];
  return {
    name,
    min: samples[0],
    max: samples[samples.length - 1],
    mean: total / samples.length,
    p50: quantile(0.5),
    p95: quantile(0.95),
  };
}
// Usage: each iteration builds and sorts 1000 random numbers.
const result = benchmark('array sort', () => {
  const values = Array.from({ length: 1000 }, () => Math.random());
  values.sort((x, y) => x - y);
});
console.table([result]);
Vitest Benchmarking¶
// bench/sort.bench.ts
import { bench, describe } from 'vitest';

describe('sorting', () => {
  // Shared fixture; each bench sorts its own spread-copy so the input
  // stays unsorted between runs.
  const data = Array.from({ length: 1000 }, () => Math.random());

  bench('native sort', () => {
    [...data].sort((a, b) => a - b);
  });

  bench('custom quicksort', () => {
    quicksort([...data]);
  });
});
Benchmarking Best Practices¶
Warm-up¶
def benchmark_with_warmup(func, warmup=100, iterations=1000):
    """Run `func` `warmup` times untimed, then time `iterations` runs.

    The warm-up phase lets JIT compilation, caches, and lazy initialization
    settle before anything is measured. Returns the list of per-call
    durations in seconds.
    """
    for _ in range(warmup):
        func()
    timings = []
    for _ in range(iterations):
        began = time.perf_counter()
        func()
        timings.append(time.perf_counter() - began)
    return timings
Statistical Significance¶
import statistics
def analyze_results(times):
    """Summarize a list of timing samples.

    Returns mean/median/stdev plus the p95/p99 tail latencies. Requires at
    least two samples (statistics.stdev raises StatisticsError otherwise).
    """
    # Sort once and reuse — the original called sorted(times) twice.
    ordered = sorted(times)
    return {
        "mean": statistics.mean(times),
        "median": statistics.median(times),
        "stdev": statistics.stdev(times),
        "p95": ordered[int(len(ordered) * 0.95)],
        "p99": ordered[int(len(ordered) * 0.99)],
    }
def compare_approaches(times_a, times_b):
"""Check if difference is significant."""
from scipy import stats
t_stat, p_value = stats.ttest_ind(times_a, times_b)
return {
"t_statistic": t_stat,
"p_value": p_value,
"significant": p_value < 0.05,
}
Isolate External Factors¶
import gc
def isolated_benchmark(func, iterations=1000):
    """Time `func` with the garbage collector paused.

    Pausing GC keeps collection pauses out of the samples. The previous GC
    state is restored afterwards — the original unconditionally called
    gc.enable() in the finally block, which would turn GC back on even if
    the caller had deliberately disabled it.
    """
    was_enabled = gc.isenabled()
    gc.disable()
    try:
        timings = []
        for _ in range(iterations):
            start = time.perf_counter()
            func()
            timings.append(time.perf_counter() - start)
        return timings
    finally:
        if was_enabled:
            gc.enable()
Avoid Common Pitfalls¶
# Bad: dead code elimination — the result is never consumed, so an
# optimizer or JIT may skip the work and the timing measures nothing.
def benchmark_bad():
    result = expensive_computation()

# Good: consume the result (return it, or use a side effect) so the
# measured work cannot be optimized away.
def benchmark_good():
    return expensive_computation()
# Bad: microbenchmark that doesn't reflect real usage — the body is so
# trivial that loop overhead and cache effects dominate the measurement.
def benchmark_unrealistic():
    for i in range(1000000):
        x = i + 1

# Good: measure the realistic workload end to end.
def benchmark_realistic():
    return process_data(load_test_data())
Continuous Benchmarking¶
GitHub Actions¶
# .github/workflows/benchmark.yml
# Run the benchmark suite on every push to main and on PRs, publish the
# results, and alert when a benchmark regresses past the threshold.
name: Benchmark

on:
  push:
    branches: [main]
  pull_request:

jobs:
  benchmark:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Run benchmarks
        run: |
          pytest tests/benchmark/ --benchmark-json=benchmark.json
      - name: Store benchmark result
        uses: benchmark-action/github-action-benchmark@v1
        with:
          tool: pytest
          output-file-path: benchmark.json
          github-token: ${{ secrets.GITHUB_TOKEN }}
          auto-push: true
          # Alert when a benchmark is 1.5x slower than the stored baseline.
          alert-threshold: '150%'
          comment-on-alert: true
Tracking Over Time¶
import json
from datetime import datetime
def save_benchmark_result(name, result):
    """Append one benchmark record to benchmark_history.jsonl.

    Each line is a standalone JSON object tagged with a timestamp and the
    current commit (from the GIT_COMMIT environment variable, "unknown"
    when unset), so history accumulates across runs and can be compared
    per commit.
    """
    # The snippet only imported json/datetime; os.environ needs os too.
    import os

    record = {
        "timestamp": datetime.now().isoformat(),
        "name": name,
        "commit": os.environ.get("GIT_COMMIT", "unknown"),
        **result,
    }
    with open("benchmark_history.jsonl", "a") as f:
        f.write(json.dumps(record) + "\n")
When to Benchmark¶
| Scenario | Approach |
|---|---|
| Comparing algorithms | Microbenchmark |
| Optimizing hot path | Profile first, then benchmark |
| Preventing regressions | CI benchmarks |
| Capacity planning | Load testing |
| Real-world performance | APM in production |
Reporting Results¶
## Benchmark Results
### Environment
- CPU: Intel i7-10700 @ 2.90GHz
- RAM: 32GB
- Python: 3.12.0
- OS: Ubuntu 22.04
### Results
| Approach | Mean (ms) | P95 (ms) | P99 (ms) |
|----------|-----------|----------|----------|
| Baseline | 45.2 | 52.1 | 58.3 |
| Optimized | 12.3 | 14.8 | 16.2 |
**Improvement: 3.7x faster**
### Methodology
- 1000 iterations
- 100 warmup iterations
- GC disabled during measurement