# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
RF-DETR Inference Latency Benchmark¶
Measures inference latency for three RF-DETR families across three configs:
| Config | Description |
|---|---|
| FP32 | predict() — unoptimized baseline |
| FP16+JIT | optimize_for_inference(dtype=torch.float16) |
| ONNX | exported .onnx via onnxruntime-gpu (timed with both the CPU and CUDA execution providers) |
1. Install¶
We need onnxruntime-gpu built against CUDA 12 — the default PyPI wheel targets CUDA 11.8 and silently
falls back to CPU on modern GPUs. The Microsoft CUDA-12 package index ships the correct build.
Colab: after running this cell, go to Runtime → Restart session, then run from the next cell.
!pip uninstall -y onnxruntime onnxruntime-gpu
!pip install -q "rfdetr[onnx]" pillow pandas
!pip install -q onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
2. Config¶
WARMUP_RUNS discards the first N inferences — GPU kernels are JIT-compiled on first use, so early timings
are outliers. MEASURE_RUNS then collects the steady-state distribution. 20 + 100 is a reasonable balance
between statistical stability and total wall-clock time per model.
from collections.abc import Callable
from pathlib import Path
from typing import Any, NamedTuple
import numpy as np
import torch
from PIL import Image
# Iterations executed and discarded before timing starts.
WARMUP_RUNS = 20
# Iterations whose latency is recorded.
MEASURE_RUNS = 100

# ONNX exports are written beneath this directory.
EXPORT_DIR = Path("benchmark_output")
EXPORT_DIR.mkdir(exist_ok=True)

# Report the device being timed, or fail fast on a machine without CUDA.
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    raise RuntimeError("This benchmark requires a CUDA GPU.")
import onnxruntime as ort
# Execution providers reported by the installed onnxruntime build.
_ort_providers = ort.get_available_providers()
print(f"ORT {ort.__version__}, providers: {_ort_providers}")
# Abort before any benchmarking when the CUDA provider is not available.
if "CUDAExecutionProvider" not in _ort_providers:
    raise RuntimeError(
        f"onnxruntime-gpu with CUDA support required. Available providers: {_ort_providers}. "
        "Fix: reinstall from the CUDA-12 index (see install cell) then restart runtime."
    )
3. Sample images¶
Latency depends on resolution, not pixel content, so synthetic noise images are equivalent to real photos for benchmarking purposes. Using a fixed seed makes results reproducible across runs.
# Seeded generator so every run benchmarks the same pixel data.
rng = np.random.default_rng(42)
# Ten uint8 noise images of shape (640, 640, 3), wrapped as PIL images.
images: list[Image.Image] = [
    Image.fromarray(rng.integers(0, 256, size=(640, 640, 3), dtype=np.uint8))
    for _ in range(10)
]
print(f"Generated {len(images)} synthetic 640×640 RGB images")
4. Latency helpers¶
GPU kernels execute asynchronously — time.perf_counter() returns before the GPU finishes, giving
misleadingly low numbers. CUDA events are inserted directly into the GPU command stream and timestamped
on the device, so elapsed_time() measures actual kernel execution. torch.cuda.synchronize() after
each run flushes the stream and ensures the event fires before we read it.
class BenchmarkResult(NamedTuple):
    """Single benchmark measurement.

    Attributes:
        label: Display name of the benchmarked configuration.
        mean_ms: Mean latency per call, in milliseconds.
        std_ms: Standard deviation of the latency, in milliseconds.
    """

    label: str
    mean_ms: float
    std_ms: float

    @property
    def fps(self) -> float:
        """Frames per second implied by ``mean_ms``.

        Returns ``inf`` when ``mean_ms`` is zero so that reporting code does not
        fail with ``ZeroDivisionError`` on a degenerate measurement.
        """
        if self.mean_ms == 0:
            return float("inf")
        return 1000.0 / self.mean_ms
def measure_latency_gpu(
    fn: Callable[[], object],
    warmup: int = WARMUP_RUNS,
    runs: int = MEASURE_RUNS,
) -> tuple[float, float]:
    """Return ``(mean_ms, std_ms)`` for ``fn``, timed with CUDA events.

    Args:
        fn: Zero-argument callable that performs one inference.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls; must be at least 1.

    Returns:
        Mean and population standard deviation (NumPy default, ``ddof=0``) of
        the per-call elapsed time in milliseconds.

    Raises:
        ValueError: If ``runs`` is less than 1 (the statistics would be NaN).
    """
    if runs < 1:
        raise ValueError(f"runs must be >= 1, got {runs}")

    for _ in range(warmup):
        fn()
    # Drain outstanding warmup work so it is not attributed to the first timed run.
    torch.cuda.synchronize()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    timings: list[float] = []
    for _ in range(runs):
        start.record()
        fn()
        end.record()
        # Both events must have completed before elapsed_time() can be read.
        torch.cuda.synchronize()
        timings.append(start.elapsed_time(end))

    arr = np.array(timings)
    return float(arr.mean()), float(arr.std())
# Short alias for measure_latency_gpu, used by the per-config benchmark helpers.
_measure = measure_latency_gpu
5. Per-config benchmark functions¶
Three inference paths are compared:
- FP32 — predict() as shipped. Full 32-bit arithmetic on GPU. Baseline.
- FP16+JIT — optimize_for_inference(dtype=torch.float16) fuses layers with torch.jit.script and halves the arithmetic precision. Typically 1.5–2× faster than FP32 with negligible accuracy loss on modern tensor cores.
- ONNX — the model is exported to the Open Neural Network Exchange format and run through ONNX Runtime, bypassing PyTorch entirely. ORT applies its own graph optimisations and can use the TensorRT execution provider for additional speedup. Benchmarked separately on CPU and GPU to show the provider impact.
from rfdetr.export._onnx.inference import _onnx_runtime
def _predict_fp32(model: Any, image: Image.Image) -> BenchmarkResult:
    """Time ``model.predict(image)`` as-is and label the result as the FP32 baseline."""

    def _infer() -> object:
        return model.predict(image)

    mean_ms, std_ms = _measure(_infer)
    return BenchmarkResult(label="predict() FP32", mean_ms=mean_ms, std_ms=std_ms)
def _predict_fp16(model: Any, image: Image.Image) -> BenchmarkResult:
    """FP16+JIT latency — applies and removes optimize_for_inference.

    The optimized model is removed in a ``finally`` block so that a failure
    while timing does not leave ``model`` optimized for whatever runs next.
    """
    model.optimize_for_inference(dtype=torch.float16)
    try:
        mean, std = _measure(lambda: model.predict(image))
    finally:
        model.remove_optimized_model()
    return BenchmarkResult("predict() FP16+JIT", mean, std)
def _export_onnx(model: Any, export_dir: Path) -> Path:
    """Call ``model.export`` with ``export_dir`` as a string and wrap its result in ``Path``."""
    exported = model.export(output_dir=str(export_dir))
    return Path(exported)
6. Model benchmark runner¶
Each model is loaded fresh, exported once, and then each inference config is timed independently.
The ONNX path reuses the same exported file for both CPU and CUDA providers so export cost is not
counted in latency. CUDA ONNX is skipped gracefully when no GPU is available; missing onnxruntime-gpu
raises immediately so misconfigured environments are caught early.
def run_model_benchmark(
    model_cls: type,
    model_name: str,
    images: list[Image.Image],
) -> list[BenchmarkResult]:
    """Run FP32 / FP16 / ONNX benchmarks for one model and print results.

    Args:
        model_cls: Model class; instantiated with no arguments.
        model_name: Display name. Its first whitespace-separated token names the
            export sub-directory under ``EXPORT_DIR``.
        images: Benchmark inputs; only the first image is timed.

    Returns:
        Results in the order FP32, FP16+JIT, then one entry per ONNX provider run.
    """
    print(f"\n{'=' * 62}")
    print(f" {model_name}")
    print("=" * 62)

    model: Any = model_cls()
    export_dir = EXPORT_DIR / model_name.split()[0]
    # parents=True keeps this working even if EXPORT_DIR itself no longer exists.
    export_dir.mkdir(parents=True, exist_ok=True)
    image = images[0]

    fp32 = _predict_fp32(model, image)
    fp16 = _predict_fp16(model, image)
    results: list[BenchmarkResult] = [fp32, fp16]

    # Export once; the same file is timed under each provider configuration.
    onnx_path = _export_onnx(model, export_dir)
    for providers in (["CPUExecutionProvider"], ["CUDAExecutionProvider", "CPUExecutionProvider"]):
        if providers[0] == "CUDAExecutionProvider" and not torch.cuda.is_available():
            print(" ⚠ ONNX (CUDA) skipped — no CUDA GPU")
            continue
        mean_ms, std_ms, label = _onnx_runtime(onnx_path, image, providers, WARMUP_RUNS, MEASURE_RUNS)
        results.append(BenchmarkResult(f"ONNX ({label})", mean_ms, std_ms))

    for r in results:
        print(f" {r.label:<30} {r.mean_ms:6.2f} ms ± {r.std_ms:5.2f} ({r.fps:6.1f} FPS)")

    # Report each ONNX run under its own label. The first ONNX entry is the CPU
    # provider run, so a bare "ONNX" figure would not describe the GPU result.
    onnx_results = [r for r in results if r.label.startswith("ONNX")]
    speedups = [f"FP16 {fp32.mean_ms / fp16.mean_ms:.1f}×"]
    speedups.extend(f"{r.label} {fp32.mean_ms / r.mean_ms:.1f}×" for r in onnx_results)
    print(f" Speedup vs FP32: {' | '.join(speedups)}")
    return results
7. Benchmark loop — detection · segmentation · keypoint¶
Three model families are benchmarked to show how task complexity affects latency. Detection (RFDETRMedium)
outputs boxes only; segmentation (RFDETRSegSmall) additionally predicts per-object masks, which adds
decoder cost; keypoint (RFDETRKeypointPreview) predicts skeleton joints and is typically the lightest
of the three at smaller resolutions.
from rfdetr import RFDETRKeypointPreview, RFDETRMedium, RFDETRSegSmall
# (model class, display name) pairs. The first word of each display name is
# reused as the export sub-directory name and as the summary column label.
MODELS: list[tuple[type, str]] = [
    (RFDETRMedium, "RFDETRMedium — detection"),
    (RFDETRSegSmall, "RFDETRSegSmall — segmentation"),
    (RFDETRKeypointPreview, "RFDETRKeypointPreview — keypoint"),
]
# Maps each display name to the results returned by run_model_benchmark.
all_results: dict[str, list[BenchmarkResult]] = {}
# Filled one model at a time, so results gathered so far survive a later failure.
for _model_cls, _model_name in MODELS:
    all_results[_model_name] = run_model_benchmark(_model_cls, _model_name, images)
8. Summary¶
The table shows FPS (frames per second) for each model × config combination. Higher is better. Compare columns to see which model fits your latency budget; compare rows to choose the right inference backend for your deployment target (Python server, edge CPU, or ONNX Runtime service).
import pandas as pd
# One column per model (first word of its display name), one row per config label.
summary = {
    name.split()[0]: {res.label: round(res.fps, 1) for res in model_results}
    for name, model_results in all_results.items()
}
df = pd.DataFrame(summary).rename_axis("Config \\ Model")
print(df.to_string())
print(f"\nFPS — {MEASURE_RUNS} timed + {WARMUP_RUNS} warmup runs, batch 1, GPU: {torch.cuda.get_device_name(0)}.")