In [ ]:

Copied!





# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------
# ------------------------------------------------------------------------
# RF-DETR
# Copyright (c) 2025 Roboflow. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------

RF-DETR Inference Latency Benchmark¶

Measures inference latency for three RF-DETR families across three configs:

Config	Description
FP32	`predict()` — unoptimized baseline
FP16+JIT	`optimize_for_inference(dtype=torch.float16)`
ONNX	exported `.onnx` via `onnxruntime-gpu`, timed with both the CPU and CUDA execution providers

1. Install¶

We need onnxruntime-gpu built against CUDA 12 — the default PyPI wheel targets CUDA 11.8 and silently falls back to CPU on modern GPUs. The Microsoft CUDA-12 package index ships the correct build.

Colab: after running this cell, go to Runtime → Restart session, then run from the next cell.

In [ ]:

Copied!

# Start from a clean slate: remove both ONNX Runtime distributions before reinstalling.
!pip uninstall -y onnxruntime onnxruntime-gpu
# Project package with its ONNX extra, plus the imaging and table libraries used below.
!pip install -q "rfdetr[onnx]" pillow pandas
# Install the GPU build of ONNX Runtime, additionally searching the CUDA-12 package index.
!pip install -q onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
# Start from a clean slate: remove both ONNX Runtime distributions before reinstalling.
!pip uninstall -y onnxruntime onnxruntime-gpu
# Project package with its ONNX extra, plus the imaging and table libraries used below.
!pip install -q "rfdetr[onnx]" pillow pandas
# Install the GPU build of ONNX Runtime, additionally searching the CUDA-12 package index.
!pip install -q onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/

2. Config¶

WARMUP_RUNS discards the first N inferences — GPU kernels are JIT-compiled on first use, so early timings are outliers. MEASURE_RUNS then collects the steady-state distribution. 20 + 100 is a reasonable balance between statistical stability and total wall-clock time per model.

In [ ]:

Copied!





from collections.abc import Callable
from pathlib import Path
from typing import Any, NamedTuple

import numpy as np
import torch
from PIL import Image

# Number of untimed calls made before measurement starts.
WARMUP_RUNS = 20
# Number of timed calls that feed the reported mean and standard deviation.
MEASURE_RUNS = 100
# Root folder for exported model files; created up front if it does not exist yet.
EXPORT_DIR = Path("benchmark_output")
EXPORT_DIR.mkdir(exist_ok=True)

# Fail fast: the timing helper below uses CUDA events, so a GPU is mandatory.
if not torch.cuda.is_available():
    raise RuntimeError("This benchmark requires a CUDA GPU.")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# Imported only after the CUDA check above has passed.
import onnxruntime as ort

# Refuse to continue unless ONNX Runtime exposes its CUDA execution provider.
_ort_providers = ort.get_available_providers()
print(f"ORT {ort.__version__}, providers: {_ort_providers}")
if "CUDAExecutionProvider" not in _ort_providers:
    raise RuntimeError(
        f"onnxruntime-gpu with CUDA support required. Available providers: {_ort_providers}. "
        "Fix: reinstall from the CUDA-12 index (see install cell) then restart runtime."
    )
from collections.abc import Callable
from pathlib import Path
from typing import Any, NamedTuple

import numpy as np
import torch
from PIL import Image

# Number of untimed calls made before measurement starts.
WARMUP_RUNS = 20
# Number of timed calls that feed the reported mean and standard deviation.
MEASURE_RUNS = 100
# Root folder for exported model files; created up front if it does not exist yet.
EXPORT_DIR = Path("benchmark_output")
EXPORT_DIR.mkdir(exist_ok=True)

# Fail fast: the timing helper below uses CUDA events, so a GPU is mandatory.
if not torch.cuda.is_available():
    raise RuntimeError("This benchmark requires a CUDA GPU.")
print(f"GPU: {torch.cuda.get_device_name(0)}")

# Imported only after the CUDA check above has passed.
import onnxruntime as ort

# Refuse to continue unless ONNX Runtime exposes its CUDA execution provider.
_ort_providers = ort.get_available_providers()
print(f"ORT {ort.__version__}, providers: {_ort_providers}")
if "CUDAExecutionProvider" not in _ort_providers:
    raise RuntimeError(
        f"onnxruntime-gpu with CUDA support required. Available providers: {_ort_providers}. "
        "Fix: reinstall from the CUDA-12 index (see install cell) then restart runtime."
    )

3. Sample images¶

Latency depends on resolution, not pixel content, so synthetic noise images are equivalent to real photos for benchmarking purposes. Using a fixed seed makes results reproducible across runs.

In [ ]:

Copied!

# Seeded generator so the same pixel data is produced on every run.
rng = np.random.default_rng(42)
# Ten 640x640x3 arrays of uniform random uint8 values (0-255), each wrapped as a PIL image.
images: list[Image.Image] = [Image.fromarray(rng.integers(0, 256, (640, 640, 3), dtype=np.uint8)) for _ in range(10)]
print(f"Generated {len(images)} synthetic 640×640 RGB images")
# Seeded generator so the same pixel data is produced on every run.
rng = np.random.default_rng(42)
# Ten 640x640x3 arrays of uniform random uint8 values (0-255), each wrapped as a PIL image.
images: list[Image.Image] = [Image.fromarray(rng.integers(0, 256, (640, 640, 3), dtype=np.uint8)) for _ in range(10)]
print(f"Generated {len(images)} synthetic 640×640 RGB images")

4. Latency helpers¶

GPU kernels execute asynchronously — time.perf_counter() can return before the GPU finishes, giving misleadingly low numbers. CUDA events are inserted directly into the GPU command stream and timestamped on the device, so elapsed_time() reports the time between the two events on the device timeline, in milliseconds. torch.cuda.synchronize() after each run waits for the stream to drain, which guarantees both events have completed before we read them.

In [ ]:

Copied!





class BenchmarkResult(NamedTuple):
    """One latency measurement for a single inference configuration.

    Attributes:
        label: Human-readable name of the configuration that was timed.
        mean_ms: Mean latency per call, in milliseconds.
        std_ms: Standard deviation of the per-call latency, in milliseconds.
    """

    label: str
    mean_ms: float
    std_ms: float

    @property
    def fps(self) -> float:
        """Return throughput in frames per second (``1000 / mean_ms``).

        A zero mean latency yields ``inf`` instead of raising
        ``ZeroDivisionError``, so report printing does not abort on a
        degenerate measurement.
        """
        if self.mean_ms == 0:
            return float("inf")
        return 1000.0 / self.mean_ms


def measure_latency_gpu(
    fn: Callable[[], object],
    warmup: int = WARMUP_RUNS,
    runs: int = MEASURE_RUNS,
) -> tuple[float, float]:
    """Time ``fn`` with CUDA events and summarise the per-call latency.

    Args:
        fn: Zero-argument callable to benchmark; its return value is ignored.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls.

    Returns:
        ``(mean_ms, std_ms)`` over the timed calls, in milliseconds. The
        standard deviation is NumPy's default (population, ``ddof=0``).
    """
    # Untimed warm-up calls, then drain the device before timing starts.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()

    # One event pair is reused for every timed call.
    tic = torch.cuda.Event(enable_timing=True)
    toc = torch.cuda.Event(enable_timing=True)

    def _timed_call() -> float:
        tic.record()
        fn()
        toc.record()
        # Block until the device is idle so both events have completed.
        torch.cuda.synchronize()
        return tic.elapsed_time(toc)

    samples = np.array([_timed_call() for _ in range(runs)])
    return float(np.mean(samples)), float(np.std(samples))


# Short private alias used by the per-config benchmark helpers.
_measure = measure_latency_gpu
class BenchmarkResult(NamedTuple):
    """One latency measurement for a single inference configuration.

    Attributes:
        label: Human-readable name of the configuration that was timed.
        mean_ms: Mean latency per call, in milliseconds.
        std_ms: Standard deviation of the per-call latency, in milliseconds.
    """

    label: str
    mean_ms: float
    std_ms: float

    @property
    def fps(self) -> float:
        """Return throughput in frames per second (``1000 / mean_ms``).

        A zero mean latency yields ``inf`` instead of raising
        ``ZeroDivisionError``, so report printing does not abort on a
        degenerate measurement.
        """
        if self.mean_ms == 0:
            return float("inf")
        return 1000.0 / self.mean_ms


def measure_latency_gpu(
    fn: Callable[[], object],
    warmup: int = WARMUP_RUNS,
    runs: int = MEASURE_RUNS,
) -> tuple[float, float]:
    """Time ``fn`` with CUDA events and summarise the per-call latency.

    Args:
        fn: Zero-argument callable to benchmark; its return value is ignored.
        warmup: Number of untimed calls made before measuring.
        runs: Number of timed calls.

    Returns:
        ``(mean_ms, std_ms)`` over the timed calls, in milliseconds. The
        standard deviation is NumPy's default (population, ``ddof=0``).
    """
    # Untimed warm-up calls, then drain the device before timing starts.
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()

    # One event pair is reused for every timed call.
    tic = torch.cuda.Event(enable_timing=True)
    toc = torch.cuda.Event(enable_timing=True)

    def _timed_call() -> float:
        tic.record()
        fn()
        toc.record()
        # Block until the device is idle so both events have completed.
        torch.cuda.synchronize()
        return tic.elapsed_time(toc)

    samples = np.array([_timed_call() for _ in range(runs)])
    return float(np.mean(samples)), float(np.std(samples))


# Short private alias used by the per-config benchmark helpers.
_measure = measure_latency_gpu

5. Per-config benchmark functions¶

Three inference paths are compared:

FP32 — predict() as shipped. Full 32-bit arithmetic on GPU. Baseline.
FP16+JIT — optimize_for_inference(dtype=torch.float16) fuses layers with torch.jit.script and halves the arithmetic precision. Typically 1.5–2× faster than FP32 with negligible accuracy loss on modern tensor cores.
ONNX — the model is exported to the Open Neural Network Exchange format and run through ONNX Runtime, bypassing PyTorch entirely. ORT applies its own graph optimisations and can use the TensorRT execution provider for additional speedup. Benchmarked separately on CPU and GPU to show the provider impact.

In [ ]:

Copied!





from rfdetr.export._onnx.inference import _onnx_runtime


def _predict_fp32(model: Any, image: Image.Image) -> BenchmarkResult:
    """Time ``model.predict(image)`` on the model as passed in.

    Args:
        model: Object exposing ``predict(image)``.
        image: Input handed to every ``predict`` call.

    Returns:
        Result labelled ``"predict() FP32"``.
    """

    def _run_once() -> object:
        return model.predict(image)

    mean_ms, std_ms = _measure(_run_once)
    return BenchmarkResult(label="predict() FP32", mean_ms=mean_ms, std_ms=std_ms)


def _predict_fp16(model: Any, image: Image.Image) -> BenchmarkResult:
    """Time ``model.predict(image)`` with FP16 inference optimization active.

    Calls ``model.optimize_for_inference(dtype=torch.float16)``, times
    ``predict``, then calls ``model.remove_optimized_model()``.

    Args:
        model: Object exposing ``optimize_for_inference``, ``predict`` and
            ``remove_optimized_model``.
        image: Input handed to every ``predict`` call.

    Returns:
        Result labelled ``"predict() FP16+JIT"``.
    """
    model.optimize_for_inference(dtype=torch.float16)
    try:
        mean, std = _measure(lambda: model.predict(image))
    finally:
        # Always undo the optimization, even if timing raised, so later
        # steps (e.g. the ONNX export) never see a leftover optimized model.
        model.remove_optimized_model()
    return BenchmarkResult("predict() FP16+JIT", mean, std)


def _export_onnx(model: Any, export_dir: Path) -> Path:
    """Export model to ONNX and return the path."""
    return Path(model.export(output_dir=str(export_dir)))
from rfdetr.export._onnx.inference import _onnx_runtime


def _predict_fp32(model: Any, image: Image.Image) -> BenchmarkResult:
    """Time ``model.predict(image)`` on the model as passed in.

    Args:
        model: Object exposing ``predict(image)``.
        image: Input handed to every ``predict`` call.

    Returns:
        Result labelled ``"predict() FP32"``.
    """

    def _run_once() -> object:
        return model.predict(image)

    mean_ms, std_ms = _measure(_run_once)
    return BenchmarkResult(label="predict() FP32", mean_ms=mean_ms, std_ms=std_ms)


def _predict_fp16(model: Any, image: Image.Image) -> BenchmarkResult:
    """Time ``model.predict(image)`` with FP16 inference optimization active.

    Calls ``model.optimize_for_inference(dtype=torch.float16)``, times
    ``predict``, then calls ``model.remove_optimized_model()``.

    Args:
        model: Object exposing ``optimize_for_inference``, ``predict`` and
            ``remove_optimized_model``.
        image: Input handed to every ``predict`` call.

    Returns:
        Result labelled ``"predict() FP16+JIT"``.
    """
    model.optimize_for_inference(dtype=torch.float16)
    try:
        mean, std = _measure(lambda: model.predict(image))
    finally:
        # Always undo the optimization, even if timing raised, so later
        # steps (e.g. the ONNX export) never see a leftover optimized model.
        model.remove_optimized_model()
    return BenchmarkResult("predict() FP16+JIT", mean, std)


def _export_onnx(model: Any, export_dir: Path) -> Path:
    """Export model to ONNX and return the path."""
    return Path(model.export(output_dir=str(export_dir)))

6. Model benchmark runner¶

Each model is loaded fresh, exported once, and then each inference config is timed independently. The ONNX path reuses the same exported file for both CPU and CUDA providers so export cost is not counted in latency. CUDA ONNX is skipped gracefully when no GPU is available; missing onnxruntime-gpu raises immediately so misconfigured environments are caught early.

In [ ]:

Copied!





def run_model_benchmark(
    model_cls: type,
    model_name: str,
    images: list[Image.Image],
) -> list[BenchmarkResult]:
    """Benchmark one model under every inference config and print a report.

    Args:
        model_cls: Model class; instantiated with no arguments.
        model_name: Display name. Its first whitespace-separated token names
            the export sub-directory under ``EXPORT_DIR``.
        images: Candidate inputs; only the first image is timed.

    Returns:
        Results in measurement order: FP32, FP16+JIT, then one entry per
        ONNX provider list that was run.
    """
    banner = "=" * 62
    print(f"\n{banner}")
    print(f"  {model_name}")
    print(banner)

    model: Any = model_cls()
    export_dir = EXPORT_DIR / model_name.split()[0]
    # parents=True keeps this working even if EXPORT_DIR itself is missing.
    export_dir.mkdir(parents=True, exist_ok=True)
    image = images[0]

    fp32 = _predict_fp32(model, image)
    fp16 = _predict_fp16(model, image)
    results: list[BenchmarkResult] = [fp32, fp16]

    # Export once and reuse the file for every provider list, so export time
    # is never part of a measured latency.
    onnx_path = _export_onnx(model, export_dir)
    onnx_results: list[BenchmarkResult] = []
    for providers in (["CPUExecutionProvider"], ["CUDAExecutionProvider", "CPUExecutionProvider"]):
        if providers[0] == "CUDAExecutionProvider" and not torch.cuda.is_available():
            print("  ⚠ ONNX (CUDA) skipped — no CUDA GPU")
            continue
        mean_ms, std_ms, label = _onnx_runtime(onnx_path, image, providers, WARMUP_RUNS, MEASURE_RUNS)
        onnx_results.append(BenchmarkResult(f"ONNX ({label})", mean_ms, std_ms))
    results.extend(onnx_results)

    for r in results:
        print(f"  {r.label:<30}  {r.mean_ms:6.2f} ms ± {r.std_ms:5.2f}   ({r.fps:6.1f} FPS)")

    # Report every ONNX variant under its own label. The first ONNX entry is
    # the CPU run, so a single unlabelled "ONNX" figure would be misleading.
    speedups = [f"FP16 {fp32.mean_ms / fp16.mean_ms:.1f}×"]
    speedups.extend(f"{r.label} {fp32.mean_ms / r.mean_ms:.1f}×" for r in onnx_results)
    print(f"  Speedup vs FP32: {' | '.join(speedups)}")

    return results
def run_model_benchmark(
    model_cls: type,
    model_name: str,
    images: list[Image.Image],
) -> list[BenchmarkResult]:
    """Benchmark one model under every inference config and print a report.

    Args:
        model_cls: Model class; instantiated with no arguments.
        model_name: Display name. Its first whitespace-separated token names
            the export sub-directory under ``EXPORT_DIR``.
        images: Candidate inputs; only the first image is timed.

    Returns:
        Results in measurement order: FP32, FP16+JIT, then one entry per
        ONNX provider list that was run.
    """
    banner = "=" * 62
    print(f"\n{banner}")
    print(f"  {model_name}")
    print(banner)

    model: Any = model_cls()
    export_dir = EXPORT_DIR / model_name.split()[0]
    # parents=True keeps this working even if EXPORT_DIR itself is missing.
    export_dir.mkdir(parents=True, exist_ok=True)
    image = images[0]

    fp32 = _predict_fp32(model, image)
    fp16 = _predict_fp16(model, image)
    results: list[BenchmarkResult] = [fp32, fp16]

    # Export once and reuse the file for every provider list, so export time
    # is never part of a measured latency.
    onnx_path = _export_onnx(model, export_dir)
    onnx_results: list[BenchmarkResult] = []
    for providers in (["CPUExecutionProvider"], ["CUDAExecutionProvider", "CPUExecutionProvider"]):
        if providers[0] == "CUDAExecutionProvider" and not torch.cuda.is_available():
            print("  ⚠ ONNX (CUDA) skipped — no CUDA GPU")
            continue
        mean_ms, std_ms, label = _onnx_runtime(onnx_path, image, providers, WARMUP_RUNS, MEASURE_RUNS)
        onnx_results.append(BenchmarkResult(f"ONNX ({label})", mean_ms, std_ms))
    results.extend(onnx_results)

    for r in results:
        print(f"  {r.label:<30}  {r.mean_ms:6.2f} ms ± {r.std_ms:5.2f}   ({r.fps:6.1f} FPS)")

    # Report every ONNX variant under its own label. The first ONNX entry is
    # the CPU run, so a single unlabelled "ONNX" figure would be misleading.
    speedups = [f"FP16 {fp32.mean_ms / fp16.mean_ms:.1f}×"]
    speedups.extend(f"{r.label} {fp32.mean_ms / r.mean_ms:.1f}×" for r in onnx_results)
    print(f"  Speedup vs FP32: {' | '.join(speedups)}")

    return results

7. Benchmark loop — detection · segmentation · keypoint¶

Three model families are benchmarked to show how task complexity affects latency. Detection (RFDETRMedium) outputs boxes only; segmentation (RFDETRSegSmall) additionally predicts per-object masks, which adds decoder cost; keypoint (RFDETRKeypointPreview) predicts skeleton joints and is typically the lightest of the three at smaller resolutions.

In [ ]:

Copied!





from rfdetr import RFDETRKeypointPreview, RFDETRMedium, RFDETRSegSmall

# (model class, display name) pairs. The first word of each display name is
# reused as the export sub-directory and as the summary table key.
MODELS: list[tuple[type, str]] = [
    (RFDETRMedium, "RFDETRMedium — detection"),
    (RFDETRSegSmall, "RFDETRSegSmall — segmentation"),
    (RFDETRKeypointPreview, "RFDETRKeypointPreview — keypoint"),
]

# Display name -> results returned by run_model_benchmark for that model.
all_results: dict[str, list[BenchmarkResult]] = {}
# Underscore-prefixed loop variables mark these module-level names as throwaway.
for _model_cls, _model_name in MODELS:
    all_results[_model_name] = run_model_benchmark(_model_cls, _model_name, images)
from rfdetr import RFDETRKeypointPreview, RFDETRMedium, RFDETRSegSmall

# (model class, display name) pairs. The first word of each display name is
# reused as the export sub-directory and as the summary table key.
MODELS: list[tuple[type, str]] = [
    (RFDETRMedium, "RFDETRMedium — detection"),
    (RFDETRSegSmall, "RFDETRSegSmall — segmentation"),
    (RFDETRKeypointPreview, "RFDETRKeypointPreview — keypoint"),
]

# Display name -> results returned by run_model_benchmark for that model.
all_results: dict[str, list[BenchmarkResult]] = {}
# Underscore-prefixed loop variables mark these module-level names as throwaway.
for _model_cls, _model_name in MODELS:
    all_results[_model_name] = run_model_benchmark(_model_cls, _model_name, images)

8. Summary¶

The table shows FPS (frames per second) for each model × config combination. Higher is better. Compare columns to see which model fits your latency budget; compare rows to choose the right inference backend for your deployment target (Python server, edge CPU, or ONNX Runtime service).

In [ ]:

Copied!





import pandas as pd

# Nested mapping {short model name: {config label: FPS rounded to one decimal}}.
summary = {
    model_name.split()[0]: {r.label: round(r.fps, 1) for r in results} for model_name, results in all_results.items()
}
# Outer keys become columns (models); inner keys become the row index (configs).
df = pd.DataFrame(summary)
df.index.name = "Config \\ Model"
print(df.to_string())
# Footer recording the measurement settings and the GPU the numbers came from.
print(f"\nFPS — {MEASURE_RUNS} timed + {WARMUP_RUNS} warmup runs, batch 1, GPU: {torch.cuda.get_device_name(0)}.")
import pandas as pd

# Nested mapping {short model name: {config label: FPS rounded to one decimal}}.
summary = {
    model_name.split()[0]: {r.label: round(r.fps, 1) for r in results} for model_name, results in all_results.items()
}
# Outer keys become columns (models); inner keys become the row index (configs).
df = pd.DataFrame(summary)
df.index.name = "Config \\ Model"
print(df.to_string())
# Footer recording the measurement settings and the GPU the numbers came from.
print(f"\nFPS — {MEASURE_RUNS} timed + {WARMUP_RUNS} warmup runs, batch 1, GPU: {torch.cuda.get_device_name(0)}.")