Skip to content

Run an RF-DETR Object Detection Model

RF-DETR is a real-time transformer architecture for object detection, built on a DINOv2 vision transformer backbone. The base models are trained on the Microsoft COCO dataset and achieve state-of-the-art accuracy and latency trade-offs.

Pre-trained Checkpoints

RF-DETR offers model sizes from Nano to 2XLarge, allowing trade-offs between accuracy, latency, and parameter count. All latency numbers were measured on an NVIDIA T4 using TensorRT, FP16, and batch size 1. Core models (Nano to Large) are licensed under Apache 2.0, while XLarge and 2XLarge use the Platform Model License 1.0 (PML 1.0) and require a Roboflow account.

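| Size | RF-DETR package class | Inference package alias | COCO AP50 | COCO AP50:95 | Latency (ms) | Params (M) | Resolution | License |
|------|-----------------------|-------------------------|-----------|--------------|--------------|------------|------------|---------|
| N | RFDETRNano | rfdetr-nano | 67.6 | 48.4 | 2.3 | 30.5 | 384x384 | Apache 2.0 |
| S | RFDETRSmall | rfdetr-small | 72.1 | 53.0 | 3.5 | 32.1 | 512x512 | Apache 2.0 |
| M | RFDETRMedium | rfdetr-medium | 73.6 | 54.7 | 4.4 | 33.7 | 576x576 | Apache 2.0 |
| L | RFDETRLarge | rfdetr-large | 75.1 | 56.5 | 6.8 | 33.9 | 704x704 | Apache 2.0 |
| XL | RFDETRXLarge | rfdetr-xlarge | 77.4 | 58.6 | 11.5 | 126.4 | 700x700 | PML 1.0 |
| 2XL | RFDETR2XLarge | rfdetr-2xlarge | 78.5 | 60.1 | 17.2 | 126.9 | 880x880 | PML 1.0 |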

Run on an Image

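Perform inference on an image using either the rfdetr package or the inference package. The first example below uses the rfdetr package; the second uses the inference package. To use a different model size, select the corresponding class (rfdetr) or alias (inference) from the table above.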

import requests
import supervision as sv
from PIL import Image
from rfdetr import RFDETRMedium
from rfdetr.util.coco_classes import COCO_CLASSES

# Instantiate the medium-size RF-DETR model.
model = RFDETRMedium()

# Download the sample image. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() reports an HTTP error directly
# instead of letting PIL fail later while trying to decode an error body.
response = requests.get('https://media.roboflow.com/dog.jpg', stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Run detection with a confidence threshold of 0.5.
detections = model.predict(image, threshold=0.5)

# Look up the COCO class name for every detected class id.
labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

# Draw the bounding boxes first, then the class-name labels on top of them.
annotated_image = sv.BoxAnnotator().annotate(image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections, labels)
import requests
import supervision as sv
from PIL import Image
from inference import get_model

# Load RF-DETR Medium through the inference package by its alias.
model = get_model("rfdetr-medium")

# Download the sample image. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() reports an HTTP error directly
# instead of letting PIL fail later while trying to decode an error body.
response = requests.get('https://media.roboflow.com/dog.jpg', stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# infer() returns a sequence of results; take the first (and only) entry for
# the single input image, then convert it to a supervision Detections object.
predictions = model.infer(image, confidence=0.5)[0]
detections = sv.Detections.from_inference(predictions)

# Draw the bounding boxes first, then the labels on top of them.
annotated_image = sv.BoxAnnotator().annotate(image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections)

Run on video, webcam, or RTSP stream

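These examples use OpenCV for decoding and display; they cover, in order, a video file, a webcam, and an RTSP stream. Replace <SOURCE_VIDEO_PATH>, <WEBCAM_INDEX>, and <RTSP_STREAM_URL> with your inputs. <WEBCAM_INDEX> is an integer and is usually 0 for the default camera. Each example runs until the source stops delivering frames or you press q in the display window.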

import cv2
import supervision as sv
from rfdetr import RFDETRMedium
from rfdetr.util.coco_classes import COCO_CLASSES

# Instantiate the medium-size RF-DETR model.
model = RFDETRMedium()

# Create the annotators once instead of rebuilding them for every frame.
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<SOURCE_VIDEO_PATH>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open video source: <SOURCE_VIDEO_PATH>")

try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # No more frames (end of file) or the read failed.
            break

        # OpenCV delivers BGR frames; the model is given an RGB copy.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        # Look up the COCO class name for every detected class id.
        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Draw on the BGR frame because that is what cv2.imshow displays.
        annotated_frame = box_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR Video", annotated_frame)
        # Pressing "q" in the display window stops playback early.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the capture and close the window even if inference or
    # annotation raised part-way through the loop.
    video_capture.release()
    cv2.destroyAllWindows()
import cv2
import supervision as sv
from rfdetr import RFDETRMedium
from rfdetr.util.coco_classes import COCO_CLASSES

model = RFDETRMedium()

video_capture = cv2.VideoCapture(<WEBCAM_INDEX>)
if not video_capture.isOpened():
    raise RuntimeError("Failed to open webcam: <WEBCAM_INDEX>")

while True:
    success, frame_bgr = video_capture.read()
    if not success:
        break

    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    detections = model.predict(frame_rgb, threshold=0.5)

    labels = [
        COCO_CLASSES[class_id]
        for class_id in detections.class_id
    ]

    annotated_frame = sv.BoxAnnotator().annotate(frame_bgr, detections)
    annotated_frame = sv.LabelAnnotator().annotate(annotated_frame, detections, labels)

    cv2.imshow("RF-DETR Webcam", annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

video_capture.release()
cv2.destroyAllWindows()
import cv2
import supervision as sv
from rfdetr import RFDETRMedium
from rfdetr.util.coco_classes import COCO_CLASSES

# Instantiate the medium-size RF-DETR model.
model = RFDETRMedium()

# Create the annotators once instead of rebuilding them for every frame.
box_annotator = sv.BoxAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<RTSP_STREAM_URL>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open RTSP stream: <RTSP_STREAM_URL>")

try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # The stream ended or a frame could not be read.
            break

        # OpenCV delivers BGR frames; the model is given an RGB copy.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        # Look up the COCO class name for every detected class id.
        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Draw on the BGR frame because that is what cv2.imshow displays.
        annotated_frame = box_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR RTSP", annotated_frame)
        # Pressing "q" in the display window stops the stream loop.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the stream and close the window even if inference or
    # annotation raised part-way through the loop.
    video_capture.release()
    cv2.destroyAllWindows()