Skip to content

Run an RF-DETR Instance Segmentation Model

RF-DETR is a real-time transformer architecture for instance segmentation, built on a DINOv2 vision transformer backbone. The base models are trained on the Microsoft COCO dataset and offer a strong trade-off between accuracy and latency.

Pre-trained Checkpoints

RF-DETR-Seg offers model sizes from Nano to 2XLarge, allowing trade-offs between accuracy, latency, and parameter count. All latency numbers were measured on an NVIDIA T4 using TensorRT, FP16, and batch size 1.

| Size | RF-DETR package class | Inference package alias | COCO AP50 | COCO AP50:95 | Latency (ms) | Params (M) | Resolution | License |
|------|-----------------------|-------------------------|-----------|--------------|--------------|------------|------------|---------|
| N | RFDETRSegNano | rfdetr-seg-nano | 63.0 | 40.3 | 3.4 | 33.6 | 312x312 | Apache 2.0 |
| S | RFDETRSegSmall | rfdetr-seg-small | 66.2 | 43.1 | 4.4 | 33.7 | 384x384 | Apache 2.0 |
| M | RFDETRSegMedium | rfdetr-seg-medium | 68.4 | 45.3 | 5.9 | 35.7 | 432x432 | Apache 2.0 |
| L | RFDETRSegLarge | rfdetr-seg-large | 70.5 | 47.1 | 8.8 | 36.2 | 504x504 | Apache 2.0 |
| XL | RFDETRSegXLarge | rfdetr-seg-xlarge | 72.2 | 48.8 | 13.5 | 38.1 | 624x624 | Apache 2.0 |
| 2XL | RFDETRSeg2XLarge | rfdetr-seg-2xlarge | 73.1 | 49.9 | 21.8 | 38.6 | 768x768 | Apache 2.0 |

Run on an Image

Perform inference on an image using either the rfdetr package or the inference package; the first snippet below uses rfdetr and the second uses inference. To use a different model size, select the corresponding class or alias from the table above.

# Run RF-DETR-Seg on a single image with the `rfdetr` package.
import requests
import supervision as sv
from PIL import Image
from rfdetr import RFDETRSegMedium
from rfdetr.util.coco_classes import COCO_CLASSES

model = RFDETRSegMedium()

# Bound the download with a timeout and fail on HTTP errors, so a bad
# response is reported as such instead of as an image-decoding error.
response = requests.get('https://media.roboflow.com/dog.jpg', stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Run inference with a 0.5 threshold.
detections = model.predict(image, threshold=0.5)

# Look up the COCO_CLASSES entry for each predicted class id.
labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

# Draw the masks first, then the text labels on top of the masked image.
annotated_image = sv.MaskAnnotator().annotate(image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections, labels)
# Run RF-DETR-Seg on a single image with the `inference` package.
import requests
import supervision as sv
from PIL import Image
from inference import get_model

model = get_model("rfdetr-seg-medium")

# Bound the download with a timeout and fail on HTTP errors, so a bad
# response is reported as such instead of as an image-decoding error.
response = requests.get('https://media.roboflow.com/dog.jpg', stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Take the first element of the infer() result
# (presumably one entry per input image -- confirm against the inference docs).
predictions = model.infer(image, confidence=0.5)[0]
detections = sv.Detections.from_inference(predictions)

# Draw the masks first, then the labels. No explicit label list is passed,
# so LabelAnnotator uses whatever it derives from `detections` by default.
annotated_image = sv.MaskAnnotator().annotate(image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections)

Run on video, webcam, or RTSP stream

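These examples use OpenCV for decoding and display; they cover, in order, a video file, a webcam, and an RTSP stream. Replace <SOURCE_VIDEO_PATH>, <WEBCAM_INDEX>, and <RTSP_STREAM_URL> with your inputs. <WEBCAM_INDEX> is an integer, usually 0 for the default camera. Press q in the display window to stop.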

# Run RF-DETR-Seg on a video file and display the annotated frames.
import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.util.coco_classes import COCO_CLASSES

model = RFDETRSegMedium()

# Build the annotators once; they do not depend on the individual frame.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<SOURCE_VIDEO_PATH>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open video source: <SOURCE_VIDEO_PATH>")

try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # No frame could be read; stop the loop.
            break

        # The model is given an RGB conversion of the BGR frame.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        # Look up the COCO_CLASSES entry for each predicted class id.
        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Annotate the BGR frame, which is the one that gets displayed.
        annotated_frame = mask_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR-Seg Video", annotated_frame)
        # Stop when "q" is pressed.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the capture and close the windows even if inference raises.
    video_capture.release()
    cv2.destroyAllWindows()
import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.util.coco_classes import COCO_CLASSES

model = RFDETRSegMedium()

video_capture = cv2.VideoCapture(<WEBCAM_INDEX>)
if not video_capture.isOpened():
    raise RuntimeError("Failed to open webcam: <WEBCAM_INDEX>")

while True:
    success, frame_bgr = video_capture.read()
    if not success:
        break

    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    detections = model.predict(frame_rgb, threshold=0.5)

    labels = [
        COCO_CLASSES[class_id]
        for class_id in detections.class_id
    ]

    annotated_frame = sv.MaskAnnotator().annotate(frame_bgr, detections)
    annotated_frame = sv.LabelAnnotator().annotate(annotated_frame, detections, labels)

    cv2.imshow("RF-DETR-Seg Webcam", annotated_frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

video_capture.release()
cv2.destroyAllWindows()
# Run RF-DETR-Seg on an RTSP stream and display the annotated frames.
import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.util.coco_classes import COCO_CLASSES

model = RFDETRSegMedium()

# Build the annotators once; they do not depend on the individual frame.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<RTSP_STREAM_URL>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open RTSP stream: <RTSP_STREAM_URL>")

try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # No frame could be read; stop the loop.
            break

        # The model is given an RGB conversion of the BGR frame.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        # Look up the COCO_CLASSES entry for each predicted class id.
        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Annotate the BGR frame, which is the one that gets displayed.
        annotated_frame = mask_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR-Seg RTSP", annotated_frame)
        # Stop when "q" is pressed.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    # Release the stream and close the windows even if inference raises.
    video_capture.release()
    cv2.destroyAllWindows()