2026-04-23T15:22:59+00:00">
Skip to content

RF-DETR XLarge

License Notice

This model is licensed under the Platform Model License (PML-1.0) and requires pip install rfdetr[plus]. See the rfdetr_plus repository for license details.

Bases: RFDETR

Source code in .venv/lib/python3.10/site-packages/rfdetr_plus/models/detection.py
class RFDETRXLarge(RFDETR):
    """RF-DETR XLarge detection model variant.

    Licensed under the Platform Model License (PML-1.0); requires the
    ``rfdetr[plus]`` extra to be installed.
    """

    # Variant identifier used in checkpoints and deployment metadata.
    size: Literal["rfdetr-xlarge"] = "rfdetr-xlarge"

    def __init__(self, **kwargs: Any) -> None:
        """Initialize the model, discarding the license-acceptance flag."""
        # The license flag is consumed here because the base constructor
        # does not accept it as a keyword argument.
        if "accept_platform_model_license" in kwargs:
            del kwargs["accept_platform_model_license"]
        super().__init__(**kwargs)

    def get_model_config(self, **kwargs: Any) -> RFDETRXLargeConfig:
        """Return the architecture configuration for the XLarge variant."""
        return RFDETRXLargeConfig(**kwargs)

    def get_train_config(self, **kwargs: Any) -> TrainConfig:
        """Return the training configuration for the XLarge variant."""
        return TrainConfig(**kwargs)

Attributes

class_names property

Retrieve the class names supported by the loaded model.

Returns:

Type Description
list[str]

A list of class name strings, 0-indexed. When no custom class names are embedded in the checkpoint, returns the standard 80 COCO class names.

Functions

deploy_to_roboflow(workspace, project_id, version, api_key=None, size=None)

Deploy the trained RF-DETR model to Roboflow.

Deploying with Roboflow will create a Serverless API to which you can make requests.

You can also download weights into a Roboflow Inference deployment for use in Roboflow Workflows and on-device deployment.

Parameters:

Name Type Description Default

workspace

str

The name of the Roboflow workspace to deploy to.

required

project_id

str

The project ID to which the model will be deployed.

required

version

int | str

The project version to which the model will be deployed.

required

api_key

str | None

Your Roboflow API key. If not provided, it will be read from the environment variable ROBOFLOW_API_KEY.

None

size

str | None

The size of the model to deploy. If not provided, it will default to the size of the model being trained (e.g., "rfdetr-base", "rfdetr-large", etc.).

None

Raises:

Type Description
ValueError

If the api_key is not provided and not found in the environment variable ROBOFLOW_API_KEY, or if the size is not set for custom architectures.

Source code in src/rfdetr/detr.py
def deploy_to_roboflow(
    self,
    workspace: str,
    project_id: str,
    version: int | str,
    api_key: str | None = None,
    size: str | None = None,
) -> None:
    """Deploy the trained RF-DETR model to Roboflow.

    Deploying with Roboflow will create a Serverless API to which you can make requests.

    You can also download weights into a Roboflow Inference deployment for use in
    Roboflow Workflows and on-device deployment.

    Args:
        workspace: The name of the Roboflow workspace to deploy to.
        project_id: The project ID to which the model will be deployed.
        version: The project version to which the model will be deployed.
        api_key: Your Roboflow API key. If not provided,
            it will be read from the environment variable `ROBOFLOW_API_KEY`.
        size: The size of the model to deploy. If not provided,
            it will default to the size of the model being trained (e.g., "rfdetr-base", "rfdetr-large", etc.).

    Raises:
        ValueError: If the `api_key` is not provided and not found in the
            environment variable `ROBOFLOW_API_KEY`, or if the `size` is
            not set for custom architectures.

    """
    from roboflow import Roboflow

    if api_key is None:
        api_key = os.getenv("ROBOFLOW_API_KEY")
        if api_key is None:
            raise ValueError("Set api_key=<KEY> in deploy_to_roboflow or export ROBOFLOW_API_KEY=<KEY>")

    # Validate before any network activity so a missing size fails fast.
    if self.size is None and size is None:
        raise ValueError("Must set size for custom architectures")

    # BUG FIX: an explicitly supplied `size` must take precedence over the
    # model's own size, as documented above. The previous `self.size or size`
    # silently ignored the caller's value whenever self.size was set.
    size = size or self.size

    rf = Roboflow(api_key=api_key)
    # Use a distinct name so the `workspace` parameter (a str) is not
    # shadowed by the API workspace object.
    rf_workspace = rf.workspace(workspace)

    with tempfile.TemporaryDirectory(prefix="roboflow_upload_") as tmp_out_dir:
        # Write class_names.txt so the Roboflow upload pipeline can discover
        # the class labels without relying on args.class_names in the checkpoint.
        class_names_path = os.path.join(tmp_out_dir, "class_names.txt")
        with open(class_names_path, "w", encoding="utf-8", newline="\n") as f:
            f.write("\n".join(self.class_names))

        # Also embed class_names in the args namespace so that any code path
        # that loads the checkpoint directly (e.g. roboflow-python's second
        # fallback) can find them.  Mutating the shared SimpleNamespace is
        # intentional here: this mirrors reinitialize_detection_head(), which
        # already mutates args.num_classes in-place.
        args = self.model.args
        if not hasattr(args, "class_names") or args.class_names is None:
            args.class_names = self.class_names

        outpath = os.path.join(tmp_out_dir, "weights.pt")
        torch.save({"model": self.model.model.state_dict(), "args": args}, outpath)
        project = rf_workspace.project(project_id)
        project_version = project.version(version)
        project_version.deploy(model_type=size, model_path=tmp_out_dir, filename="weights.pt")

export(output_dir='output', infer_dir=None, simplify=False, backbone_only=False, opset_version=17, verbose=True, force=False, shape=None, batch_size=1, dynamic_batch=False, patch_size=None, format='onnx', quantization=None, calibration_data=None, max_images=100)

Export the trained model to ONNX or TFLite format.

See the [export documentation](https://rfdetr.roboflow.com/learn/export/) for more information.

Parameters:

Name Type Description Default

output_dir

str

Directory to write the exported model to.

'output'

infer_dir

str

Optional directory of sample images for dynamic-axes inference.

None

simplify

bool

Deprecated and ignored. Simplification is no longer run.

False

backbone_only

bool

Export only the backbone (feature extractor).

False

opset_version

int

ONNX opset version to target.

17

verbose

bool

Print export progress information.

True

force

bool

Deprecated and ignored.

False

shape

tuple[int, int] | None

(height, width) tuple; defaults to square at model resolution. Both dimensions must be divisible by patch_size * num_windows.

None

batch_size

int

Static batch size to bake into the ONNX graph.

1

dynamic_batch

bool

If True, export with a dynamic batch dimension so the ONNX model accepts variable batch sizes at runtime.

False

patch_size

int | None

Backbone patch size. Defaults to the value stored in model_config.patch_size (typically 14 or 16). When provided explicitly it must match the instantiated model's patch size. Shape divisibility is validated against patch_size * num_windows.

None

format

str

Export format — "onnx" (default) or "tflite". When "tflite" is selected the model is first exported to ONNX then converted to TFLite via onnx2tf. Requires pip install rfdetr[onnx,tflite].

'onnx'

quantization

str | None

TFLite quantization mode (ignored when format="onnx"). One of None, "fp32", "fp16", "int8". None / "fp32" / "fp16" produce FP32 + FP16 .tflite files; "int8" additionally produces an INT8-quantized model.

None

calibration_data

str | ndarray | None

Representative images for INT8 calibration and onnx2tf output validation. Accepts:

  • None — auto-generate random data (sufficient for fp32/fp16; warns for int8).
  • A directory path (str) containing JPEG/PNG images — the converter automatically loads, resizes, and prepares them. This is the simplest approach.
  • A path (str) to a .npy file of shape (N, H, W, 3), dtype float32, values in [0, 1].
  • A :class:numpy.ndarray with the same format.

For INT8 quantization, provide 20–100 representative images from your training/validation set for best accuracy.

None

max_images

int

Maximum number of images to load from a calibration directory. Defaults to 100. Only used when calibration_data is a directory path.

100
Source code in src/rfdetr/detr.py
@deprecated(
    target=True,
    # `simplify` / `force` are retained for API compatibility and treated as no-op.
    args_mapping={"simplify": False, "force": False},
    deprecated_in="1.6",
    remove_in="1.8",
    num_warns=1,
    stream=functools.partial(warnings.warn, category=DeprecationWarning, stacklevel=2),
)
def export(
    self,
    output_dir: str = "output",
    infer_dir: str | None = None,
    simplify: bool = False,
    backbone_only: bool = False,
    opset_version: int = 17,
    verbose: bool = True,
    force: bool = False,
    shape: tuple[int, int] | None = None,
    batch_size: int = 1,
    dynamic_batch: bool = False,
    patch_size: int | None = None,
    format: str = "onnx",
    quantization: str | None = None,
    calibration_data: str | np.ndarray | None = None,
    max_images: int = 100,
) -> None:
    """Export the trained model to ONNX or TFLite format.

    See the `export documentation <https://rfdetr.roboflow.com/learn/export/>`_
    for more information.

    Args:
        output_dir: Directory to write the exported model to.
        infer_dir: Optional directory of sample images for dynamic-axes inference.
        simplify: Deprecated and ignored. Simplification is no longer run.
        backbone_only: Export only the backbone (feature extractor).
        opset_version: ONNX opset version to target.
        verbose: Print export progress information.
        force: Deprecated and ignored.
        shape: ``(height, width)`` tuple; defaults to square at model resolution.
            Both dimensions must be divisible by ``patch_size * num_windows``.
        batch_size: Static batch size to bake into the ONNX graph.
        dynamic_batch: If True, export with a dynamic batch dimension
            so the ONNX model accepts variable batch sizes at runtime.
        patch_size: Backbone patch size. Defaults to the value stored in
            ``model_config.patch_size`` (typically 14 or 16). When provided
            explicitly it must match the instantiated model's patch size.
            Shape divisibility is validated against ``patch_size * num_windows``.
        format: Export format — ``"onnx"`` (default) or ``"tflite"``.
            When ``"tflite"`` is selected the model is first exported to ONNX
            then converted to TFLite via ``onnx2tf``.  Requires
            ``pip install rfdetr[onnx,tflite]``.
        quantization: TFLite quantization mode (ignored when
            ``format="onnx"``).  One of ``None``, ``"fp32"``, ``"fp16"``,
            ``"int8"``.  ``None`` / ``"fp32"`` / ``"fp16"`` produce FP32 +
            FP16 ``.tflite`` files; ``"int8"`` additionally produces an
            INT8-quantized model.
        calibration_data: Representative images for INT8 calibration
            and ``onnx2tf`` output validation.  Accepts:

            * ``None`` — auto-generate random data (sufficient for
              fp32/fp16; warns for int8).
            * A **directory path** (``str``) containing JPEG/PNG
              images — the converter automatically loads, resizes, and
              prepares them.  This is the simplest approach.
            * A path (``str``) to a ``.npy`` file of shape
              ``(N, H, W, 3)``, dtype float32, values in ``[0, 1]``.
            * A :class:`numpy.ndarray` with the same format.

            For INT8 quantization, provide 20–100 representative
            images from your training/validation set for best accuracy.
        max_images: Maximum number of images to load from a
            calibration directory.  Defaults to ``100``.  Only used
            when *calibration_data* is a directory path.
    """
    logger.info("Exporting model to ONNX format")
    # Validate the requested format up front, before any heavy work.
    _valid_formats = ("onnx", "tflite")
    if format not in _valid_formats:
        raise ValueError(f"Unsupported export format {format!r}. Choose from: {_valid_formats}")
    try:
        from rfdetr.export.main import export_onnx, make_infer_image
    except ImportError:
        logger.error(
            "It seems some dependencies for ONNX export are missing."
            " Please run `pip install rfdetr[onnx]` and try again.",
        )
        raise

    # Remember the live model's device so the finally block can restore it.
    device = self.model.device
    # deepcopy(self.model.model.to("cpu")) moves the live model to CPU as a
    # side-effect before copying.  The finally block guarantees the original
    # model is restored to its original device even if export or conversion
    # raises an exception (review H1).
    model = deepcopy(self.model.model.to("cpu"))
    model.to(device)
    try:
        os.makedirs(output_dir, exist_ok=True)
        output_dir_path = Path(output_dir)
        patch_size = _resolve_patch_size(patch_size, self.model_config, "export")
        num_windows = getattr(self.model_config, "num_windows", 1)
        # bool is a subclass of int, so reject it explicitly before the int check.
        if isinstance(num_windows, bool) or not isinstance(num_windows, int) or num_windows <= 0:
            raise ValueError(f"num_windows must be a positive integer, got {num_windows!r}")
        block_size = patch_size * num_windows
        if shape is None:
            shape = (self.model.resolution, self.model.resolution)
            if shape[0] % block_size != 0:
                raise ValueError(
                    f"Model's default resolution ({self.model.resolution}) is not divisible by "
                    f"block_size={block_size} (patch_size={patch_size} * num_windows={num_windows}). "
                    f"Provide an explicit shape divisible by {block_size}.",
                )
        else:
            shape = _validate_shape_dims(shape, block_size, patch_size, num_windows)

        input_tensors = make_infer_image(
            infer_dir, shape, batch_size, device, num_channels=self.model_config.num_channels
        ).to(device)
        input_names = ["input"]
        # Output names depend on the head: backbone-only exports features,
        # segmentation adds a masks output on top of dets/labels.
        if backbone_only:
            output_names = ["features"]
        elif self.model_config.segmentation_head:
            output_names = ["dets", "labels", "masks"]
        else:
            output_names = ["dets", "labels"]

        if dynamic_batch:
            dynamic_axes = {name: {0: "batch"} for name in input_names + output_names}
        else:
            dynamic_axes = None
        model.eval()
        # Dry-run a forward pass on the copy to surface shape problems (and
        # log the output shapes) before the actual ONNX export.
        with torch.no_grad():
            if backbone_only:
                features = model(input_tensors)
                logger.debug(f"PyTorch inference output shape: {features.shape}")
            elif self.model_config.segmentation_head:
                outputs = model(input_tensors)
                dets = outputs["pred_boxes"]
                labels = outputs["pred_logits"]
                masks = outputs["pred_masks"]
                # pred_masks may not be a tensor; only log its shape when it is.
                if isinstance(masks, torch.Tensor):
                    logger.debug(
                        f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}, "
                        f"Masks: {masks.shape}",
                    )
                else:
                    logger.debug(f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}")
            else:
                outputs = model(input_tensors)
                dets = outputs["pred_boxes"]
                labels = outputs["pred_logits"]
                logger.debug(f"PyTorch inference output shapes - Boxes: {dets.shape}, Labels: {labels.shape}")

        # Move the copied model and dummy input to CPU before handing off to
        # export_onnx (the export path below operates on CPU tensors).
        model.cpu()
        input_tensors = input_tensors.cpu()

        output_file = export_onnx(
            output_dir=str(output_dir_path),
            model=model,
            input_names=input_names,
            input_tensors=input_tensors,
            output_names=output_names,
            dynamic_axes=dynamic_axes,
            backbone_only=backbone_only,
            verbose=verbose,
            opset_version=opset_version,
            variant_name=getattr(self, "size", None),
        )

        logger.info(f"Successfully exported ONNX model to: {output_file}")

        if format == "tflite":
            try:
                from rfdetr.export._tflite.converter import export_tflite
            except ImportError:
                logger.error(
                    "It seems some dependencies for TFLite export are missing."
                    " Please run `pip install rfdetr[onnx,tflite]` and try again.",
                )
                raise

            # TFLite conversion consumes the ONNX artifact produced above.
            tflite_path = export_tflite(
                onnx_path=output_file,
                output_dir=str(output_dir_path),
                quantization=quantization,
                calibration_data=calibration_data,
                verbosity="info" if verbose else "error",
                max_images=max_images,
                verbose=verbose,
            )
            logger.info(f"Successfully exported TFLite model to: {tflite_path}")

        logger.info("Export completed successfully")
    finally:
        # Restore the live model to its original device regardless of outcome.
        self.model.model = self.model.model.to(device)

from_checkpoint(path, **kwargs) classmethod

Load an RF-DETR model from a training checkpoint, automatically inferring the model class.

The correct subclass is resolved in order of preference:

  1. model_name key in the checkpoint (written by the PTL training stack since v1.7.0).
  2. pretrain_weights field in the checkpoint's args entry (legacy fallback).

Both legacy argparse.Namespace checkpoints (produced by engine.py) and dict-style checkpoints (produced by the PTL training stack) are supported.

Parameters:

Name Type Description Default

path

str | PathLike[str]

Path to a checkpoint file (e.g. checkpoint_best_total.pth).

required

**kwargs

Any

Additional keyword arguments forwarded to the model constructor (e.g. accept_platform_model_license=True for XLarge / 2XLarge models).

{}

Returns:

Type Description
RFDETR

An instance of the appropriate :class:`RFDETR` subclass loaded from the checkpoint.

Warning

This method calls torch.load with weights_only=False, which unpickles arbitrary Python objects. Only load checkpoints from trusted sources.

Raises:

Type Description
FileNotFoundError

If path does not exist.

OSError

If path exists but cannot be read.

KeyError

If the checkpoint does not contain an "args" key.

ValueError

If the model class cannot be inferred from model_name or pretrain_weights.

Examples:

>>> model = RFDETR.from_checkpoint("checkpoint_best_total.pth")
>>> model = RFDETRSmall.from_checkpoint("checkpoint_best_total.pth")
Source code in src/rfdetr/detr.py
@classmethod
def from_checkpoint(cls, path: str | os.PathLike[str], **kwargs: Any) -> RFDETR:
    """Load an RF-DETR model from a training checkpoint, automatically
    inferring the model class.

    The correct subclass is resolved in order of preference:

    1. ``model_name`` key in the checkpoint (written by the PTL training
       stack since v1.7.0).
    2. ``pretrain_weights`` field in the checkpoint's ``args`` entry
       (legacy fallback).

    Both legacy ``argparse.Namespace`` checkpoints (produced by
    ``engine.py``) and dict-style checkpoints (produced by the PTL
    training stack) are supported.

    Args:
        path: Path to a checkpoint file (e.g. ``checkpoint_best_total.pth``).
        **kwargs: Additional keyword arguments forwarded to the model
            constructor (e.g. ``accept_platform_model_license=True`` for
            XLarge / 2XLarge models).

    Returns:
        An instance of the appropriate :class:`RFDETR` subclass loaded from
        the checkpoint.

    Warning:
        This method calls ``torch.load`` with ``weights_only=False``, which
        unpickles arbitrary Python objects. Only load checkpoints from
        trusted sources.

    Raises:
        FileNotFoundError: If *path* does not exist.
        OSError: If *path* exists but cannot be read.
        KeyError: If the checkpoint does not contain an ``"args"`` key.
        ValueError: If the model class cannot be inferred from
            ``model_name`` or ``pretrain_weights``.

    Examples:
        >>> model = RFDETR.from_checkpoint("checkpoint_best_total.pth")  # doctest: +SKIP
        >>> model = RFDETRSmall.from_checkpoint("checkpoint_best_total.pth")  # doctest: +SKIP
    """
    # Local import breaks the variants → detr import cycle.
    import rfdetr.variants as rfdetr_variants

    # Plus-model (xlarge/2xlarge) classes live in the optional rfdetr_plus
    # package; its absence is tolerated here and surfaced later with an
    # actionable install error only if the checkpoint actually needs it.
    _plus_available = False
    _plus_symbols: dict[str, type[RFDETR]] = {}
    _plus_entries: list[tuple[str, type[RFDETR]]] = []
    try:
        import rfdetr.platform.models as platform_models

        for class_symbol in _CHECKPOINT_PLUS_MODEL_NAME_CLASS_SYMBOLS:
            plus_obj = getattr(platform_models, class_symbol)
            _plus_symbols[class_symbol] = plus_obj
        _plus_entries = [
            (name, _plus_symbols[class_symbol]) for name, class_symbol in _CHECKPOINT_PLUS_MODEL_MAP_ENTRIES
        ]
        _plus_available = True
    except (ImportError, AttributeError):
        # Reset in case the loop above partially populated the dict.
        _plus_symbols = {}

    # weights_only=False is required because legacy checkpoints embed
    # argparse.Namespace objects that cannot be deserialised with
    # weights_only=True.
    ckpt: dict[str, Any] = torch.load(path, map_location="cpu", weights_only=False)
    args = ckpt["args"]

    # Map class names (e.g. "RFDETRSmall") to the variant classes exported
    # by rfdetr.variants.
    _variant_name_to_class: dict[str, type[RFDETR]] = {
        getattr(variant_obj, "__name__", symbol): variant_obj
        for symbol in dir(rfdetr_variants)
        if symbol.startswith("RFDETR")
        for variant_obj in [getattr(rfdetr_variants, symbol)]
    }
    _variant_symbols: dict[str, type[RFDETR]] = {
        class_symbol: _variant_name_to_class[class_symbol] for class_symbol in _CHECKPOINT_MODEL_NAME_CLASS_SYMBOLS
    }
    # Build in three explicit segments: seg-* entries, then plus-model entries
    # (xlarge/2xlarge), then base entries — order determines lookup priority.
    _seg_map: list[tuple[str, type[RFDETR]]] = [
        (name, _variant_symbols[class_symbol])
        for name, class_symbol in _CHECKPOINT_MODEL_MAP_ENTRIES
        if name.startswith("seg-")
    ]
    _base_map: list[tuple[str, type[RFDETR]]] = [
        (name, _variant_symbols[class_symbol])
        for name, class_symbol in _CHECKPOINT_MODEL_MAP_ENTRIES
        if not name.startswith("seg-")
    ]
    _model_map: list[tuple[str, type[RFDETR]]] = _seg_map + _plus_entries + _base_map

    # New checkpoints store model_name directly — use it when available.
    _name_map: dict[str, type[RFDETR]] = dict(_variant_symbols)
    # Plus-model classes are resolved only when rfdetr_plus is installed.
    if _plus_available:
        _name_map.update(_plus_symbols)
    saved_model_name = ckpt.get("model_name")
    model_cls: type[RFDETR] | None = None
    if isinstance(saved_model_name, str):
        normalized_name = saved_model_name.strip()
        if normalized_name:
            model_cls = _name_map.get(normalized_name)
    else:
        # Missing or non-string model_name — treat as absent.
        normalized_name = ""

    # Fall back to pretrain_weights filename parsing for older checkpoints.
    if isinstance(args, dict):
        weights_name = str(args.get("pretrain_weights", "")).lower()
    else:
        weights_name = str(getattr(args, "pretrain_weights", "")).lower()

    if model_cls is None:
        # Guard: plus-only checkpoints should raise an actionable install error
        # when rfdetr_plus is missing, regardless of whether class inference
        # relies on model_name (new format) or pretrain_weights (legacy format).
        plus_by_model_name = normalized_name in _CHECKPOINT_PLUS_MODEL_NAME_CLASS_SYMBOLS
        plus_by_weights_name = "xlarge" in weights_name and "seg-" not in weights_name
        if not _plus_available and (plus_by_model_name or plus_by_weights_name):
            from rfdetr.platform import _INSTALL_MSG

            raise ImportError(
                f"Checkpoint model_name={saved_model_name!r}, pretrain_weights={weights_name!r} requires the "
                f"rfdetr_plus package. " + _INSTALL_MSG.format(name="platform model downloads")
            )

        # Substring match against the ordered map — first hit wins, which is
        # why the seg-* entries are placed before the base entries above.
        for name, klass in _model_map:
            if name in weights_name:
                model_cls = klass
                break

    if model_cls is None:
        raise ValueError(
            f"Could not infer model class from checkpoint at {path!r} "
            f"(model_name={saved_model_name!r}, pretrain_weights={weights_name!r}). "
            f"Please instantiate the model class directly."
        )

    # Carry over num_classes from the checkpoint unless the caller overrode it.
    if isinstance(args, dict):
        num_classes: int | None = args.get("num_classes")
    else:
        num_classes = getattr(args, "num_classes", None)

    # pretrain_weights is placed after **kwargs so it always wins even if
    # a caller accidentally passes pretrain_weights inside kwargs.
    constructor_kwargs: dict[str, Any] = {**kwargs, "pretrain_weights": str(path)}
    if num_classes is not None and "num_classes" not in kwargs:
        constructor_kwargs["num_classes"] = num_classes

    return model_cls(**constructor_kwargs)

get_model(config)

Retrieve a model context from the provided architecture configuration.

Parameters:

Name Type Description Default

config

ModelConfig

Architecture configuration.

required

Returns:

Type Description
ModelContext

ModelContext with model, postprocess, device, resolution, args, and class_names attributes.

Source code in src/rfdetr/detr.py
def get_model(self, config: ModelConfig) -> ModelContext:
    """Build a model context from the given architecture configuration.

    Args:
        config: Architecture configuration.

    Returns:
        A ``ModelContext`` bundling the model, postprocess, device,
        resolution, args, and class_names attributes.
    """
    context = _build_model_context(config)
    return context

maybe_download_pretrain_weights()

Download pre-trained weights if they are not already downloaded.

Bare filenames (no directory component, e.g. rf-detr-base.pth) are resolved to the model cache directory — set the RF_HOME environment variable to override the location (default: ~/.roboflow/models). Resolution happens in ModelConfig.expand_path for explicitly-provided values, and here as a fallback for field defaults (which Pydantic does not validate by default).

Paths that already contain a directory component are used as-is; the parent directory is created if it does not yet exist.

Source code in src/rfdetr/detr.py
def maybe_download_pretrain_weights(self):
    """Download pre-trained weights if they are not already downloaded.

    Bare filenames (no directory component, e.g. ``rf-detr-base.pth``) are
    resolved to the model cache directory — set the ``RF_HOME`` environment
    variable to override the location (default: ``~/.roboflow/models``).
    Resolution happens in ``ModelConfig.expand_path`` for explicitly-provided
    values, and here as a fallback for field defaults (which Pydantic does not
    validate by default).

    Paths that already contain a directory component are used as-is; the
    parent directory is created if it does not yet exist.
    """
    weights_path = self.model_config.pretrain_weights
    if weights_path is None:
        return

    parent_dir = os.path.dirname(weights_path)
    if parent_dir:
        # Explicit directory component — honour it, creating the parent.
        os.makedirs(parent_dir, exist_ok=True)
    else:
        # Bare filename: a field default that skipped expand_path — resolve
        # it into the model cache directory instead.
        cache_dir = get_model_cache_dir()
        os.makedirs(cache_dir, exist_ok=True)
        weights_path = os.path.join(cache_dir, weights_path)

    self.model_config.pretrain_weights = weights_path
    download_pretrain_weights(self.model_config.pretrain_weights)

optimize_for_inference(compile=True, batch_size=1, dtype=torch.float32)

Optimize the model for inference with optional JIT compilation and dtype casting.

Operations are wrapped in the correct CUDA device context to prevent context leaks on multi-GPU setups. When compile=True the model is traced with torch.jit.trace using a dummy input of batch_size images at the model's current resolution.

Parameters:

Name Type Description Default

compile

bool

If True, trace the model with torch.jit.trace to obtain a JIT-compiled ScriptModule. Set to False for broader compatibility (e.g. models with dynamic control flow).

True

batch_size

int

Number of images the traced model will be optimized for. Ignored when compile=False.

1

dtype

dtype | str

Target floating-point dtype for the inference model. Accepts a torch.dtype directly (e.g. torch.float16) or its string name (e.g. "float16"). Defaults to torch.float32.

float32

Raises:

Type Description
TypeError

If dtype is not a torch.dtype, or if dtype is a string that does not correspond to a valid torch.dtype attribute.

Examples:

>>> from types import SimpleNamespace
>>> import torch
>>> class _TinyModel(torch.nn.Module):
...     def __init__(self):
...         super().__init__()
...         self.linear = torch.nn.Linear(1, 1)
...     def forward(self, x):
...         return {"pred_boxes": self.linear(x[:, :1, :1, :1].squeeze(-1).squeeze(-1))}
...     def export(self):
...         return None
>>> class _TinyContext:
...     def __init__(self):
...         self.device = torch.device("cpu")
...         self.resolution = 28
...         self.model = _TinyModel()
...         self.inference_model = None
>>> model = object.__new__(RFDETR)
>>> model.model_config = SimpleNamespace(num_channels=3)
>>> model.model = _TinyContext()
>>> model._is_optimized_for_inference = False
>>> model._has_warned_about_not_being_optimized_for_inference = False
>>> model._optimized_has_been_compiled = False
>>> model._optimized_batch_size = None
>>> model._optimized_resolution = None
>>> model._optimized_dtype = None
>>> model.optimize_for_inference(compile=False, dtype="float16")
>>> model._is_optimized_for_inference
True
>>> model._optimized_dtype
torch.float16
Source code in src/rfdetr/detr.py
def optimize_for_inference(
    self, compile: bool = True, batch_size: int = 1, dtype: torch.dtype | str = torch.float32
) -> None:
    """Optimize the model for inference with optional JIT compilation and dtype casting.

    Operations are wrapped in the correct CUDA device context to prevent context
    leaks on multi-GPU setups. When ``compile=True`` the model is traced with
    ``torch.jit.trace`` using a dummy input of ``batch_size`` images at the
    model's current resolution.

    Args:
        compile: If ``True``, trace the model with ``torch.jit.trace`` to obtain
            a JIT-compiled ``ScriptModule``. Set to ``False`` for broader
            compatibility (e.g. models with dynamic control flow).
        batch_size: Number of images the traced model will be optimized for.
            Ignored when ``compile=False``.
        dtype: Target floating-point dtype for the inference model. Accepts a
            ``torch.dtype`` directly (e.g. ``torch.float16``) or its string name
            (e.g. ``"float16"``). Defaults to ``torch.float32``.

    Raises:
        TypeError: If ``dtype`` is not a ``torch.dtype``, or if ``dtype`` is a
            string that does not correspond to a valid ``torch.dtype`` attribute.

    Examples:
        >>> from types import SimpleNamespace
        >>> import torch
        >>> class _TinyModel(torch.nn.Module):
        ...     def __init__(self):
        ...         super().__init__()
        ...         self.linear = torch.nn.Linear(1, 1)
        ...     def forward(self, x):
        ...         return {"pred_boxes": self.linear(x[:, :1, :1, :1].squeeze(-1).squeeze(-1))}
        ...     def export(self):
        ...         return None
        >>> class _TinyContext:
        ...     def __init__(self):
        ...         self.device = torch.device("cpu")
        ...         self.resolution = 28
        ...         self.model = _TinyModel()
        ...         self.inference_model = None
        >>> model = object.__new__(RFDETR)
        >>> model.model_config = SimpleNamespace(num_channels=3)
        >>> model.model = _TinyContext()
        >>> model._is_optimized_for_inference = False
        >>> model._has_warned_about_not_being_optimized_for_inference = False
        >>> model._optimized_has_been_compiled = False
        >>> model._optimized_batch_size = None
        >>> model._optimized_resolution = None
        >>> model._optimized_dtype = None
        >>> model.optimize_for_inference(compile=False, dtype="float16")
        >>> model._is_optimized_for_inference
        True
        >>> model._optimized_dtype
        torch.float16
    """
    if isinstance(dtype, str):
        try:
            dtype = getattr(torch, dtype)
        except AttributeError:
            raise TypeError(f"dtype must be a torch.dtype or a string name of a dtype, got {dtype!r}") from None
    if not isinstance(dtype, torch.dtype):
        raise TypeError(f"dtype must be a torch.dtype or a string name of a dtype, got {type(dtype)!r}")

    # Clear any previously optimized state before starting a new optimization run.
    self.remove_optimized_model()

    _ensure_model_on_device(self.model)
    device = self.model.device
    cuda_ctx = torch.cuda.device(device) if device.type == "cuda" else contextlib.nullcontext()

    try:
        with cuda_ctx:
            self.model.inference_model = deepcopy(self.model.model)
            self.model.inference_model.eval()
            self.model.inference_model.export()

            self.model.inference_model = self.model.inference_model.to(dtype=dtype)

            if compile:
                self.model.inference_model = torch.jit.trace(
                    self.model.inference_model,
                    torch.randn(
                        batch_size,
                        self.model_config.num_channels,
                        self.model.resolution,
                        self.model.resolution,
                        device=self.model.device,
                        dtype=dtype,
                    ),
                )
                self._optimized_has_been_compiled = True
                self._optimized_batch_size = batch_size

            # Set success flags only after all operations complete.
            self._optimized_resolution = self.model.resolution
            self._is_optimized_for_inference = True
            self._optimized_dtype = dtype
    except Exception:
        # Ensure the object is left in a consistent, unoptimized state if optimization fails.
        with contextlib.suppress(Exception):
            self.remove_optimized_model()
        raise

predict(images, threshold=0.5, shape=None, patch_size=None, include_source_image=True, **kwargs)

Performs object detection on the input images and returns bounding box predictions.

This method accepts a single image or a list of images in various formats (file path, image url, PIL Image, NumPy array, or torch.Tensor). The images should be in RGB channel order. If a torch.Tensor is provided, it must already be normalized to values in the [0, 1] range and have the shape (C, H, W).

Parameters:

Name Type Description Default

images

str | Image | ndarray | Tensor | list[str | ndarray | Image | Tensor]

A single image or a list of images to process. Images can be provided as file paths, PIL Images, NumPy arrays, or torch.Tensors.

required

threshold

float

The minimum confidence score needed to consider a detected bounding box valid.

0.5

shape

tuple[int, int] | None

Optional (height, width) tuple to resize images to before inference. When provided, overrides the model's default inference resolution. The tuple should match the resolution used when exporting the model (typically a square shape). Both dimensions must be positive integers divisible by patch_size * num_windows. Defaults to (model.resolution, model.resolution) when not set.

None

patch_size

int | None

Backbone patch size used for shape divisibility validation. Defaults to model_config.patch_size (typically 14 for large models, 16 for smaller ones). Divisibility is checked against patch_size * num_windows.

None

include_source_image

bool

Whether to attach the original image as source_image in detections.metadata. Defaults to True. Set to False to reduce memory use when source images are not needed.

True

**kwargs

Any

Additional keyword arguments.

{}

Returns:

Name Type Description
Detections | list[Detections]

A single Detections object (for a single input image) or a list of them, each containing bounding box coordinates, confidence scores, and class IDs. The data dict of each supervision.Detections object contains class_name as a string array corresponding to each detection, and source_shape as an int64 array of shape (N, 2) with [height, width] rows. source_shape is stored per detection so supervision indexing works correctly; it was previously a (height, width) Python tuple, so callers using isinstance(v, tuple) or v == (H, W) must be updated. The metadata dict contains source_image as the original uint8 image array of shape (H, W, 3) when include_source_image=True.

Note

source_image moved from detections.data to detections.metadata. Update callers reading detections.data["source_image"] to use detections.metadata["source_image"].

Raises:

Type Description
ValueError

If shape cannot be unpacked as a two-element sequence, if either dimension does not support the __index__ protocol (e.g. float) or is a bool, if either dimension is zero or negative, if either dimension is not divisible by patch_size * num_windows, or if patch_size is not a positive integer.

Source code in src/rfdetr/detr.py
def predict(
    self,
    images: str | Image.Image | np.ndarray | torch.Tensor | list[str | np.ndarray | Image.Image | torch.Tensor],
    threshold: float = 0.5,
    shape: tuple[int, int] | None = None,
    patch_size: int | None = None,
    include_source_image: bool = True,
    **kwargs: Any,
) -> sv.Detections | list[sv.Detections]:
    """Performs object detection on the input images and returns bounding box
    predictions.

    This method accepts a single image or a list of images in various formats
    (file path, image url, PIL Image, NumPy array, or torch.Tensor). The images should be in
    RGB channel order. If a torch.Tensor is provided, it must already be normalized
    to values in the [0, 1] range and have the shape (C, H, W).

    Args:
        images:
            A single image or a list of images to process. Images can be provided
            as file paths, PIL Images, NumPy arrays, or torch.Tensors.
        threshold:
            The minimum confidence score needed to consider a detected bounding box valid.
        shape:
            Optional ``(height, width)`` tuple to resize images to before inference.
            When provided, overrides the model's default inference resolution. The
            tuple should match the resolution used when exporting the model
            (typically a square shape). Both dimensions must be positive integers
            divisible by ``patch_size * num_windows``. Defaults to
            ``(model.resolution, model.resolution)`` when not set.
        patch_size:
            Backbone patch size used for shape divisibility validation. Defaults
            to ``model_config.patch_size`` (typically 14 for large models, 16 for
            smaller ones). Divisibility is checked against
            ``patch_size * num_windows``.
        include_source_image:
            Whether to attach the original image as ``source_image`` in
            ``detections.metadata``. Defaults to ``True``.  Set to ``False``
            to reduce memory use when source images are not needed.
        **kwargs:
            Additional keyword arguments.

    Returns:
        A single or multiple Detections objects, each containing bounding box
        coordinates, confidence scores, and class IDs. The ``data`` dict of
        each :class:`~supervision.Detections` object contains ``class_name``
        as a string array corresponding to each detection and ``source_shape``
        as an ``int64`` array of shape ``(N, 2)`` with ``[height, width]`` rows.
        ``source_shape`` is stored per detection so supervision indexing works
        correctly. It was previously a ``(height, width)`` Python ``tuple``;
        callers using ``isinstance(v, tuple)`` or ``v == (H, W)`` must be
        updated. The ``metadata`` dict contains ``source_image`` as the original
        ``uint8`` image array of shape ``(H, W, 3)`` when
        ``include_source_image=True``.

    Note:
        ``source_image`` moved from ``detections.data`` to
        ``detections.metadata``. Update callers reading
        ``detections.data["source_image"]`` to use
        ``detections.metadata["source_image"]``.

    Raises:
        ValueError: If ``shape`` cannot be unpacked as a two-element sequence,
            if either dimension does not support the ``__index__`` protocol
            (e.g. ``float``) or is a ``bool``, if either dimension is zero or
            negative, if either dimension is not divisible by
            ``patch_size * num_windows``, or if ``patch_size`` is not a positive
            integer.

    """
    import supervision as sv

    _ensure_model_on_device(self.model)

    patch_size = _resolve_patch_size(patch_size, self.model_config, "predict")
    num_windows = getattr(self.model_config, "num_windows", 1)
    if isinstance(num_windows, bool) or not isinstance(num_windows, int) or num_windows <= 0:
        raise ValueError(f"model_config.num_windows must be a positive integer, got {num_windows!r}")
    block_size = patch_size * num_windows

    if shape is None:
        default_res = self.model.resolution
        if default_res % block_size != 0:
            raise ValueError(
                f"Model's default resolution ({default_res}) is not divisible by "
                f"block_size={block_size} (patch_size={patch_size} * num_windows={num_windows}). "
                f"Provide an explicit shape divisible by {block_size}.",
            )
    else:
        shape = _validate_shape_dims(shape, block_size, patch_size, num_windows)

    if not self._is_optimized_for_inference and not self._has_warned_about_not_being_optimized_for_inference:
        logger.warning(
            "Model is not optimized for inference. Latency may be higher than expected."
            " You can optimize the model for inference by calling model.optimize_for_inference().",
        )
        self._has_warned_about_not_being_optimized_for_inference = True

    # Bug fix: eval() must run on every un-optimized predict call, not only the
    # first time the warning above fires — otherwise a train() -> predict()
    # sequence would run inference with dropout/batch-norm still in training
    # mode. The optimized path uses inference_model, which is already in eval.
    if not self._is_optimized_for_inference:
        self.model.model.eval()

    if not isinstance(images, list):
        images = [images]

    orig_sizes = []
    processed_images = []
    source_images = [] if include_source_image else None

    for img in images:
        if isinstance(img, str):
            if img.startswith("http"):
                img = requests.get(img, stream=True).raw
            img = Image.open(img)

        if not isinstance(img, torch.Tensor):
            if include_source_image:
                src = np.array(img)
                if src.dtype != np.uint8:
                    src = (src * 255).clip(0, 255).astype(np.uint8)
                source_images.append(src)
            img = F.to_tensor(img)
        elif include_source_image:
            source_images.append((img.permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8))

        if (img > 1).any():
            raise ValueError(
                "Image has pixel values above 1. Please ensure the image is normalized (scaled to [0, 1]).",
            )
        if (img < 0).any():
            raise ValueError(
                "Image has pixel values below 0. Please ensure the image is normalized (scaled to [0, 1]).",
            )
        if img.shape[0] != self.model_config.num_channels:
            raise ValueError(
                "Invalid tensor image shape. Tensor inputs to `predict()` must be in (C, H, W) format "
                f"with C matching the model configuration ({self.model_config.num_channels} channels). "
                f"Received tensor with shape {tuple(img.shape)}."
            )
        img_tensor = img

        h, w = img_tensor.shape[1:]
        orig_sizes.append((h, w))

        img_tensor = img_tensor.to(self.model.device)
        resize_to = list(shape) if shape is not None else [self.model.resolution, self.model.resolution]
        img_tensor = F.resize(img_tensor, resize_to)
        img_tensor = F.normalize(img_tensor, self.means, self.stds)

        processed_images.append(img_tensor)

    batch_tensor = torch.stack(processed_images)

    if self._is_optimized_for_inference:
        if (
            self._optimized_resolution != batch_tensor.shape[2]
            or self._optimized_resolution != batch_tensor.shape[3]
        ):
            # this could happen if someone manually changes self.model.resolution after optimizing the model,
            # or if predict(shape=...) is used with a shape that doesn't match the compiled square resolution.
            raise ValueError(
                f"Resolution mismatch. "
                f"Model was optimized for resolution {self._optimized_resolution}x{self._optimized_resolution}, "
                f"but got {batch_tensor.shape[2]}x{batch_tensor.shape[3]}."
                " You can explicitly remove the optimized model by calling model.remove_optimized_model().",
            )
        if self._optimized_has_been_compiled:
            if self._optimized_batch_size != batch_tensor.shape[0]:
                raise ValueError(
                    f"Batch size mismatch. "
                    f"Optimized model was compiled for batch size {self._optimized_batch_size}, "
                    f"but got {batch_tensor.shape[0]}."
                    " You can explicitly remove the optimized model by calling model.remove_optimized_model()."
                    " Alternatively, you can recompile the optimized model for a different batch size"
                    " by calling model.optimize_for_inference(batch_size=<new_batch_size>).",
                )

    with torch.no_grad():
        if self._is_optimized_for_inference:
            predictions = self.model.inference_model(batch_tensor.to(dtype=self._optimized_dtype))
        else:
            predictions = self.model.model(batch_tensor)
        if isinstance(predictions, tuple):
            return_predictions = {
                "pred_logits": predictions[1],
                "pred_boxes": predictions[0],
            }
            if len(predictions) == 3:
                return_predictions["pred_masks"] = predictions[2]
            predictions = return_predictions
        target_sizes = torch.tensor(orig_sizes, device=self.model.device)
        results = self.model.postprocess(predictions, target_sizes=target_sizes)

    model_class_names = self.class_names
    n = len(model_class_names)
    detections_list = []
    for i, result in enumerate(results):
        scores = result["scores"]
        labels = result["labels"]
        boxes = result["boxes"]

        keep = scores > threshold
        scores = scores[keep]
        labels = labels[keep]
        boxes = boxes[keep]

        if "masks" in result:
            masks = result["masks"]
            masks = masks[keep]

            detections = sv.Detections(
                xyxy=boxes.float().cpu().numpy(),
                confidence=scores.float().cpu().numpy(),
                class_id=labels.cpu().numpy(),
                mask=masks.squeeze(1).cpu().numpy(),
            )
        else:
            detections = sv.Detections(
                xyxy=boxes.float().cpu().numpy(),
                confidence=scores.float().cpu().numpy(),
                class_id=labels.cpu().numpy(),
            )

        if include_source_image:
            detections.metadata["source_image"] = source_images[i]
        detections.data["source_shape"] = np.tile(np.array(orig_sizes[i], dtype=np.int64), (len(detections), 1))

        # Attach class names so callers can map class_id → name without a
        # separate lookup.  class_id is always 0-indexed regardless of the
        # original dataset format (COCO category IDs are remapped during
        # training), so class_names[class_id] is the correct mapping.
        # Always set data["class_name"] for a consistent interface.
        #
        # RF-DETR uses num_classes + 1 logits internally; class index n is the
        # background/no-object class and is expected — map it to "__background__"
        # without warning.  Indices outside [0, n] are genuinely unexpected and
        # still produce an empty string with a one-time warning.
        class_ids = detections.class_id if detections.class_id is not None else np.array([], dtype=int)
        truly_oob = [cid for cid in class_ids if not (0 <= cid <= n)]
        if truly_oob:
            logger.warning_once(
                "predict() encountered class_id values out of range [0, %d]: %s — mapping to empty string",
                n,
                truly_oob[:5],
            )
        detections.data["class_name"] = np.array(
            [
                model_class_names[cid] if 0 <= cid < n else ("__background__" if cid == n else "")
                for cid in class_ids
            ],
            dtype=object,
        )

        detections_list.append(detections)

    return detections_list if len(detections_list) > 1 else detections_list[0]

remove_optimized_model()

Remove the optimized inference model and reset all optimization flags.

Clears model.inference_model and resets all internal state set by :meth:optimize_for_inference. Safe to call even if the model has not been optimized.

Examples:

>>> from types import SimpleNamespace
>>> import torch
>>> class _TinyModel(torch.nn.Module):
...     def __init__(self):
...         super().__init__()
...         self.linear = torch.nn.Linear(1, 1)
...     def forward(self, x):
...         return {"pred_boxes": self.linear(x[:, :1, :1, :1].squeeze(-1).squeeze(-1))}
...     def export(self):
...         return None
>>> class _TinyContext:
...     def __init__(self):
...         self.device = torch.device("cpu")
...         self.resolution = 28
...         self.model = _TinyModel()
...         self.inference_model = None
>>> model = object.__new__(RFDETR)
>>> model.model_config = SimpleNamespace(num_channels=3)
>>> model.model = _TinyContext()
>>> model._is_optimized_for_inference = False
>>> model._has_warned_about_not_being_optimized_for_inference = False
>>> model._optimized_has_been_compiled = False
>>> model._optimized_batch_size = None
>>> model._optimized_resolution = None
>>> model._optimized_dtype = None
>>> model.optimize_for_inference(compile=False)
>>> model.remove_optimized_model()
>>> model._is_optimized_for_inference
False
Source code in src/rfdetr/detr.py
def remove_optimized_model(self) -> None:
    """Discard the optimized inference model and reset all optimization state.

    Sets ``model.inference_model`` back to ``None`` and clears every internal
    flag written by :meth:`optimize_for_inference`. Idempotent: calling this
    on a model that was never optimized is a harmless no-op.
    """
    self.model.inference_model = None
    # Boolean flags go back to False...
    for flag_name in ("_is_optimized_for_inference", "_optimized_has_been_compiled"):
        setattr(self, flag_name, False)
    # ...and the recorded optimization parameters are forgotten.
    for slot_name in ("_optimized_batch_size", "_optimized_resolution", "_optimized_dtype"):
        setattr(self, slot_name, None)

train(**kwargs)

Train an RF-DETR model via the PyTorch Lightning stack.

All keyword arguments are forwarded to :meth:get_train_config to build a :class:~rfdetr.config.TrainConfig. Several kwargs are absorbed and handled specially so that existing call-sites do not break:

  • resolution — updates the model's input resolution by mutating :attr:model_config.resolution in place before the train config is built. This change persists on :attr:model_config after :meth:train returns. The value must be a positive integer divisible by patch_size * num_windows for the model variant; a :class:ValueError is raised otherwise. :attr:model_config.positional_encoding_size is also updated when the config derives it formulaically (PE == resolution // patch_size); configs with a pretrained-specific PE value (e.g. RFDETRBase uses DINOv2's PE=37 at 560 px) are left unchanged to preserve checkpoint compatibility.
  • device — normalized via :class:torch.device and mapped to PyTorch Lightning trainer arguments. "cpu" becomes accelerator="cpu"; "cuda" and "cuda:N" become accelerator="gpu" and optionally devices=[N]; "mps" becomes accelerator="mps". Other valid torch device types fall back to PTL auto-detection and emit a :class:UserWarning.
  • callbacks — if the dict contains any non-empty lists a :class:DeprecationWarning is emitted; the dict is then discarded. Use PTL :class:~pytorch_lightning.Callback objects passed via :func:~rfdetr.training.build_trainer instead.
  • start_epoch — emits :class:DeprecationWarning and is dropped.
  • do_benchmark — emits :class:DeprecationWarning and is dropped.

After training completes the underlying nn.Module is synced back onto self.model.model so that :meth:predict and :meth:export continue to work without reloading the checkpoint.

Raises:

Type Description
ImportError

If training dependencies are not installed. Install with pip install "rfdetr[train,loggers]".

ValueError

If resolution is not a positive integer or is not divisible by patch_size * num_windows for the model variant.

Source code in src/rfdetr/detr.py
def train(self, **kwargs) -> None:
    """Train an RF-DETR model via the PyTorch Lightning stack.

    All keyword arguments are forwarded to :meth:`get_train_config` to build
    a :class:`~rfdetr.config.TrainConfig`.  Several kwargs are absorbed and
    handled specially so that existing call-sites do not break:

    * ``resolution`` — updates the model's input resolution by mutating
      :attr:`model_config.resolution` in place before the train config is
      built. This change persists on :attr:`model_config` after
      :meth:`train` returns. The value must be a positive integer divisible
      by ``patch_size * num_windows`` for the model variant; a
      :class:`ValueError` is raised otherwise.
      :attr:`model_config.positional_encoding_size` is also updated when
      the config derives it formulaically (``PE == resolution //
      patch_size``); configs with a pretrained-specific PE value (e.g.
      ``RFDETRBase`` uses DINOv2's PE=37 at 518 px, training at 560 px) are
      left unchanged to preserve checkpoint compatibility.
    * ``device`` — normalized via :class:`torch.device` and mapped to PyTorch
      Lightning trainer arguments. ``"cpu"`` becomes ``accelerator="cpu"``;
      ``"cuda"`` and ``"cuda:N"`` become ``accelerator="gpu"`` and optionally
      ``devices=[N]``; ``"mps"`` becomes ``accelerator="mps"``. Other valid
      torch device types fall back to PTL auto-detection and emit a
      :class:`UserWarning`.
    * ``callbacks`` — if the dict contains any non-empty lists a
      :class:`DeprecationWarning` is emitted; the dict is then discarded.
      Use PTL :class:`~pytorch_lightning.Callback` objects passed via
      :func:`~rfdetr.training.build_trainer` instead.
    * ``start_epoch`` — emits :class:`DeprecationWarning` and is dropped.
    * ``do_benchmark`` — emits :class:`DeprecationWarning` and is dropped.

    After training completes the underlying ``nn.Module`` is synced back
    onto ``self.model.model`` so that :meth:`predict` and :meth:`export`
    continue to work without reloading the checkpoint.

    Args:
        **kwargs: Forwarded to :meth:`get_train_config` after the special
            keys documented above are absorbed.

    Raises:
        ImportError: If training dependencies are not installed. Install with
            ``pip install "rfdetr[train,loggers]"``.
        ValueError: If ``resolution`` is not a positive integer or is not
            divisible by ``patch_size * num_windows`` for the model variant.

    """
    # Both imports are grouped in a single try block because they both live in
    # the `rfdetr[train]` extras group — a missing `pytorch_lightning` (or any
    # other training-extras package) causes either import to fail, and the
    # remediation is identical: `pip install "rfdetr[train,loggers]"`.
    try:
        from rfdetr.training import RFDETRDataModule, RFDETRModelModule, build_trainer
        from rfdetr.training.auto_batch import resolve_auto_batch_config
    except ModuleNotFoundError as exc:
        # Preserve internal import errors so packaging/regression issues in
        # rfdetr.* are not misreported as missing optional extras.
        if exc.name and exc.name.startswith("rfdetr."):
            raise
        raise ImportError(
            "RF-DETR training dependencies are missing. "
            'Install them with `pip install "rfdetr[train,loggers]"` and try again.',
        ) from exc

    # Absorb legacy `callbacks` dict — warn if non-empty, then discard.
    callbacks_dict = kwargs.pop("callbacks", None)
    if callbacks_dict and any(callbacks_dict.values()):
        warnings.warn(
            "Custom callbacks dict is not forwarded to PTL. Use PTL Callback objects instead.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Parse `device` kwarg and map it to PTL accelerator/devices.
    # Supports torch-style strings and torch.device (e.g. "cuda:1").
    _device = kwargs.pop("device", None)
    _accelerator, _devices = RFDETR._resolve_trainer_device_kwargs(_device)

    # Absorb legacy `start_epoch` — PTL resumes automatically via ckpt_path.
    if "start_epoch" in kwargs:
        warnings.warn(
            "`start_epoch` is deprecated and ignored; PTL resumes automatically via `resume`.",
            DeprecationWarning,
            stacklevel=2,
        )
        kwargs.pop("start_epoch")

    # Pop `do_benchmark`; benchmarking via `.train()` is deprecated.
    run_benchmark = bool(kwargs.pop("do_benchmark", False))
    if run_benchmark:
        warnings.warn(
            "`do_benchmark` in `.train()` is deprecated; use `rfdetr benchmark`.",
            DeprecationWarning,
            stacklevel=2,
        )

    # Apply resolution override to model_config before building the train config.
    # resolution is a ModelConfig field, not a TrainConfig field, so we pop it
    # here to avoid it being silently ignored by TrainConfig.
    _resolution = kwargs.pop("resolution", None)
    if _resolution is not None:
        # bool is an int subclass, so reject it explicitly before operator.index.
        if isinstance(_resolution, bool):
            raise ValueError("resolution must be a positive integer")
        try:
            _resolution = operator.index(_resolution)
        except TypeError as error:
            raise ValueError("resolution must be a positive integer") from error
        if _resolution <= 0:
            raise ValueError("resolution must be a positive integer")
        block_size = self.model_config.patch_size * self.model_config.num_windows
        if _resolution % block_size != 0:
            raise ValueError(
                f"resolution={_resolution} is not divisible by "
                f"patch_size ({self.model_config.patch_size}) * num_windows "
                f"({self.model_config.num_windows}) = {block_size}. "
                f"Choose a resolution that is a multiple of {block_size}."
            )
        # Smart PE update: only recompute positional_encoding_size when the
        # current config derives it formulaically (PE == resolution // patch_size).
        # Configs with a pretrained-specific PE (e.g. RFDETRBase uses DINOv2's
        # PE=37 at 518 px, training at 560 px) must not have PE silently changed
        # — doing so causes shape mismatches when loading pretrained checkpoints.
        _current_pe = self.model_config.positional_encoding_size
        _derived_pe = self.model_config.resolution // self.model_config.patch_size
        if _current_pe == _derived_pe:
            # Formula-derived: update PE proportionally to the new resolution.
            new_pe = _resolution // self.model_config.patch_size
            self.model_config.positional_encoding_size = new_pe
        else:
            # Pretrained-specific PE; leave it unchanged.
            new_pe = _current_pe
        self.model_config.resolution = _resolution

        # Keep the cached inference/export context in sync with model_config so
        # predict()/export()/deployment all see the same resolution metadata.
        if hasattr(self, "model") and self.model is not None:
            if hasattr(self.model, "resolution"):
                self.model.resolution = _resolution
            model_args = getattr(self.model, "args", None)
            if model_args is not None:
                if hasattr(model_args, "resolution"):
                    model_args.resolution = _resolution
                if hasattr(model_args, "positional_encoding_size"):
                    model_args.positional_encoding_size = new_pe
    config = self.get_train_config(**kwargs)
    if config.batch_size == "auto":
        # Auto-batch probing runs forward/backward on the actual model, which
        # must be on the target device (typically CUDA).  Lazy placement keeps
        # the model on CPU until first use — move it now.
        _ensure_model_on_device(self.model)
        auto_batch = resolve_auto_batch_config(
            model_context=self.model,
            model_config=self.model_config,
            train_config=config,
        )
        config.batch_size = auto_batch.safe_micro_batch
        config.grad_accum_steps = auto_batch.recommended_grad_accum_steps
        logger.info(
            "[auto-batch] resolved train config: batch_size=%s grad_accum_steps=%s effective_batch_size=%s",
            config.batch_size,
            config.grad_accum_steps,
            auto_batch.effective_batch_size,
        )
    self.model_config.model_name = type(self).__name__

    # Auto-detect num_classes from the training dataset and align model_config.
    # This must run before RFDETRModelModule is constructed so that weight loading
    # inside the module uses the correct (dataset-derived) class count.
    dataset_dir = getattr(config, "dataset_dir", None)
    if dataset_dir:
        self._align_num_classes_from_dataset(dataset_dir)

    module = RFDETRModelModule(self.model_config, config)
    datamodule = RFDETRDataModule(self.model_config, config)

    # Guard with LOCAL_RANK env var rather than is_main_process() because torch.distributed
    # is not yet initialized here (it is set up inside trainer.fit()).  In Lightning DDP
    # subprocesses, LOCAL_RANK is set by the launcher before the subprocess calls train(),
    # so this correctly identifies rank 0 even before dist.init_process_group() runs.
    if config.save_dataset_grids and os.environ.get("LOCAL_RANK", "0") == "0":
        try:
            from rfdetr.datasets.save_grids import DatasetGridSaver

            datamodule.setup("fit")
            grids_output_dir = Path(config.output_dir) / "dataset_grids"
            DatasetGridSaver(datamodule.train_dataloader(), grids_output_dir, dataset_type="train").save_grid()
            DatasetGridSaver(datamodule.val_dataloader(), grids_output_dir, dataset_type="val").save_grid()
        except Exception:
            # Best-effort visualization only; never let grid saving abort training.
            logger.warning(
                "Failed to save dataset grids; training will continue without them.",
                exc_info=True,
            )

    trainer_kwargs = {"accelerator": _accelerator}
    if _devices is not None:
        trainer_kwargs["devices"] = _devices
    trainer = build_trainer(config, self.model_config, **trainer_kwargs)
    trainer.fit(module, datamodule, ckpt_path=config.resume or None)

    # Sync the trained weights back so predict() / export() see the updated model.
    self.model.model = module.model
    # Sync class names: prefer explicit config.class_names, otherwise fall back to dataset (#509).
    config_class_names = getattr(config, "class_names", None)
    if config_class_names is not None:
        self.model.class_names = config_class_names
    else:
        dataset_class_names = getattr(datamodule, "class_names", None)
        if dataset_class_names is not None:
            self.model.class_names = dataset_class_names

    # Save complete training configuration to disk for reproducibility.
    # Guard to main process only to avoid races in distributed/multi-GPU training.
    if is_main_process():
        complete_config = {
            "train_config": config.model_dump(),
            "model_config": self.model_config.model_dump(),
            "model_config_type": self.model_config.__class__.__name__,
            "class_names": self.model.class_names,
            "num_classes": len(self.model.class_names) if self.model.class_names else 0,
        }
        try:
            os.makedirs(config.output_dir, exist_ok=True)
            # Explicit encoding (PEP 597): don't depend on the platform locale.
            with open(os.path.join(config.output_dir, "training_config.json"), "w", encoding="utf-8") as f:
                json.dump(complete_config, f, indent=2, default=str)
        except OSError as exc:
            logger.warning("Could not save training_config.json to %s: %s", config.output_dir, exc)