RT-DETR使用教程: RT-DETR使用教程
《Poly Kernel Inception Network for Remote Sensing Detection》
一、 模块介绍
遥感影像 (RSI) 中的目标检测通常面临一些日益严峻的挑战,包括目标尺度的巨大变化和多样的上下文环境。以往的方法试图通过大核卷积或空洞卷积来扩大主干网络的空间感受野,以应对这些挑战。然而,前者通常会引入相当多的背景噪声,而后者则可能生成过于稀疏的特征表示。在本文中,我们提出了 Poly Kernel Inception Network (PKINet) 来应对上述挑战。PKINet 采用不带空洞(膨胀)的多尺度卷积核来提取不同尺度的目标特征并捕获局部上下文。此外,还并行引入了上下文锚点注意力 (CAA) 模块,以捕获长距离上下文信息。
二、 加入到RT-DETR中
2.1 创建脚本文件
2.2 复制代码
import torch
import torch.nn as nn
from typing import Optional, Sequence
from mmengine.model import BaseModule
from mmcv.cnn import ConvModule, build_norm_layer
from ultralytics.nn.modules.conv import autopad
from torch.cuda.amp import autocast
from timm.models.layers import DropPath
# -------------------------PKI----------------------------------
def make_divisible(value, divisor, min_value=None, min_ratio=0.9):
    """Round a channel number to a nearby multiple of ``divisor``.

    This function rounds the channel number to the nearest value that can be
    divisible by the divisor. It is taken from the original tf repo. It ensures
    that all layers have a channel number that is divisible by divisor. It can
    be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa

    Args:
        value (int, float): The original channel number.
        divisor (int): The divisor to fully divide the channel number.
        min_value (int): The minimum value of the output channel.
            Default: None, means that the minimum value equal to the divisor.
        min_ratio (float): The minimum ratio of the rounded channel number to
            the original channel number. Default: 0.9.

    Returns:
        int: The modified output channel number.
    """
    if min_value is None:
        min_value = divisor
    # Round half-up to a multiple of ``divisor``, but never below ``min_value``.
    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than (1-min_ratio).
    if new_value < min_ratio * value:
        new_value += divisor
    return new_value
class BCHW2BHWC(nn.Module):
    """Permute a 4-D tensor from (B, C, H, W) layout to (B, H, W, C) layout."""

    def __init__(self):
        # nn.Module must be initialised before the instance can be called.
        super().__init__()

    @staticmethod
    def forward(x):
        """Return ``x`` with its dimensions reordered as ``[0, 2, 3, 1]``."""
        return x.permute([0, 2, 3, 1])
class BHWC2BCHW(nn.Module):
    """Permute a 4-D tensor from (B, H, W, C) layout to (B, C, H, W) layout."""

    def __init__(self):
        # nn.Module must be initialised before the instance can be called.
        super().__init__()

    @staticmethod
    def forward(x):
        """Return ``x`` with its dimensions reordered as ``[0, 3, 1, 2]``."""
        return x.permute([0, 3, 1, 2])
class GSiLU(BaseModule):
    """Global Sigmoid-Gated Linear Unit, reproduced from paper <SIMPLE CNN FOR VISION>"""

    def __init__(self):
        # The parent module must be initialised before sub-modules are assigned.
        super().__init__()
        # Output size 1 collapses each channel's spatial map to a single value.
        self.adpool = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        """Scale ``x`` by the sigmoid of its globally average-pooled value."""
        return x * torch.sigmoid(self.adpool(x))
class CAA(BaseModule):
    """Context Anchor Attention.

    Builds a sigmoid attention map from the input by chaining: 7x7 average
    pooling -> 1x1 conv -> horizontal (1 x k) depthwise conv -> vertical
    (k x 1) depthwise conv -> 1x1 conv -> sigmoid.

    Args:
        channels (int): Number of input (and output) channels.
        h_kernel_size (int): Width of the horizontal strip kernel. Default: 11.
        v_kernel_size (int): Height of the vertical strip kernel. Default: 11.
        norm_cfg (dict, optional): Norm config for the two 1x1 convs;
            ``None`` disables normalization.
        act_cfg (dict, optional): Activation config for the two 1x1 convs;
            ``None`` disables the activation.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            channels: int,
            h_kernel_size: int = 11,
            v_kernel_size: int = 11,
            # NOTE: the dict defaults are shared between calls but only read,
            # never mutated, in this class. They cannot be replaced by ``None``
            # because ``None`` already means "no norm / no activation".
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        # kernel 7, stride 1, padding 3: spatial size is preserved.
        self.avg_pool = nn.AvgPool2d(7, 1, 3)
        self.conv1 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        # Depthwise strip convolutions (groups == channels); padding k // 2
        # keeps the spatial size for odd kernel sizes.
        self.h_conv = ConvModule(channels, channels, (1, h_kernel_size), 1,
                                 (0, h_kernel_size // 2), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.v_conv = ConvModule(channels, channels, (v_kernel_size, 1), 1,
                                 (v_kernel_size // 2, 0), groups=channels,
                                 norm_cfg=None, act_cfg=None)
        self.conv2 = ConvModule(channels, channels, 1, 1, 0,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.act = nn.Sigmoid()

    def forward(self, x):
        """Return the attention factor (values in (0, 1)) computed from ``x``."""
        attn_factor = self.act(self.conv2(self.v_conv(self.h_conv(self.conv1(self.avg_pool(x))))))
        return attn_factor
class ConvFFN(BaseModule):
    """Multi-layer perceptron implemented with ConvModule.

    The layer stack is: 1x1 conv (expand to hidden channels) -> depthwise
    k x k conv (no activation) -> 1x1 conv (project to output channels).

    Args:
        in_channels (int): Number of input channels.
        out_channels (int, optional): Number of output channels. Falls back
            to ``in_channels`` when ``None`` (or 0).
        hidden_channels_scale (float): Hidden width as a multiple of
            ``in_channels``. Default: 4.0.
        hidden_kernel_size (int): Kernel size of the depthwise conv. Default: 3.
        dropout_rate (float): Accepted for interface compatibility; no dropout
            layer is built in this implementation.
            NOTE(review): confirm against the reference PKINet code whether
            dropout layers were intentionally removed.
        add_identity (bool): Whether to add the input to the output.
            Default: True.
        norm_cfg (dict, optional): Norm config for every conv; ``None``
            disables normalization.
        act_cfg (dict, optional): Activation config for the two 1x1 convs;
            ``None`` disables the activation.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            hidden_channels_scale: float = 4.0,
            hidden_kernel_size: int = 3,
            dropout_rate: float = 0.,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = int(in_channels * hidden_channels_scale)

        self.ffn_layers = nn.Sequential(
            ConvModule(in_channels, hidden_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
            # Depthwise conv: groups == hidden_channels, deliberately without activation.
            ConvModule(hidden_channels, hidden_channels, kernel_size=hidden_kernel_size, stride=1,
                       padding=hidden_kernel_size // 2, groups=hidden_channels,
                       norm_cfg=norm_cfg, act_cfg=None),
            ConvModule(hidden_channels, out_channels, kernel_size=1, stride=1, padding=0,
                       norm_cfg=norm_cfg, act_cfg=act_cfg),
        )
        self.add_identity = add_identity

    def forward(self, x):
        """Apply the FFN; with ``add_identity`` the input is added to the result.

        The residual sum needs the FFN output to be shape-compatible with
        ``x`` (i.e. ``out_channels == in_channels``).
        """
        x = x + self.ffn_layers(x) if self.add_identity else self.ffn_layers(x)
        return x
class Stem(BaseModule):
    """Stem layer.

    A stride-2 3x3 conv followed by two stride-1 3x3 convs.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        expansion (float): Ratio used to derive the hidden width from
            ``out_channels``; the result is rounded to a multiple of 8.
            Default: 1.0.
        norm_cfg (dict, optional): Norm config for every conv.
        act_cfg (dict, optional): Activation config for every conv.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            expansion: float = 1.0,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        # The only stride-2 conv of the stem: this is where downsampling happens.
        self.down_conv = ConvModule(in_channels, hidden_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv1 = ConvModule(hidden_channels, hidden_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.conv2 = ConvModule(hidden_channels, out_channels, kernel_size=3, stride=1, padding=1,
                                norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        """Run ``x`` through ``down_conv``, ``conv1`` and ``conv2`` in order."""
        return self.conv2(self.conv1(self.down_conv(x)))
class DownSamplingLayer(BaseModule):
    """Down sampling layer.

    A single 3x3 conv with stride 2.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int, optional): Number of output channels. Falls back
            to ``in_channels * 2`` when ``None`` (or 0).
        norm_cfg (dict, optional): Norm config for the conv.
        act_cfg (dict, optional): Activation config for the conv.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or (in_channels * 2)

        self.down_conv = ConvModule(in_channels, out_channels, kernel_size=3, stride=2, padding=1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        """Apply the stride-2 convolution to ``x``."""
        return self.down_conv(x)
class InceptionBottleneck(BaseModule):
    """Bottleneck with Inception module.

    A 1x1 ``pre_conv`` feeds a first depthwise conv, whose output is summed
    with four further depthwise convs of different kernel sizes applied to
    it. The sum passes through a 1x1 ``pw_conv``, is gated by the optional
    CAA attention factor, and is projected by a 1x1 ``post_conv``.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int, optional): Number of output channels. Falls back
            to ``in_channels`` when ``None`` (or 0).
        kernel_sizes (Sequence[int]): Kernel sizes of the five depthwise
            convs; exactly indices 0-4 are read. Default: (3, 5, 7, 9, 11).
        dilations (Sequence[int]): Dilation of each depthwise conv; indices
            0-4 are read. Default: (1, 1, 1, 1, 1).
        expansion (float): Ratio used to derive the hidden width from
            ``out_channels``; the result is rounded to a multiple of 8.
        add_identity (bool): Request the residual form of the gating; it is
            only honoured when ``in_channels == out_channels``.
        with_caa (bool): Whether to build the CAA attention branch.
        caa_kernel_size (int): Strip kernel size used by CAA (both axes).
        norm_cfg (dict, optional): Norm config for the 1x1 convs.
        act_cfg (dict, optional): Activation config for the 1x1 convs.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            expansion: float = 1.0,
            add_identity: bool = True,
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        self.pre_conv = ConvModule(in_channels, hidden_channels, 1, 1, 0, 1,
                                   norm_cfg=norm_cfg, act_cfg=act_cfg)

        # Five depthwise convs (groups == hidden_channels) without norm or
        # activation; autopad derives the padding from kernel size and dilation.
        self.dw_conv = ConvModule(hidden_channels, hidden_channels, kernel_sizes[0], 1,
                                  autopad(kernel_sizes[0], None, dilations[0]), dilations[0],
                                  groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv1 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[1], 1,
                                   autopad(kernel_sizes[1], None, dilations[1]), dilations[1],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv2 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[2], 1,
                                   autopad(kernel_sizes[2], None, dilations[2]), dilations[2],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv3 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[3], 1,
                                   autopad(kernel_sizes[3], None, dilations[3]), dilations[3],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.dw_conv4 = ConvModule(hidden_channels, hidden_channels, kernel_sizes[4], 1,
                                   autopad(kernel_sizes[4], None, dilations[4]), dilations[4],
                                   groups=hidden_channels, norm_cfg=None, act_cfg=None)
        self.pw_conv = ConvModule(hidden_channels, hidden_channels, 1, 1, 0, 1,
                                  norm_cfg=norm_cfg, act_cfg=act_cfg)

        if with_caa:
            # The explicit ``None, None`` builds CAA without norm/activation.
            self.caa_factor = CAA(hidden_channels, caa_kernel_size, caa_kernel_size, None, None)
        else:
            self.caa_factor = None

        self.add_identity = add_identity and in_channels == out_channels

        self.post_conv = ConvModule(hidden_channels, out_channels, 1, 1, 0, 1,
                                    norm_cfg=norm_cfg, act_cfg=act_cfg)

    def forward(self, x):
        """Apply the multi-kernel bottleneck to ``x`` and return the result."""
        x = self.pre_conv(x)

        y = x  # if there is an inplace operation of x, use y = x.clone() instead of y = x
        x = self.dw_conv(x)
        x = x + self.dw_conv1(x) + self.dw_conv2(x) + self.dw_conv3(x) + self.dw_conv4(x)
        x = self.pw_conv(x)
        # Without CAA, ``y`` stays the raw ``pre_conv`` output and acts as the gate.
        if self.caa_factor is not None:
            y = self.caa_factor(y)
        if self.add_identity:
            # Residual gating: x + x * y.
            y = x * y
            x = x + y
        else:
            # Plain gating: x * y.
            x = x * y
        x = self.post_conv(x)
        return x
class PKIBlock(BaseModule):
    """Poly Kernel Inception Block.

    Two sub-layers applied in sequence, each preceded by a norm layer:
    an ``InceptionBottleneck`` and a ``ConvFFN``. Each sub-layer output is
    optionally scaled by a learnable per-channel factor, passed through
    drop-path, and optionally added to its input.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int, optional): Number of output channels. Falls back
            to ``in_channels`` when ``None`` (or 0).
        kernel_sizes (Sequence[int]): Depthwise kernel sizes for the
            bottleneck. Default: (3, 5, 7, 9, 11).
        dilations (Sequence[int]): Depthwise dilations for the bottleneck.
        with_caa (bool): Whether the bottleneck builds its CAA branch.
        caa_kernel_size (int): Strip kernel size used by CAA.
        expansion (float): Ratio used to derive the hidden width from
            ``out_channels``; the result is rounded to a multiple of 8.
        ffn_scale (float): Hidden-width multiplier of the FFN.
        ffn_kernel_size (int): Depthwise kernel size of the FFN.
        dropout_rate (float): Forwarded to ``ConvFFN``.
        drop_path_rate (float): Drop-path probability; values <= 0 use
            ``nn.Identity``.
        layer_scale (float, optional): Initial value of the learnable scale
            factors; a falsy value (``None`` or 0) disables layer scaling.
        add_identity (bool): Request residual connections; only honoured
            when ``in_channels == out_channels``.
        norm_cfg (dict, optional): Norm config; when ``None`` the two block
            norms fall back to ``nn.BatchNorm2d``.
        act_cfg (dict, optional): Activation config for the bottleneck.
        init_cfg (dict, optional): Initialization config forwarded to
            ``BaseModule``. Default: None.
    """

    def __init__(
            self,
            in_channels: int,
            out_channels: Optional[int] = None,
            kernel_sizes: Sequence[int] = (3, 5, 7, 9, 11),
            dilations: Sequence[int] = (1, 1, 1, 1, 1),
            with_caa: bool = True,
            caa_kernel_size: int = 11,
            expansion: float = 1.0,
            ffn_scale: float = 4.0,
            ffn_kernel_size: int = 3,
            dropout_rate: float = 0.,
            drop_path_rate: float = 0.,
            layer_scale: Optional[float] = 1.0,
            add_identity: bool = True,
            norm_cfg: Optional[dict] = dict(type='BN', momentum=0.03, eps=0.001),
            act_cfg: Optional[dict] = dict(type='SiLU'),
            init_cfg: Optional[dict] = None,
    ):
        super().__init__(init_cfg)
        out_channels = out_channels or in_channels
        hidden_channels = make_divisible(int(out_channels * expansion), 8)

        if norm_cfg is not None:
            # build_norm_layer returns (name, layer); only the layer is kept.
            self.norm1 = build_norm_layer(norm_cfg, in_channels)[1]
            self.norm2 = build_norm_layer(norm_cfg, hidden_channels)[1]
        else:
            self.norm1 = nn.BatchNorm2d(in_channels)
            self.norm2 = nn.BatchNorm2d(hidden_channels)

        self.block = InceptionBottleneck(in_channels, hidden_channels, kernel_sizes, dilations,
                                         expansion=1.0, add_identity=True,
                                         with_caa=with_caa, caa_kernel_size=caa_kernel_size,
                                         norm_cfg=norm_cfg, act_cfg=act_cfg)
        self.ffn = ConvFFN(hidden_channels, out_channels, ffn_scale, ffn_kernel_size, dropout_rate, add_identity=False,
                           norm_cfg=None, act_cfg=None)
        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity()

        self.layer_scale = layer_scale
        if self.layer_scale:
            # One learnable factor per channel for each sub-layer output.
            self.gamma1 = nn.Parameter(layer_scale * torch.ones(hidden_channels), requires_grad=True)
            self.gamma2 = nn.Parameter(layer_scale * torch.ones(out_channels), requires_grad=True)
        # NOTE(review): the first residual sum adds a ``hidden_channels``
        # tensor to the ``in_channels`` input, so it also needs
        # in_channels == hidden_channels (true for expansion == 1.0 with
        # channel counts divisible by 8) -- confirm before using another
        # expansion together with add_identity.
        self.add_identity = add_identity and in_channels == out_channels

    def forward(self, x):
        """Apply the bottleneck and FFN sub-layers to ``x`` and return the result."""
        if self.layer_scale:
            # unsqueeze(-1) twice turns the (C,) scale into (C, 1, 1) so it
            # broadcasts over the spatial dimensions.
            if self.add_identity:
                x = x + self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = x + self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.gamma1.unsqueeze(-1).unsqueeze(-1) * self.block(self.norm1(x)))
                x = self.drop_path(self.gamma2.unsqueeze(-1).unsqueeze(-1) * self.ffn(self.norm2(x)))
        else:
            if self.add_identity:
                x = x + self.drop_path(self.block(self.norm1(x)))
                x = x + self.drop_path(self.ffn(self.norm2(x)))
            else:
                x = self.drop_path(self.block(self.norm1(x)))
                x = self.drop_path(self.ffn(self.norm2(x)))
        return x
2.3 更改tasks.py文件(ultralytics/nn/tasks.py)
from ultralytics.nn.blocks import *
elif m is PKIBlock:
c2 = args[0]
args = [ch[f], *args]
2.4 更改yaml文件
# Ultralytics YOLO 🚀, AGPL-3.0 license
# RT-DETR-l object detection model with P3-P5 outputs. For details see https://docs.ultralytics.com/models/rtdetr

# Parameters
nc: 80 # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n-cls.yaml' will call yolov8-cls.yaml with scale 'n'
  # [depth, width, max_channels]
  l: [1.00, 1.00, 1024]

backbone:
  # [from, repeats, module, args]
  - [-1, 1, HGStem, [32, 48]] # 0-P2/4
  - [-1, 6, HGBlock, [48, 128, 3]] # stage 1

  - [-1, 1, DWConv, [128, 3, 2, 1, False]] # 2-P3/8
  - [-1, 6, HGBlock, [96, 512, 3]] # stage 2

  - [-1, 1, DWConv, [512, 3, 2, 1, False]] # 4-P3/16
  - [-1, 2, PKIBlock, [512]] # 5, PKIBlock args: [c2] (output channels)
  - [-1, 6, HGBlock, [192, 1024, 5, True, True]] # HGBlock args: cm, c2, k, light, shortcut
  - [-1, 6, HGBlock, [192, 1024, 5, True, True]] # stage 3

  - [-1, 1, DWConv, [1024, 3, 2, 1, False]] # 8-P4/32
  - [-1, 6, HGBlock, [384, 2048, 5, True, False]] # stage 4

head:
  - [-1, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 10 input_proj.2
  - [-1, 1, AIFI, [1024, 8]]
  - [-1, 1, Conv, [256, 1, 1]] # 12, Y5, lateral_convs.0

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [7, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 14 input_proj.1
  - [[-2, -1], 1, Concat, [1]]
  - [-1, 3, RepC3, [256]] # 16, fpn_blocks.0
  - [-1, 1, Conv, [256, 1, 1]] # 17, Y4, lateral_convs.1

  - [-1, 1, nn.Upsample, [None, 2, "nearest"]]
  - [3, 1, Conv, [256, 1, 1, None, 1, 1, False]] # 19 input_proj.0
  - [[-2, -1], 1, Concat, [1]] # cat backbone P4
  - [-1, 3, RepC3, [256]] # X3 (21), fpn_blocks.1

  - [-1, 1, Conv, [256, 3, 2]] # 22, downsample_convs.0
  - [[-1, 17], 1, Concat, [1]] # cat Y4
  - [-1, 3, RepC3, [256]] # F4 (24), pan_blocks.0

  - [-1, 1, Conv, [256, 3, 2]] # 25, downsample_convs.1
  - [[-1, 12], 1, Concat, [1]] # cat Y5
  - [-1, 3, RepC3, [256]] # F5 (27), pan_blocks.1

  - [[21, 24, 27], 1, RTDETRDecoder, [nc]] # Detect(P3, P4, P5)
2.5 修改train.py文件
import os

# Allow duplicate OpenMP runtimes to coexist. This is set before importing
# ultralytics so the variable is already present when that import loads its
# native libraries.
# NOTE(review): this is a workaround that masks a conflicting-runtime
# installation -- confirm it is really needed in your environment.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

from ultralytics.models import RTDETR  # noqa: E402  (must follow the env setup above)

if __name__ == '__main__':
    # Build the model from the YAML definition (no pretrained weights are loaded).
    model = RTDETR(model='ultralytics/cfg/models/rt-detr/rtdetr-l.yaml')
    # model.load('yolov8n.pt')
    # Short smoke-test run (2 epochs, batch size 1) on GPU 0; raise
    # epochs/batch for real training.
    model.train(data='./data.yaml', epochs=2, batch=1, device='0', imgsz=640, workers=2, cache=False,
                amp=True, mosaic=False, project='runs/train', name='exp')