diff --git a/.travis.yml b/.travis.yml
index e14af9c0bcd1b58adefae850430adfea445beb0f..27afe1fd6f5e63a4ec618e2278d54d9d86f265ad 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,11 +2,19 @@
 dist: bionic # ubuntu 18.04
 language: python
 python:
-  - "3.5"
   - "3.6"
   - "3.7"

-env: CUDA=10.1.105-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 FORCE_CUDA=1
+env:
+  global:
+    - CUDA=10.1.105-1
+    - CUDA_SHORT=10.1
+    - UBUNTU_VERSION=ubuntu1804
+    - FORCE_CUDA=1
+  matrix:
+    - TORCH=1.3.1 TORCHVISION=0.4.2 CUDA_ARCH=6.0
+    - TORCH=1.5.0 TORCHVISION=0.6.0 CUDA_ARCH=7.0
+
 cache: pip

 # Ref to CUDA installation in Travis: https://github.com/jeremad/cuda-travis
@@ -25,7 +33,7 @@ before_install:

 install:
   - pip install Pillow==6.2.2 # remove this line when torchvision>=0.5
-  - pip install torch==1.2 torchvision==0.4.0 # TODO: fix CI for pytorch>1.2
+  - pip install torch==${TORCH} torchvision==${TORCHVISION}
   - pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI"
   - pip install -r requirements.txt

@@ -36,7 +44,7 @@ before_script:

 script:
   - python setup.py check -m -s
-  - python setup.py build_ext --inplace
+  - TORCH_CUDA_ARCH_LIST="${CUDA_ARCH}" python setup.py build_ext --inplace
   - coverage run --branch --source mmdet -m py.test -v --xdoctest-modules tests mmdet

 after_success:
diff --git a/mmdet/models/mask_heads/fcn_mask_head.py b/mmdet/models/mask_heads/fcn_mask_head.py
index 30090d52bfc614adf50debed8ac7d7493446ed75..62849a5824fa6432623043f305fa270295fcb8a9 100644
--- a/mmdet/models/mask_heads/fcn_mask_head.py
+++ b/mmdet/models/mask_heads/fcn_mask_head.py
@@ -2,12 +2,12 @@ import numpy as np
 import pycocotools.mask as mask_util
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.nn.modules.utils import _pair

 from mmdet.core import auto_fp16, force_fp32, mask_target
 from mmdet.ops import Conv2d, ConvModule, build_upsample_layer
 from mmdet.ops.carafe import CARAFEPack
-from mmdet.ops.grid_sampler import grid_sample
 from ..builder import HEADS, build_loss

 BYTES_PER_FLOAT = 4
@@ -302,7 +302,7 @@ def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
     gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
     grid = torch.stack([gx, gy], dim=3)

-    img_masks = grid_sample(
+    img_masks = F.grid_sample(
         masks.to(dtype=torch.float32), grid, align_corners=False)

     if skip_empty:
diff --git a/mmdet/ops/affine_grid/__init__.py b/mmdet/ops/affine_grid/__init__.py
deleted file mode 100644
index 8530ade33843475b8bb8b90bae9636323939b78d..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .affine_grid import affine_grid
-
-__all__ = ['affine_grid']
diff --git a/mmdet/ops/affine_grid/affine_grid.py b/mmdet/ops/affine_grid/affine_grid.py
deleted file mode 100644
index 7c24fa7991e04b22abaa08ac15a4fecd2941bf79..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/affine_grid.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.autograd import Function
-from torch.autograd.function import once_differentiable
-
-from .
import affine_grid_ext - - -class _AffineGridGenerator(Function): - - @staticmethod - def forward(ctx, theta, size, align_corners): - - ctx.save_for_backward(theta) - ctx.size = size - ctx.align_corners = align_corners - - func = affine_grid_ext.affine_grid_generator_forward - - output = func(theta, size, align_corners) - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - theta = ctx.saved_tensors - size = ctx.size - align_corners = ctx.align_corners - - func = affine_grid_ext.affine_grid_generator_backward - - grad_input = func(grad_output, theta, size, align_corners) - - return grad_input, None, None - - -def affine_grid(theta, size, align_corners=False): - if torch.__version__ >= '1.3': - return F.affine_grid(theta, size, align_corners) - elif align_corners: - return F.affine_grid(theta, size) - else: - # enforce floating point dtype on theta - if not theta.is_floating_point(): - raise ValueError( - 'Expected theta to have floating point type, but got {}'. - format(theta.dtype)) - # check that shapes and sizes match - if len(size) == 4: - if theta.dim() != 3 or theta.size(-2) != 2 or theta.size(-1) != 3: - raise ValueError( - 'Expected a batch of 2D affine matrices of shape Nx2x3 ' - 'for size {}. Got {}.'.format(size, theta.shape)) - elif len(size) == 5: - if theta.dim() != 3 or theta.size(-2) != 3 or theta.size(-1) != 4: - raise ValueError( - 'Expected a batch of 3D affine matrices of shape Nx3x4 ' - 'for size {}. Got {}.'.format(size, theta.shape)) - else: - raise NotImplementedError( - 'affine_grid only supports 4D and 5D sizes, ' - 'for 2D and 3D affine transforms, respectively. ' - 'Got size {}.'.format(size)) - if min(size) <= 0: - raise ValueError( - 'Expected non-zero, positive output size. Got {}'.format(size)) - return _AffineGridGenerator.apply(theta, size, align_corners) diff --git a/mmdet/ops/affine_grid/src/affine_grid_ext.cpp b/mmdet/ops/affine_grid/src/affine_grid_ext.cpp deleted file mode 100644 index cc5c80d780cf85f80e113d1b8857ed829fda9c47..0000000000000000000000000000000000000000 --- a/mmdet/ops/affine_grid/src/affine_grid_ext.cpp +++ /dev/null @@ -1,23 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AffineGridGenerator.cpp -#include <ATen/ATen.h> -#include <ATen/NativeFunctions.h> -#include <torch/extension.h> - -namespace mmdetection { - -using namespace at; - -Tensor affine_grid_generator_forward(const Tensor &theta, IntArrayRef size, - bool align_corners); - -Tensor affine_grid_generator_backward(const Tensor &grad, IntArrayRef size, - bool align_corners); - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { -m.def("affine_grid_generator_forward", &affine_grid_generator_forward, -"affine_grid_generator_forward"); -m.def("affine_grid_generator_backward", &affine_grid_generator_backward, -"affine_grid_generator_backward"); -} - -} // namespace mmdetection diff --git a/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp b/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp deleted file mode 100644 index 51434604fd6e91f821ea65893190086a040fe7b7..0000000000000000000000000000000000000000 --- a/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp +++ /dev/null @@ -1,108 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AffineGridGenerator.cpp -#include <ATen/ATen.h> -#include <ATen/NativeFunctions.h> -#include <torch/extension.h> - -namespace mmdetection { - -using namespace at; - -at::Tensor linspace_from_neg_one(const Tensor& grid, int64_t num_steps, 
- bool align_corners) { - if (num_steps <= 1) { - return at::tensor(0, grid.options()); - } - auto range = at::linspace(-1, 1, num_steps, grid.options()); - if (!align_corners) { - range = range * (num_steps - 1) / num_steps; - } - return range; -} - -Tensor make_base_grid_4D(const Tensor& theta, int64_t N, int64_t C, int64_t H, - int64_t W, bool align_corners) { - auto base_grid = at::empty({N, H, W, 3}, theta.options()); - - base_grid.select(-1, 0).copy_(linspace_from_neg_one(theta, W, align_corners)); - base_grid.select(-1, 1).copy_( - linspace_from_neg_one(theta, H, align_corners).unsqueeze_(-1)); - base_grid.select(-1, 2).fill_(1); - - return base_grid; -} - -Tensor make_base_grid_5D(const Tensor& theta, int64_t N, int64_t C, int64_t D, - int64_t H, int64_t W, bool align_corners) { - auto base_grid = at::empty({N, D, H, W, 4}, theta.options()); - - base_grid.select(-1, 0).copy_(linspace_from_neg_one(theta, W, align_corners)); - base_grid.select(-1, 1).copy_( - linspace_from_neg_one(theta, H, align_corners).unsqueeze_(-1)); - base_grid.select(-1, 2).copy_(linspace_from_neg_one(theta, D, align_corners) - .unsqueeze_(-1) - .unsqueeze_(-1)); - base_grid.select(-1, 3).fill_(1); - - return base_grid; -} - -Tensor affine_grid_generator_4D_forward(const Tensor& theta, int64_t N, - int64_t C, int64_t H, int64_t W, - bool align_corners) { - Tensor base_grid = make_base_grid_4D(theta, N, C, H, W, align_corners); - auto grid = base_grid.view({N, H * W, 3}).bmm(theta.transpose(1, 2)); - return grid.view({N, H, W, 2}); -} - -Tensor affine_grid_generator_5D_forward(const Tensor& theta, int64_t N, - int64_t C, int64_t D, int64_t H, - int64_t W, bool align_corners) { - Tensor base_grid = make_base_grid_5D(theta, N, C, D, H, W, align_corners); - auto grid = base_grid.view({N, D * H * W, 4}).bmm(theta.transpose(1, 2)); - return grid.view({N, D, H, W, 3}); -} - -Tensor affine_grid_generator_forward(const Tensor& theta, IntArrayRef size, - bool align_corners) { - if (size.size() == 4) { - return affine_grid_generator_4D_forward(theta, size[0], size[1], size[2], - size[3], align_corners); - } else { - return affine_grid_generator_5D_forward(theta, size[0], size[1], size[2], - size[3], size[4], align_corners); - } -} - -Tensor affine_grid_generator_4D_backward(const Tensor& grad_grid, int64_t N, - int64_t C, int64_t H, int64_t W, - bool align_corners) { - auto base_grid = make_base_grid_4D(grad_grid, N, C, H, W, align_corners); - AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, H, W, 2})); - auto grad_theta = base_grid.view({N, H * W, 3}) - .transpose(1, 2) - .bmm(grad_grid.view({N, H * W, 2})); - return grad_theta.transpose(1, 2); -} - -Tensor affine_grid_generator_5D_backward(const Tensor& grad_grid, int64_t N, - int64_t C, int64_t D, int64_t H, - int64_t W, bool align_corners) { - auto base_grid = make_base_grid_5D(grad_grid, N, C, D, H, W, align_corners); - AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, D, H, W, 3})); - auto grad_theta = base_grid.view({N, D * H * W, 4}) - .transpose(1, 2) - .bmm(grad_grid.view({N, D * H * W, 3})); - return grad_theta.transpose(1, 2); -} - -Tensor affine_grid_generator_backward(const Tensor& grad, IntArrayRef size, - bool align_corners) { - if (size.size() == 4) { - return affine_grid_generator_4D_backward(grad, size[0], size[1], size[2], - size[3], align_corners); - } else { - return affine_grid_generator_5D_backward(grad, size[0], size[1], size[2], - size[3], size[4], align_corners); - } -} -} // namespace mmdetection diff --git a/mmdet/ops/carafe/src/carafe_ext.cpp 
b/mmdet/ops/carafe/src/carafe_ext.cpp index 5bee3dafc114aaaf8db749794e9c2e9b842dbf57..7998ac2cd9a8aa6d0add9244048718caead68a4e 100644 --- a/mmdet/ops/carafe/src/carafe_ext.cpp +++ b/mmdet/ops/carafe/src/carafe_ext.cpp @@ -22,7 +22,7 @@ int carafe_forward(at::Tensor features, at::Tensor rfeatures, at::Tensor masks, at::Tensor rmasks, int kernel_size, int group_size, int scale_factor, at::Tensor routput, at::Tensor output) { - if (features.type().is_cuda()) { + if (features.device().is_cuda()) { #ifdef WITH_CUDA return carafe_forward_cuda(features, rfeatures, masks, rmasks, kernel_size, group_size, scale_factor, routput, output); @@ -39,7 +39,7 @@ int carafe_backward(at::Tensor top_grad, at::Tensor rfeatures, at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad, at::Tensor rmask_grad, at::Tensor bottom_grad, at::Tensor mask_grad) { - if (top_grad.type().is_cuda()) { + if (top_grad.device().is_cuda()) { #ifdef WITH_CUDA return carafe_backward_cuda(top_grad, rfeatures, masks, kernel_size, group_size, scale_factor, rtop_grad, rbottom_grad_hs, rbottom_grad, diff --git a/mmdet/ops/carafe/src/carafe_naive_ext.cpp b/mmdet/ops/carafe/src/carafe_naive_ext.cpp index 06fe912ad6d502d9cd7387227ae3f4b4a9095c1e..357b8625df8fb18589829bbded26315f9070e743 100644 --- a/mmdet/ops/carafe/src/carafe_naive_ext.cpp +++ b/mmdet/ops/carafe/src/carafe_naive_ext.cpp @@ -18,7 +18,7 @@ int carafe_naive_backward_cuda(at::Tensor top_grad, at::Tensor features, int carafe_naive_forward(at::Tensor features, at::Tensor masks, int kernel_size, int group_size, int scale_factor, at::Tensor output) { - if (features.type().is_cuda()) { + if (features.device().is_cuda()) { #ifdef WITH_CUDA return carafe_naive_forward_cuda(features, masks, kernel_size, group_size, scale_factor, output); @@ -33,7 +33,7 @@ int carafe_naive_backward(at::Tensor top_grad, at::Tensor features, at::Tensor masks, int kernel_size, int group_size, int scale_factor, at::Tensor bottom_grad, at::Tensor mask_grad) { - if (top_grad.type().is_cuda()) { + if (top_grad.device().is_cuda()) { #ifdef WITH_CUDA return carafe_naive_backward_cuda(top_grad, features, masks, kernel_size, group_size, scale_factor, bottom_grad, mask_grad); diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp index 28d890f545166a6c751d801ec03e0111d8da16bf..59b536c027c9407dcb86e2d14099d8d210ddabcb 100644 --- a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp +++ b/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp @@ -24,9 +24,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, at::Tensor rmask_grad, at::Tensor bottom_grad, at::Tensor mask_grad); -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ - AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu index da62755067bbaaa174cfed886789b4f807a2aab6..3a02a20f847734f83567bf80f277a22b67e78c0f 100644 --- a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu +++ b/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu @@ -156,9 +156,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, // one warp per pixel cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( 
- features.type(), "NCHW2NHWC_Feature", ([&] { - const scalar_t *bottom_data = features.data<scalar_t>(); - scalar_t *top_data = rfeatures.data<scalar_t>(); + features.scalar_type(), "NCHW2NHWC_Feature", ([&] { + const scalar_t *bottom_data = features.data_ptr<scalar_t>(); + scalar_t *top_data = rfeatures.data_ptr<scalar_t>(); const int dh = divideUP(channels, kTileDim); const int dw = divideUP(input_height * input_width, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> @@ -167,9 +167,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, bottom_data, top_data); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.type(), "NCHW2NHWC_Masks", ([&] { - const scalar_t *bottom_data = masks.data<scalar_t>(); - scalar_t *top_data = rmasks.data<scalar_t>(); + features.scalar_type(), "NCHW2NHWC_Masks", ([&] { + const scalar_t *bottom_data = masks.data_ptr<scalar_t>(); + scalar_t *top_data = rmasks.data_ptr<scalar_t>(); const int dh = divideUP(mask_channels, kTileDim); const int dw = divideUP(output_height * output_width, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> @@ -178,12 +178,12 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, bottom_data, top_data); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.type(), "CARAFELaucherForward", ([&] { + features.scalar_type(), "CARAFELaucherForward", ([&] { const int num_kernels = batch_size * output_height * output_width * THREADS_PER_PIXEL; - const scalar_t *bottom_data = rfeatures.data<scalar_t>(); - const scalar_t *bottom_masks = rmasks.data<scalar_t>(); - scalar_t *top_data = routput.data<scalar_t>(); + const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>(); + const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>(); + scalar_t *top_data = routput.data_ptr<scalar_t>(); CARAFEForward<scalar_t> <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK), @@ -193,9 +193,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks, output_height, output_width, mask_channels, top_data); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.type(), "NHWC2NCHW", ([&] { - const scalar_t *bottom_data = routput.data<scalar_t>(); - scalar_t *top_data = output.data<scalar_t>(); + features.scalar_type(), "NHWC2NCHW", ([&] { + const scalar_t *bottom_data = routput.data_ptr<scalar_t>(); + scalar_t *top_data = output.data_ptr<scalar_t>(); const int dh = divideUP(output_height * output_width, kTileDim); const int dw = divideUP(channels, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> @@ -388,9 +388,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, at::Tensor mask_grad) { cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "NCHW2NHWC_Top_Grad", ([&] { - const scalar_t *bottom_data = top_grad.data<scalar_t>(); - scalar_t *top_data = rtop_grad.data<scalar_t>(); + top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] { + const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>(); + scalar_t *top_data = rtop_grad.data_ptr<scalar_t>(); const int dh = divideUP(channels, kTileDim); const int dw = divideUP(output_height * output_width, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> @@ -400,12 +400,12 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "CARAFELaucherBackward_Feature", ([&] { + top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] { const int num_kernels = batch_size * output_height * 
output_width * THREADS_PER_PIXEL; - const scalar_t *top_diff = rtop_grad.data<scalar_t>(); - const scalar_t *bottom_masks = masks.data<scalar_t>(); - scalar_t *bottom_diff = rbottom_grad_hs.data<scalar_t>(); + const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>(); + const scalar_t *bottom_masks = masks.data_ptr<scalar_t>(); + scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>(); CARAFEBackward_Feature<scalar_t> <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK), @@ -415,11 +415,11 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, output_height, output_width, mask_channels, bottom_diff); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "FeatureSum", ([&] { + top_grad.scalar_type(), "FeatureSum", ([&] { const int num_kernels = batch_size * input_height * input_width * THREADS_PER_PIXEL; - const scalar_t *bottom_diff_hs = rbottom_grad_hs.data<scalar_t>(); - scalar_t *bottom_diff = rbottom_grad.data<scalar_t>(); + const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>(); + scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>(); FeatureSum<scalar_t> <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK), @@ -428,9 +428,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, input_height, input_width, bottom_diff); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "NHWC2NCHW_Bottom_Grad", ([&] { - const scalar_t *bottom_data = rbottom_grad.data<scalar_t>(); - scalar_t *top_data = bottom_grad.data<scalar_t>(); + top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] { + const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>(); + scalar_t *top_data = bottom_grad.data_ptr<scalar_t>(); const int dh = divideUP(input_height * input_width, kTileDim); const int dw = divideUP(channels, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> @@ -440,12 +440,12 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, })); AT_DISPATCH_FLOATING_TYPES( - top_grad.type(), "CARAFELaucherBackward_Mask", ([&] { + top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] { const int num_kernels = batch_size * output_height * output_width * mask_channels * WARP_SIZE; - const scalar_t *top_diff = rtop_grad.data<scalar_t>(); - const scalar_t *bottom_data = rfeatures.data<scalar_t>(); - scalar_t *mask_diff = rmask_grad.data<scalar_t>(); + const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>(); + const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>(); + scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>(); CARAFEBackward_Mask<scalar_t> <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK), @@ -455,9 +455,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures, output_height, output_width, mask_channels, mask_diff); })); AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "NHWC2NCHW_Mask_Grad", ([&] { - const scalar_t *bottom_data = rmask_grad.data<scalar_t>(); - scalar_t *top_data = mask_grad.data<scalar_t>(); + top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] { + const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>(); + scalar_t *top_data = mask_grad.data_ptr<scalar_t>(); const int dh = divideUP(output_height * output_width, kTileDim); const int dw = divideUP(mask_channels, kTileDim); BatchTranspose2DCUDAKernel<scalar_t> diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp index 611f1d114710f6b276455ed58cecaa48a7a634b5..394afd3ad06f88943ae50b022278d55343a1ad62 
100644 --- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp +++ b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp @@ -18,9 +18,9 @@ int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad, const int height, const int width, at::Tensor bottom_grad, at::Tensor mask_grad); -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ - AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu index 3edbae7948128441a2fc4263f7cda22d13a989fc..9cf9855a71c5e58c2ca3e4e369c5ded70a52e8fd 100644 --- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu +++ b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu @@ -76,10 +76,10 @@ int CARAFENAIVEForwardLaucher(const at::Tensor features, const at::Tensor masks, const int width, at::Tensor output) { const int output_size = batch_size * channels * height * width; AT_DISPATCH_FLOATING_TYPES_AND_HALF( - features.type(), "CARAFENAIVELaucherForward", ([&] { - const scalar_t *bottom_data = features.data<scalar_t>(); - const scalar_t *bottom_masks = masks.data<scalar_t>(); - scalar_t *top_data = output.data<scalar_t>(); + features.scalar_type(), "CARAFENAIVELaucherForward", ([&] { + const scalar_t *bottom_data = features.data_ptr<scalar_t>(); + const scalar_t *bottom_masks = masks.data_ptr<scalar_t>(); + scalar_t *top_data = output.data_ptr<scalar_t>(); CARAFENAIVEForward<scalar_t> <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( @@ -152,12 +152,12 @@ int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad, const int output_size = batch_size * channels * height * width; AT_DISPATCH_FLOATING_TYPES_AND_HALF( - top_grad.type(), "CARAFENAIVELaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data<scalar_t>(); - const scalar_t *bottom_data = features.data<scalar_t>(); - const scalar_t *bottom_masks = masks.data<scalar_t>(); - scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); - scalar_t *mask_diff = mask_grad.data<scalar_t>(); + top_grad.scalar_type(), "CARAFENAIVELaucherBackward", ([&] { + const scalar_t *top_diff = top_grad.data_ptr<scalar_t>(); + const scalar_t *bottom_data = features.data_ptr<scalar_t>(); + const scalar_t *bottom_masks = masks.data_ptr<scalar_t>(); + scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>(); + scalar_t *mask_diff = mask_grad.data_ptr<scalar_t>(); CARAFENAIVEBackward<scalar_t> <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp index 8601eb3b276a4cf2105c04bca35e25b4addd8d84..5d9424908ed2dbd4ac3cdb98d13e09287a4d2f2d 100644 --- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp +++ b/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp @@ -63,26 +63,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, at::Tensor weight, int kH, int kW, int dH, int dW, int padH, int padW, int dilationH, int dilationW, int group, int deformable_group) { - AT_CHECK(weight.ndimension() == 4, + TORCH_CHECK(weight.ndimension() == 4, "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, " "but got: %s", weight.ndimension()); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), 
"weight tensor has to be contiguous"); - AT_CHECK(kW > 0 && kH > 0, + TORCH_CHECK(kW > 0 && kH > 0, "kernel size should be greater than zero, but got kH: %d kW: %d", kH, kW); - AT_CHECK((weight.size(2) == kH && weight.size(3) == kW), + TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW), "kernel size should be consistent with weight, ", "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH, kW, weight.size(2), weight.size(3)); - AT_CHECK(dW > 0 && dH > 0, + TORCH_CHECK(dW > 0 && dH > 0, "stride should be greater than zero, but got dH: %d dW: %d", dH, dW); - AT_CHECK( + TORCH_CHECK( dilationW > 0 && dilationH > 0, "dilation should be greater than 0, but got dilationH: %d dilationW: %d", dilationH, dilationW); @@ -98,7 +98,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, dimw++; } - AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", + TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s", ndim); long nInputPlane = weight.size(1) * group; @@ -110,7 +110,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, long outputWidth = (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1; - AT_CHECK(nInputPlane % deformable_group == 0, + TORCH_CHECK(nInputPlane % deformable_group == 0, "input channels must divide deformable group size"); if (outputWidth < 1 || outputHeight < 1) @@ -120,27 +120,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput, nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight, outputWidth); - AT_CHECK(input.size(1) == nInputPlane, + TORCH_CHECK(input.size(1) == nInputPlane, "invalid number of input planes, expected: %d, but got: %d", nInputPlane, input.size(1)); - AT_CHECK((inputHeight >= kH && inputWidth >= kW), + TORCH_CHECK((inputHeight >= kH && inputWidth >= kW), "input image is smaller than kernel"); - AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), + TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth), "invalid spatial size of offset, expected height: %d width: %d, but " "got height: %d width: %d", outputHeight, outputWidth, offset.size(2), offset.size(3)); - AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), + TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW), "invalid number of channels of offset"); if (gradOutput != NULL) { - AT_CHECK(gradOutput->size(dimf) == nOutputPlane, + TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane, "invalid number of gradOutput planes, expected: %d, but got: %d", nOutputPlane, gradOutput->size(dimf)); - AT_CHECK((gradOutput->size(dimh) == outputHeight && + TORCH_CHECK((gradOutput->size(dimh) == outputHeight && gradOutput->size(dimw) == outputWidth), "invalid size of gradOutput, expected height: %d width: %d , but " "got height: %d width: %d", @@ -191,7 +191,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight, long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane, outputHeight, outputWidth}); @@ -298,7 +298,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset, long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), 3, 
"invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset"); gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth}); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, @@ -414,7 +414,7 @@ int deform_conv_backward_parameters_cuda( long outputHeight = (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1; - AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); + TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset"); columns = at::zeros( {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth}, @@ -494,8 +494,8 @@ void modulated_deform_conv_cuda_forward( const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); at::DeviceGuard guard(input.device()); const int batch = input.size(0); @@ -576,8 +576,8 @@ void modulated_deform_conv_cuda_backward( int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); - AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous"); at::DeviceGuard guard(input.device()); const int batch = input.size(0); diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu index e7a26f2e830846f80272bcd8c5ce0def34593c95..98752dccf8c58817ca1a952554dd3f33188a2d34 100644 --- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu +++ b/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu @@ -258,9 +258,9 @@ void deformable_im2col( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.scalar_type(), "deformable_im2col_gpu", ([&] { - const scalar_t *data_im_ = data_im.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - scalar_t *data_col_ = data_col.data<scalar_t>(); + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w, @@ -352,9 +352,9 @@ void deformable_col2im( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "deformable_col2im_gpu", ([&] { - const scalar_t *data_col_ = data_col.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - scalar_t *grad_im_ = grad_im.data<scalar_t>(); + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>(); deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_col_, data_offset_, channels, height, width, ksize_h, @@ -450,10 +450,10 @@ void 
deformable_col2im_coord( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] { - const scalar_t *data_col_ = data_col.data<scalar_t>(); - const scalar_t *data_im_ = data_im.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - scalar_t *grad_offset_ = grad_offset.data<scalar_t>(); + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>(); deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_col_, data_im_, data_offset_, channels, height, width, @@ -780,10 +780,10 @@ void modulated_deformable_im2col_cuda( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] { - const scalar_t *data_im_ = data_im.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - const scalar_t *data_mask_ = data_mask.data<scalar_t>(); - scalar_t *data_col_ = data_col.data<scalar_t>(); + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>(); + scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w, @@ -812,10 +812,10 @@ void modulated_deformable_col2im_cuda( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] { - const scalar_t *data_col_ = data_col.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - const scalar_t *data_mask_ = data_mask.data<scalar_t>(); - scalar_t *grad_im_ = grad_im.data<scalar_t>(); + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>(); + scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>(); modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im, @@ -845,12 +845,12 @@ void modulated_deformable_col2im_coord_cuda( AT_DISPATCH_FLOATING_TYPES_AND_HALF( data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] { - const scalar_t *data_col_ = data_col.data<scalar_t>(); - const scalar_t *data_im_ = data_im.data<scalar_t>(); - const scalar_t *data_offset_ = data_offset.data<scalar_t>(); - const scalar_t *data_mask_ = data_mask.data<scalar_t>(); - scalar_t *grad_offset_ = grad_offset.data<scalar_t>(); - scalar_t *grad_mask_ = grad_mask.data<scalar_t>(); + const scalar_t *data_col_ = data_col.data_ptr<scalar_t>(); + const scalar_t *data_im_ = data_im.data_ptr<scalar_t>(); + const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>(); + const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>(); + scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>(); + scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>(); modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( num_kernels, data_col_, data_im_, data_offset_, 
data_mask_, channels, height_im, width_im, diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp index d7ed3f639ea0a6d6f7f71b3d3bcdd23a54c77499..3c09f998029714bdd3542788bc6da6e4f48e9d0b 100644 --- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp +++ b/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp @@ -33,7 +33,7 @@ void deform_psroi_pooling_cuda_forward( at::Tensor top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); at::DeviceGuard guard(input.device()); const int batch = input.size(0); @@ -59,8 +59,8 @@ void deform_psroi_pooling_cuda_backward( const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { - AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); - AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); + TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous"); + TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous"); at::DeviceGuard guard(input.device()); const int batch = input.size(0); diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu index 05b00d4be618353b404540469bf6118902651ca2..18e3a048d3f48ce6ce86162a354aeaa29ce001a6 100644 --- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu +++ b/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu @@ -290,11 +290,11 @@ void DeformablePSROIPoolForward(const at::Tensor data, AT_DISPATCH_FLOATING_TYPES_AND_HALF( data.scalar_type(), "deformable_psroi_pool_forward", ([&] { - const scalar_t *bottom_data = data.data<scalar_t>(); - const scalar_t *bottom_rois = bbox.data<scalar_t>(); - const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>(); - scalar_t *top_data = out.data<scalar_t>(); - scalar_t *top_count_data = top_count.data<scalar_t>(); + const scalar_t *bottom_data = data.data_ptr<scalar_t>(); + const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>(); + scalar_t *top_data = out.data_ptr<scalar_t>(); + scalar_t *top_count_data = top_count.data_ptr<scalar_t>(); DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width, @@ -341,13 +341,13 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad, AT_DISPATCH_FLOATING_TYPES_AND_HALF( out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] { - const scalar_t *top_diff = out_grad.data<scalar_t>(); - const scalar_t *bottom_data = data.data<scalar_t>(); - const scalar_t *bottom_rois = bbox.data<scalar_t>(); - const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>(); - scalar_t *bottom_data_diff = in_grad.data<scalar_t>(); - scalar_t *bottom_trans_diff = no_trans ? 
NULL : trans_grad.data<scalar_t>(); - const scalar_t *top_count_data = top_count.data<scalar_t>(); + const scalar_t *top_diff = out_grad.data_ptr<scalar_t>(); + const scalar_t *bottom_data = data.data_ptr<scalar_t>(); + const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>(); + const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>(); + scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>(); + scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>(); + const scalar_t *top_count_data = top_count.data_ptr<scalar_t>(); DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width, diff --git a/mmdet/ops/dcn/src/deform_conv_ext.cpp b/mmdet/ops/dcn/src/deform_conv_ext.cpp index 2beaeffcbb0a41364f4440a201f39a0b6d30ffd8..fac60162b699438bf17d32a7660564b45c3b9745 100644 --- a/mmdet/ops/dcn/src/deform_conv_ext.cpp +++ b/mmdet/ops/dcn/src/deform_conv_ext.cpp @@ -54,7 +54,7 @@ int deform_conv_forward(at::Tensor input, at::Tensor weight, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return deform_conv_forward_cuda(input, weight, offset, output, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group, @@ -73,7 +73,7 @@ int deform_conv_backward_input(at::Tensor input, at::Tensor offset, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, int im2col_step) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return deform_conv_backward_input_cuda(input, offset, gradOutput, gradInput, gradOffset, weight, columns, kW, kH, dW, dH, padW, padH, @@ -91,7 +91,7 @@ int deform_conv_backward_parameters( at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH, int padW, int padH, int dilationW, int dilationH, int group, int deformable_group, float scale, int im2col_step) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return deform_conv_backward_parameters_cuda(input, offset, gradOutput, gradWeight, columns, ones, kW, kH, dW, dH, padW, padH, dilationW, @@ -110,7 +110,7 @@ void modulated_deform_conv_forward( const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int deformable_group, const bool with_bias) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return modulated_deform_conv_cuda_forward(input, weight, bias, ones, offset, mask, output, columns, kernel_h, kernel_w, stride_h, @@ -131,7 +131,7 @@ void modulated_deform_conv_backward( int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w, int dilation_h, int dilation_w, int group, int deformable_group, const bool with_bias) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return modulated_deform_conv_cuda_backward(input, weight, bias, ones, offset, mask, columns, grad_input, grad_weight, grad_bias, grad_offset, diff --git a/mmdet/ops/dcn/src/deform_pool_ext.cpp b/mmdet/ops/dcn/src/deform_pool_ext.cpp index f590fabec5f4e998d1090c730c325b92ba01ba81..877064828d5b999f4cd4c3f239540862c16de268 100644 --- a/mmdet/ops/dcn/src/deform_pool_ext.cpp +++ b/mmdet/ops/dcn/src/deform_pool_ext.cpp @@ -31,7 +31,7 @@ void deform_psroi_pooling_forward( at::Tensor 
top_count, const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return deform_psroi_pooling_cuda_forward(input, bbox, trans, out, top_count, no_trans, spatial_scale, output_dim, group_size, pooled_size, @@ -49,7 +49,7 @@ void deform_psroi_pooling_backward( const int no_trans, const float spatial_scale, const int output_dim, const int group_size, const int pooled_size, const int part_size, const int sample_per_part, const float trans_std) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return deform_psroi_pooling_cuda_backward(out_grad, input, bbox, trans, top_count, input_grad, trans_grad, no_trans, spatial_scale, diff --git a/mmdet/ops/grid_sampler/__init__.py b/mmdet/ops/grid_sampler/__init__.py deleted file mode 100644 index 868617a6b3f42049d8b78253bf639f07e47ec981..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .grid_sampler import grid_sample - -__all__ = ['grid_sample'] diff --git a/mmdet/ops/grid_sampler/grid_sampler.py b/mmdet/ops/grid_sampler/grid_sampler.py deleted file mode 100644 index b5c59aa4906966f62f1ac584c8a38dae549b0e2d..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/grid_sampler.py +++ /dev/null @@ -1,100 +0,0 @@ -import torch -import torch.nn.functional as F -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -from . import grid_sampler_ext - - -class _GridSampler(Function): - - @staticmethod - def forward(ctx, input, grid, mode_enum, padding_mode_enum, align_corners): - - ctx.save_for_backward(input, grid) - ctx.mode_enum = mode_enum - ctx.padding_mode_enum = padding_mode_enum - ctx.align_corners = align_corners - - output = grid_sampler_ext.grid_sampler_forward(input, grid, mode_enum, - padding_mode_enum, - align_corners) - - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - input, grid = ctx.saved_tensors - mode_enum = ctx.mode_enum - padding_mode_enum = ctx.padding_mode_enum - align_corners = ctx.align_corners - - grad_input, grad_grid = grid_sampler_ext.grid_sampler_backward( - grad_output, input, grid, mode_enum, padding_mode_enum, - align_corners) - - return grad_input, grad_grid, None, None, None - - -def grid_sample(input, - grid, - mode='bilinear', - padding_mode='zeros', - align_corners=False): - if torch.__version__ >= '1.3': - return F.grid_sample(input, grid, mode, padding_mode, align_corners) - elif align_corners: - return F.grid_sample(input, grid, mode, padding_mode) - else: - - # use self-compiled grid_sampler to support align_corners=False - - assert mode in ['bilinear', 'nearest'], \ - 'expected mode to be bilinear or nearest, but got: {}'.format(mode) - - assert padding_mode in ['zeros', 'border', 'reflection'], \ - 'expected padding_mode to be zeros, border, or reflection, ' \ - 'but got: {}'.format(padding_mode) - - if mode == 'bilinear': - mode_enum = 0 - else: - mode_enum = 1 - - if padding_mode == 'zeros': - padding_mode_enum = 0 - elif padding_mode == 'border': - padding_mode_enum = 1 - else: - padding_mode_enum = 2 - - # shape check - assert input.device == grid.device, \ - 'expected input and grid to be on same device, ' \ - 'but input is on {} and grid is on {}'.format( - input.device, grid.device) - assert 
input.dtype == grid.dtype, \ - 'expected input and grid to have the same dtype, ' \ - 'but input has {} and grid has {}'.format( - input.dtype, grid.dtype) - assert input.dim() == 4 or input.dim() == 5, \ - 'expected 4D or 5D input and grid with same number of dimensions' \ - 'but got input with sizes {} and grid with sizes {}'.format( - input.size(), grid.size()) - assert input.size(0) == grid.size(0), \ - 'expected input and grid to have the same batch size, ' \ - 'but got input with sizes {} and grid with sizes {}'.format( - input.size(), grid.size()) - assert grid.size(-1) == input.dim() - 2, \ - 'expected grid to have size {} in last {} dimension, ' \ - 'but got grid with sizes '.format( - input.dim() - 2, grid.size()) - for i in range(2, input.dim()): - assert input.size(i) > 0, \ - 'expected input to have non-empty spatial dimensions, ' \ - 'but input has sizes {} with dimension {} being empty'.format( - input.sizes(), i) - - return _GridSampler.apply(input, grid, mode_enum, padding_mode_enum, - align_corners) diff --git a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp b/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp deleted file mode 100644 index cf1776ed1d7de573c97dd3254ef08d2352ca4377..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp +++ /dev/null @@ -1,692 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp - -#include <torch/extension.h> -#include "grid_sampler_cpu.h" -#include <ATen/ATen.h> -#include <ATen/Device.h> -#include <ATen/NativeFunctions.h> -#include <c10/core/Layout.h> -#include <c10/util/Exception.h> - -#ifdef _OPENMP -#include <omp.h> -#endif - -namespace mmdetection { - -using namespace at; -using mmdetection::detail::GridSamplerInterpolation; -using mmdetection::detail::GridSamplerPadding; - -namespace { - - template<typename scalar_t> - Tensor grid_sampler_2d_forward_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode, - bool align_corners) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - auto output = at::empty({N, C, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - int64_t out_sH = output.stride(2); - int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data<scalar_t>(); - scalar_t *out_ptr = output.data<scalar_t>(); - scalar_t *grid_ptr = grid.data<scalar_t>(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NHW; - scalar_t iy = grid_ptr_NHW[grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = 
grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_nw = static_cast<int64_t>(std::floor(ix)); - int64_t iy_nw = static_cast<int64_t>(std::floor(iy)); - - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy) ; - scalar_t ne = (ix - ix_sw) * (iy_sw - iy) ; - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCHW = static_cast<scalar_t>(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast<int64_t>(std::round(ix)); - int64_t iy_nearest = static_cast<int64_t>(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast<scalar_t>(0); - } - } - } - } - } - } - - return output; - } - - template<typename scalar_t> - Tensor grid_sampler_3d_forward_cpu_impl(const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode, - bool align_corners) { - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - auto output = at::empty({N, C, out_D, out_H, out_W}, input.options()); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t out_sN = output.stride(0); - int64_t out_sC = output.stride(1); - 
int64_t out_sD = output.stride(2); - int64_t out_sH = output.stride(3); - int64_t out_sW = output.stride(4); - scalar_t *inp_ptr = input.data<scalar_t>(); - scalar_t *out_ptr = output.data<scalar_t>(); - scalar_t *grid_ptr = grid.data<scalar_t>(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast<int64_t>(std::floor(ix)); - int64_t iy_tnw = static_cast<int64_t>(std::floor(iy)); - int64_t iz_tnw = static_cast<int64_t>(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - // calculate bilinear weighted pixel value and set output pixel - scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast<scalar_t>(0); - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * 
inp_sH + ix_tne * inp_sW] * tne; - } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast<int64_t>(std::round(ix)); - int64_t iy_nearest = static_cast<int64_t>(std::round(iy)); - int64_t iz_nearest = static_cast<int64_t>(std::round(iz)); - - // assign nearest neighor pixel value to output pixel - scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; - for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { - if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCDHW = static_cast<scalar_t>(0); - } - } - } - } - } - } - } - return output; - } - - template<typename scalar_t> - std::tuple<Tensor, Tensor> - grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode, - bool align_corners) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. 
- if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_H = input.size(2); - int64_t inp_W = input.size(3); - int64_t out_H = grid.size(1); - int64_t out_W = grid.size(2); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sH = input.stride(2); - int64_t inp_sW = input.stride(3); - int64_t grid_sN = grid.stride(0); - int64_t grid_sH = grid.stride(1); - int64_t grid_sW = grid.stride(2); - int64_t grid_sCoor = grid.stride(3); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sH = grad_output.stride(2); - int64_t gOut_sW = grad_output.stride(3); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sH = grad_input.stride(2); - int64_t gInp_sW = grad_input.stride(3); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data<scalar_t>(); - scalar_t *grid_ptr = grid.data<scalar_t>(); - scalar_t *gOut_ptr = grad_output.data<scalar_t>(); - scalar_t *gInp_ptr = grad_input.data<scalar_t>(); - scalar_t *gGrid_ptr = grad_grid.data<scalar_t>(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NHW; - scalar_t iy = grid_ptr_NHW[grid_sCoor]; - - // multipliers for gradients on ix, iy, and iz - scalar_t gix_mult, giy_mult; - ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult); - iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_nw = static_cast<int64_t>(std::floor(ix)); - int64_t iy_nw = static_cast<int64_t>(std::floor(iy)); - - int64_t ix_ne = ix_nw + 1; - int64_t iy_ne = iy_nw; - - int64_t ix_sw = ix_nw; - int64_t iy_sw = iy_nw + 1; - - int64_t ix_se = ix_nw + 1; - int64_t iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy) ; - scalar_t ne = (ix - ix_sw) * (iy_sw - iy) ; - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, 
inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // assuming grad_grid is contiguous - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast<int64_t>(std::round(ix)); - int64_t iy_nearest = static_cast<int64_t>(std::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, - gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - - template<typename scalar_t> - std::tuple<Tensor, Tensor> - grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output, - const Tensor& input, const Tensor& grid, - GridSamplerInterpolation interpolation_mode, - GridSamplerPadding padding_mode, - bool align_corners) { - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - // If interpolation mode is Nearest, then grad_grid is not filled in the - // loop below. 
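The 2-D backward implementation just finished accumulates grad_input through safe_add_2d with the four bilinear weights and builds grad_grid from the corner-value differences, scaled by gix_mult/giy_mult; gGrid_ptr_NHW[0] and [1] end up holding the gradient with respect to the x and y grid coordinates. At the Python level both gradients fall out of autograd on the native operator. A small sketch, with arbitrary shapes:

import torch
import torch.nn.functional as F

inp = torch.randn(1, 2, 8, 8, dtype=torch.float64, requires_grad=True)
grid = (torch.rand(1, 4, 4, 2, dtype=torch.float64) * 2 - 1).requires_grad_()

out = F.grid_sample(inp, grid, mode='bilinear',
                    padding_mode='zeros', align_corners=False)
# grad_input matches the safe_add_2d scatter; grad_grid matches gGrid_ptr_NHW.
grad_input, grad_grid = torch.autograd.grad(out.sum(), (inp, grid))
print(grad_input.shape, grad_grid.shape)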
- if (interpolation_mode == GridSamplerInterpolation::Nearest) { - grad_grid.zero_(); - } - int64_t N = input.size(0); - int64_t C = input.size(1); - int64_t inp_D = input.size(2); - int64_t inp_H = input.size(3); - int64_t inp_W = input.size(4); - int64_t out_D = grid.size(1); - int64_t out_H = grid.size(2); - int64_t out_W = grid.size(3); - int64_t inp_sN = input.stride(0); - int64_t inp_sC = input.stride(1); - int64_t inp_sD = input.stride(2); - int64_t inp_sH = input.stride(3); - int64_t inp_sW = input.stride(4); - int64_t grid_sN = grid.stride(0); - int64_t grid_sD = grid.stride(1); - int64_t grid_sH = grid.stride(2); - int64_t grid_sW = grid.stride(3); - int64_t grid_sCoor = grid.stride(4); - int64_t gOut_sN = grad_output.stride(0); - int64_t gOut_sC = grad_output.stride(1); - int64_t gOut_sD = grad_output.stride(2); - int64_t gOut_sH = grad_output.stride(3); - int64_t gOut_sW = grad_output.stride(4); - int64_t gInp_sN = grad_input.stride(0); - int64_t gInp_sC = grad_input.stride(1); - int64_t gInp_sD = grad_input.stride(2); - int64_t gInp_sH = grad_input.stride(3); - int64_t gInp_sW = grad_input.stride(4); - int64_t gGrid_sN = grad_grid.stride(0); - int64_t gGrid_sW = grad_grid.stride(3); - scalar_t *inp_ptr = input.data<scalar_t>(); - scalar_t *grid_ptr = grid.data<scalar_t>(); - scalar_t *gOut_ptr = grad_output.data<scalar_t>(); - scalar_t *gInp_ptr = grad_input.data<scalar_t>(); - scalar_t *gGrid_ptr = grad_grid.data<scalar_t>(); - // loop over each output pixel - #ifdef _OPENMP - #pragma omp parallel for - #endif - for (int64_t n = 0; n < N; ++n) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; - scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; - for (int64_t d = 0; d < out_D; ++d) { - for (int64_t h = 0; h < out_H; ++h) { - for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { - // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; - scalar_t ix = *grid_ptr_NDHW; - scalar_t iy = grid_ptr_NDHW[grid_sCoor]; - scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; - - // multipliers for gradients on ix, iy, and iz - scalar_t gix_mult, giy_mult, giz_mult; - ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult); - iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult); - iz = grid_sampler_compute_source_index_set_grad(iz, inp_D, padding_mode, align_corners, &giz_mult); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int64_t ix_tnw = static_cast<int64_t>(std::floor(ix)); - int64_t iy_tnw = static_cast<int64_t>(std::floor(iy)); - int64_t iz_tnw = static_cast<int64_t>(std::floor(iz)); - - int64_t ix_tne = ix_tnw + 1; - int64_t iy_tne = iy_tnw; - int64_t iz_tne = iz_tnw; - - int64_t ix_tsw = ix_tnw; - int64_t iy_tsw = iy_tnw + 1; - int64_t iz_tsw = iz_tnw; - - int64_t ix_tse = ix_tnw + 1; - int64_t iy_tse = iy_tnw + 1; - int64_t iz_tse = iz_tnw; - - int64_t ix_bnw = ix_tnw; - int64_t iy_bnw = iy_tnw; - int64_t iz_bnw = iz_tnw + 1; - - int64_t ix_bne = ix_tnw + 1; - int64_t iy_bne = iy_tnw; - int64_t iz_bne = iz_tnw + 1; - - int64_t ix_bsw = ix_tnw; - int64_t iy_bsw = iy_tnw + 1; - int64_t iz_bsw = iz_tnw + 1; - - int64_t ix_bse = ix_tnw + 1; - int64_t iy_bse = iy_tnw + 1; - int64_t iz_bse = 
iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0), giz = static_cast<scalar_t>(0); - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz -= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * 
gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // assuming grad_grid is contiguous - gGrid_ptr_NDHW[0] = gix_mult * gix; - gGrid_ptr_NDHW[1] = giy_mult * giy; - gGrid_ptr_NDHW[2] = giz_mult * giz; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int64_t ix_nearest = static_cast<int64_t>(std::round(ix)); - int64_t iy_nearest = static_cast<int64_t>(std::round(iy)); - int64_t iz_nearest = static_cast<int64_t>(std::round(iz)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); - } - } - } - } - } - } - return std::make_tuple(grad_input, grad_grid); - } - -} // namespace - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_forward_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_forward_cpu", [&] { - return grid_sampler_2d_forward_cpu_impl<scalar_t>( - input, grid, static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), align_corners); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_forward_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_forward_cpu", [&] { - return grid_sampler_3d_forward_cpu_impl<scalar_t>( - input, grid, static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), align_corners); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
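One detail in the CPU dispatch functions above: AT_DISPATCH_FLOATING_TYPES covers only float and double, so these kernels never ran in half precision (the CUDA dispatchers later in this diff add half). A hedged caller-side sketch; sample_cpu_safe is a hypothetical helper, not part of the codebase:

import torch
import torch.nn.functional as F

def sample_cpu_safe(feat, grid):
    # The CPU kernels above were compiled for float32/float64 only, so upcast
    # half-precision inputs before sampling and cast the result back.
    orig_dtype = feat.dtype
    if orig_dtype == torch.float16:
        feat, grid = feat.float(), grid.float()
    out = F.grid_sample(feat, grid, mode='bilinear',
                        padding_mode='zeros', align_corners=False)
    return out.to(orig_dtype)

x = torch.randn(1, 3, 8, 8, dtype=torch.float16)
g = torch.rand(1, 4, 4, 2, dtype=torch.float16) * 2 - 1
print(sample_cpu_safe(x, g).dtype)  # torch.float16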
-std::tuple<Tensor, Tensor> -grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { - return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_backward_cpu", [&] { - return grid_sampler_2d_backward_cpu_impl<scalar_t>( - grad_output, input, grid, - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), align_corners); - }); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, bool align_corners) { - return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_backward_cpu", [&] { - return grid_sampler_3d_backward_cpu_impl<scalar_t>( - grad_output, input, grid, - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), align_corners); - }); -} - -} // namespace mmdetection diff --git a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h b/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h deleted file mode 100644 index 3c9ae45063bf212d715abcf54c0bdccdb23958fc..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h +++ /dev/null @@ -1,225 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.h - -#pragma once - -#include <ATen/ATen.h> -#include <ATen/NativeFunctions.h> - -namespace mmdetection { - -namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail - -using detail::GridSamplerInterpolation; -using detail::GridSamplerPadding; - -// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, -// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). -// if align_corners: -1 and +1 get sent to the centers of the corner pixels -// -1 --> 0 -// +1 --> (size - 1) -// scale_factor = (size - 1) / 2 -// if not align_corners: -1 and +1 get sent to the image edges -// -1 --> -0.5 -// +1 --> (size - 1) + 0.5 == size - 0.5 -// scale_factor = size / 2 -template <typename scalar_t> -static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size, - bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1) / 2) * (size - 1); - } else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1) * size - 1) / 2; - } -} - -// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize -// except that it also returns the `d output / d input` via pointer argument -// `grad_in`. -// This is useful in the backward pass of grid_sampler. 
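grid_sampler_unnormalize above is the whole align_corners story in two branches. A quick numeric check of the endpoints stated in the comment, in plain Python (no torch needed); the size value is arbitrary:

def unnormalize(coord, size, align_corners):
    # Mirrors grid_sampler_unnormalize in the deleted header.
    if align_corners:
        return (coord + 1) / 2 * (size - 1)
    return ((coord + 1) * size - 1) / 2

size = 10
print(unnormalize(-1.0, size, True), unnormalize(1.0, size, True))    # 0.0 9.0
print(unnormalize(-1.0, size, False), unnormalize(1.0, size, False))  # -0.5 9.5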
-template <typename scalar_t> -static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size, - bool align_corners, scalar_t *grad_in) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - *grad_in = static_cast<scalar_t>(size - 1) / 2; - return ((coord + 1) / 2) * (size - 1); - } else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - *grad_in = static_cast<scalar_t>(size) / 2; - return ((coord + 1) * size - 1) / 2; - } -} - -// Clips coordinates to between 0 and clip_limit - 1 -template<typename scalar_t> -static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) { - return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0))); -} - -// clip_coordinates_set_grad works similarly to clip_coordinates except that -// it also returns the `d output / d input` via pointer argument `grad_in`. -// This is useful in the backward pass of grid_sampler. -template<typename scalar_t> -static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit, - scalar_t *grad_in) { - if (in < static_cast<scalar_t>(0)) { - *grad_in = static_cast<scalar_t>(0); - return static_cast<scalar_t>(0); - } else { - scalar_t max = static_cast<scalar_t>(clip_limit - 1); - if (in > max) { - *grad_in = static_cast<scalar_t>(0); - return max; - } else { - *grad_in = static_cast<scalar_t>(1); - return in; - } - } -} - -// Reflects coordinates until they fall between low and high (inclusive). -// The bounds are passed as twice their value so that half-integer values -// can be represented as ints. -template<typename scalar_t> -static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low, - int64_t twice_high) { - if (twice_low == twice_high) { - return static_cast<scalar_t>(0); - } - scalar_t min = static_cast<scalar_t>(twice_low) / 2; - scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; - in = std::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - scalar_t extra = std::fmod(in, span); - int flips = static_cast<int>(std::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -// reflect_coordinates_set_grad works similarly to reflect_coordinates except -// that it also returns the `d output / d input` via pointer argument -// `grad_in`. -// This is useful in the backward pass of grid_sampler. -template<typename scalar_t> -static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low, - int64_t twice_high, scalar_t *grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast<scalar_t>(0); - return static_cast<scalar_t>(0); - } - int grad_in_mult_; - scalar_t min = static_cast<scalar_t>(twice_low) / 2; - scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast<scalar_t>(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - // `fmod` returns same sign as `in`, which is positive after the `if` above. 
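reflect_coordinates above folds an out-of-range coordinate back into the valid range, with the bounds passed doubled so that half-integer bounds stay representable as ints. A direct Python port of the same arithmetic, shown only to make the folding concrete (it is a sketch of the deleted helper, not a library API):

import math

def reflect(coord, twice_low, twice_high):
    # Python port of reflect_coordinates from the deleted header.
    if twice_low == twice_high:
        return 0.0
    low = twice_low / 2.0
    span = (twice_high - twice_low) / 2.0
    coord = abs(coord - low)
    extra = math.fmod(coord, span)
    flips = int(coord // span)
    return extra + low if flips % 2 == 0 else span - extra + low

# align_corners=True on a length-5 axis reflects within [0, 4]:
print(reflect(5.5, 0, 2 * (5 - 1)))   # 2.5
print(reflect(-1.0, 0, 2 * (5 - 1)))  # 1.0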
- scalar_t extra = std::fmod(in, span); - int flips = static_cast<int>(std::floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast<scalar_t>(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast<scalar_t>(-grad_in_mult_); - return span - extra + min; - } -} - -// Computes the pixel source index value for a grid coordinate -template <typename scalar_t> -static inline scalar_t grid_sampler_compute_source_index( - scalar_t coord, - int64_t size, - GridSamplerPadding padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2*(size - 1)); - } else { - coord = reflect_coordinates(coord, -1, 2*size - 1); - // when align_corners=False, reflection does not auto clip coords - coord = clip_coordinates(coord, size); - } - } - return coord; -} - -// grid_sampler_compute_source_index_set_grad works similarly to -// grid_sampler_compute_source_index except that it also returns the -// `d output / d input` via pointer argument `grad_in`. -// This is useful in the backward pass of grid_sampler. -template <typename scalar_t> -static inline scalar_t grid_sampler_compute_source_index_set_grad( - scalar_t coord, - int64_t size, - GridSamplerPadding padding_mode, - bool align_corners, - scalar_t *grad_in) { - scalar_t grad_clip, grad_refl; - coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates_set_grad(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); - *grad_in = (*grad_in) * grad_refl; - } else { - coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); - // when align_corners=False, reflection does not auto clip coords - coord = clip_coordinates_set_grad(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - } - return coord; -} - -static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; -} - -template<typename scalar_t> -static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w, - int64_t sH, int64_t sW, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - data[h * sH + w * sW] += delta; - } -} - -template<typename scalar_t> -static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w, - int64_t sD, int64_t sH, int64_t sW, - int64_t D, int64_t H, int64_t W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - data[d * sD + h * sH + w * sW] += delta; - } -} - -} // namespace mmdetection diff --git a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu b/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu deleted file mode 100644 index 2d747a0b897dda1b0a29998a1825832b6b5eb99c..0000000000000000000000000000000000000000 --- 
a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu +++ /dev/null @@ -1,718 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/GridSampler.cu - -#include <ATen/ATen.h> -#include "grid_sampler_cuda.cuh" -#include <ATen/cuda/CUDAContext.h> -#include <ATen/cuda/CUDAApplyUtils.cuh> -#include <ATen/cuda/detail/TensorInfo.cuh> -#include <ATen/cuda/detail/IndexUtils.cuh> -#include <ATen/cuda/detail/KernelUtils.h> -#include <c10/macros/Macros.h> - -namespace mmdetection { - -using namespace at::cuda::detail; - -using mmdetection::detail::GridSamplerInterpolation; -using mmdetection::detail::GridSamplerPadding; - -namespace { - template <typename scalar_t> - C10_LAUNCH_BOUNDS_1(1024) - __global__ void grid_sampler_2d_forward_kernel_cuda( - const int nthreads, - TensorInfo<scalar_t, int> input, - TensorInfo<scalar_t, int> grid, - TensorInfo<scalar_t, int> output, - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, - bool align_corners) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sH = output.strides[2]; - int out_sW = output.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast<int>(::floor(ix)); - int iy_nw = static_cast<int>(::floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - // calculate bilinear weighted pixel value and set output pixel - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - *out_ptr_NCHW = static_cast<scalar_t>(0); - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se; - } - } - } else if (interpolation_mode == 
GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - - // assign nearest neighor pixel value to output pixel - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { - if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { - *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCHW = static_cast<scalar_t>(0); - } - } - } - } - } - - template <typename scalar_t> - C10_LAUNCH_BOUNDS_1(1024) - __global__ void grid_sampler_3d_forward_kernel_cuda( - const int nthreads, - TensorInfo<scalar_t, int> input, - TensorInfo<scalar_t, int> grid, - TensorInfo<scalar_t, int> output, - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, - bool align_corners) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int out_sN = output.strides[0]; - int out_sC = output.strides[1]; - int out_sD = output.strides[2]; - int out_sH = output.strides[3]; - int out_sW = output.strides[4]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners); - iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners); - iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast<int>(::floor(ix)); - int iy_tnw = static_cast<int>(::floor(iy)); - int iz_tnw = static_cast<int>(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - 
ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne - // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse - // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne - // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse - *out_ptr_NCDHW = static_cast<scalar_t>(0); - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne; - } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw; - } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse; - } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse; - } - } - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - int iz_nearest = static_cast<int>(::round(iz)); - - // assign nearest neighor pixel value to output pixel - auto inp_ptr_NC = input.data + n * inp_sN; - auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { - if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { - *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; - } else { - *out_ptr_NCDHW = static_cast<scalar_t>(0); - } - } - } - } - } - - template <typename scalar_t> - C10_LAUNCH_BOUNDS_1(1024) - __global__ void grid_sampler_2d_backward_kernel_cuda( - const int nthreads, - TensorInfo<scalar_t, int> grad_output, - TensorInfo<scalar_t, int> input, - TensorInfo<scalar_t, int> grid, - TensorInfo<scalar_t, int> grad_input, // initialized to zeros - TensorInfo<scalar_t, int> grad_grid, // initialized to empty - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, - bool align_corners) { - - int C = input.sizes[1]; - int inp_H = input.sizes[2]; - int inp_W = input.sizes[3]; - int out_H = grid.sizes[1]; - 
int out_W = grid.sizes[2]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sH = input.strides[2]; - int inp_sW = input.strides[3]; - int grid_sN = grid.strides[0]; - int grid_sH = grid.strides[1]; - int grid_sW = grid.strides[2]; - int grid_sCoor = grid.strides[3]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sH = grad_output.strides[2]; - int gOut_sW = grad_output.strides[3]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sH = grad_input.strides[2]; - int gInp_sW = grad_input.strides[3]; - int gGrid_sW = grad_grid.strides[2]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int n = index / (out_H * out_W); - const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - - // multipliers for gradients on ix and iy - scalar_t gix_mult, giy_mult; - ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult); - iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get NE, NW, SE, SW pixel values from (x, y) - int ix_nw = static_cast<int>(::floor(ix)); - int iy_nw = static_cast<int>(::floor(iy)); - int ix_ne = ix_nw + 1; - int iy_ne = iy_nw; - int ix_sw = ix_nw; - int iy_sw = iy_nw + 1; - int ix_se = ix_nw + 1; - int iy_se = iy_nw + 1; - - // get surfaces to each neighbor: - scalar_t nw = (ix_se - ix) * (iy_se - iy); - scalar_t ne = (ix - ix_sw) * (iy_sw - iy); - scalar_t sw = (ix_ne - ix) * (iy - iy_ne); - scalar_t se = (ix - ix_nw) * (iy - iy_nw); - - scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - scalar_t gOut = *gOut_ptr_NCHW; - - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut); - safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut); - safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut); - safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut); - - // calculate grad_grid - if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) { - scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW]; - gix -= nw_val * (iy_se - iy) * gOut; - giy -= nw_val * (ix_se - ix) * gOut; - } - if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) { - scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW]; - gix += ne_val * (iy_sw - iy) * gOut; - giy -= ne_val * (ix - ix_sw) * gOut; - } - if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) { - scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW]; - gix -= sw_val * (iy - iy_ne) * gOut; - giy += sw_val * (ix_ne - ix) * gOut; - } - if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) { - scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW]; - gix += se_val * (iy - iy_nw) * gOut; - giy += se_val * (ix - ix_nw) * gOut; - } - } - - // assuming grad_grid is contiguous - // thus we can - 
// 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW - // 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] - scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NHW[0] = gix_mult * gix; - gGrid_ptr_NHW[1] = giy_mult * giy; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - for (int c = 0; c < C; ++c, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - // calculate and set grad_input - safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW); - } - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to directly compute gGrid_ptr_NHW - // 2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1] - scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NHW[0] = static_cast<scalar_t>(0); - gGrid_ptr_NHW[1] = static_cast<scalar_t>(0); - } - } - } - - template <typename scalar_t> - C10_LAUNCH_BOUNDS_1(1024) - __global__ void grid_sampler_3d_backward_kernel_cuda( - const int nthreads, - TensorInfo<scalar_t, int> grad_output, - TensorInfo<scalar_t, int> input, - TensorInfo<scalar_t, int> grid, - TensorInfo<scalar_t, int> grad_input, // initialized to zeros - TensorInfo<scalar_t, int> grad_grid, // initialized to empty - const GridSamplerInterpolation interpolation_mode, - const GridSamplerPadding padding_mode, - bool align_corners) { - - int C = input.sizes[1]; - int inp_D = input.sizes[2]; - int inp_H = input.sizes[3]; - int inp_W = input.sizes[4]; - int out_D = grid.sizes[1]; - int out_H = grid.sizes[2]; - int out_W = grid.sizes[3]; - int inp_sN = input.strides[0]; - int inp_sC = input.strides[1]; - int inp_sD = input.strides[2]; - int inp_sH = input.strides[3]; - int inp_sW = input.strides[4]; - int grid_sN = grid.strides[0]; - int grid_sD = grid.strides[1]; - int grid_sH = grid.strides[2]; - int grid_sW = grid.strides[3]; - int grid_sCoor = grid.strides[4]; - int gOut_sN = grad_output.strides[0]; - int gOut_sC = grad_output.strides[1]; - int gOut_sD = grad_output.strides[2]; - int gOut_sH = grad_output.strides[3]; - int gOut_sW = grad_output.strides[4]; - int gInp_sN = grad_input.strides[0]; - int gInp_sC = grad_input.strides[1]; - int gInp_sD = grad_input.strides[2]; - int gInp_sH = grad_input.strides[3]; - int gInp_sW = grad_input.strides[4]; - int gGrid_sW = grad_grid.strides[3]; - - CUDA_KERNEL_LOOP(index, nthreads) { - const int w = index % out_W; - const int h = (index / out_W) % out_H; - const int d = (index / (out_H * out_W)) % out_D; - const int n = index / (out_D * out_H * out_W); - const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW; - - // get the corresponding input x, y, z co-ordinates from grid - scalar_t ix = grid.data[grid_offset]; - scalar_t iy = grid.data[grid_offset + grid_sCoor]; - scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor]; - - // multipliers for gradients on ix, iy, and iz - scalar_t gix_mult, giy_mult, giz_mult; - ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult); - iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult); - iz = grid_sampler_compute_source_index_set_grad(iz, inp_D, 
padding_mode, align_corners, &giz_mult); - - if (interpolation_mode == GridSamplerInterpolation::Bilinear) { - // get corner pixel values from (x, y, z) - // for 4d, we used north-east-south-west - // for 5d, we add top-bottom - int ix_tnw = static_cast<int>(::floor(ix)); - int iy_tnw = static_cast<int>(::floor(iy)); - int iz_tnw = static_cast<int>(::floor(iz)); - - int ix_tne = ix_tnw + 1; - int iy_tne = iy_tnw; - int iz_tne = iz_tnw; - - int ix_tsw = ix_tnw; - int iy_tsw = iy_tnw + 1; - int iz_tsw = iz_tnw; - - int ix_tse = ix_tnw + 1; - int iy_tse = iy_tnw + 1; - int iz_tse = iz_tnw; - - int ix_bnw = ix_tnw; - int iy_bnw = iy_tnw; - int iz_bnw = iz_tnw + 1; - - int ix_bne = ix_tnw + 1; - int iy_bne = iy_tnw; - int iz_bne = iz_tnw + 1; - - int ix_bsw = ix_tnw; - int iy_bsw = iy_tnw + 1; - int iz_bsw = iz_tnw + 1; - - int ix_bse = ix_tnw + 1; - int iy_bse = iy_tnw + 1; - int iz_bse = iz_tnw + 1; - - // get surfaces to each neighbor: - scalar_t tnw = (ix_bse - ix) * (iy_bse - iy) * (iz_bse - iz); - scalar_t tne = (ix - ix_bsw) * (iy_bsw - iy) * (iz_bsw - iz); - scalar_t tsw = (ix_bne - ix) * (iy - iy_bne) * (iz_bne - iz); - scalar_t tse = (ix - ix_bnw) * (iy - iy_bnw) * (iz_bnw - iz); - scalar_t bnw = (ix_tse - ix) * (iy_tse - iy) * (iz - iz_tse); - scalar_t bne = (ix - ix_tsw) * (iy_tsw - iy) * (iz - iz_tsw); - scalar_t bsw = (ix_tne - ix) * (iy - iy_tne) * (iz - iz_tne); - scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); - - scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0), giz = static_cast<scalar_t>(0); - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; - // calculate bilinear weighted pixel value and set output pixel - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { - scalar_t gOut = *gOut_ptr_NCDHW; - - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut); - safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut); - safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut); - safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut); - safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut); - - // calculate grad_grid - if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) { - scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW]; - gix -= tnw_val * (iy_bse - iy) * (iz_bse - iz) * gOut; - giy -= tnw_val * (ix_bse - ix) * (iz_bse - iz) * gOut; - giz -= tnw_val * (ix_bse - ix) * (iy_bse - iy) * gOut; - } - if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) { - scalar_t tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW]; - gix += tne_val * (iy_bsw - iy) * (iz_bsw - iz) * gOut; - giy -= tne_val * (ix - ix_bsw) * (iz_bsw - iz) * gOut; - giz 
-= tne_val * (ix - ix_bsw) * (iy_bsw - iy) * gOut; - } - if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) { - scalar_t tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW]; - gix -= tsw_val * (iy - iy_bne) * (iz_bne - iz) * gOut; - giy += tsw_val * (ix_bne - ix) * (iz_bne - iz) * gOut; - giz -= tsw_val * (ix_bne - ix) * (iy - iy_bne) * gOut; - } - if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) { - scalar_t tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW]; - gix += tse_val * (iy - iy_bnw) * (iz_bnw - iz) * gOut; - giy += tse_val * (ix - ix_bnw) * (iz_bnw - iz) * gOut; - giz -= tse_val * (ix - ix_bnw) * (iy - iy_bnw) * gOut; - } - if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) { - scalar_t bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW]; - gix -= bnw_val * (iy_tse - iy) * (iz - iz_tse) * gOut; - giy -= bnw_val * (ix_tse - ix) * (iz - iz_tse) * gOut; - giz += bnw_val * (ix_tse - ix) * (iy_tse - iy) * gOut; - } - if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) { - scalar_t bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW]; - gix += bne_val * (iy_tsw - iy) * (iz - iz_tsw) * gOut; - giy -= bne_val * (ix - ix_tsw) * (iz - iz_tsw) * gOut; - giz += bne_val * (ix - ix_tsw) * (iy_tsw - iy) * gOut; - } - if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) { - scalar_t bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW]; - gix -= bsw_val * (iy - iy_tne) * (iz - iz_tne) * gOut; - giy += bsw_val * (ix_tne - ix) * (iz - iz_tne) * gOut; - giz += bsw_val * (ix_tne - ix) * (iy - iy_tne) * gOut; - } - if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) { - scalar_t bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW]; - gix += bse_val * (iy - iy_tnw) * (iz - iz_tnw) * gOut; - giy += bse_val * (ix - ix_tnw) * (iz - iz_tnw) * gOut; - giz += bse_val * (ix - ix_tnw) * (iy - iy_tnw) * gOut; - } - } - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW - // 2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] - scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NDHW[0] = gix_mult * gix; - gGrid_ptr_NDHW[1] = giy_mult * giy; - gGrid_ptr_NDHW[2] = giz_mult * giz; - } else if (interpolation_mode == GridSamplerInterpolation::Nearest) { - int ix_nearest = static_cast<int>(::round(ix)); - int iy_nearest = static_cast<int>(::round(iy)); - int iz_nearest = static_cast<int>(::round(iz)); - - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN; - for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { - // calculate and set grad_input - safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest, - gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW); - } - - // assuming grad_grid is contiguous - // thus we can - // 1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW - // 2. 
directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2] - scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW; - gGrid_ptr_NDHW[0] = static_cast<scalar_t>(0); - gGrid_ptr_NDHW[1] = static_cast<scalar_t>(0); - gGrid_ptr_NDHW[2] = static_cast<scalar_t>(0); - } - } - } -} // namespace - -using namespace at; -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_forward_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto output = at::empty({N, input.size(1), H, W}, input.options()); - int count = static_cast<int>(N * H * W); - if (count > 0) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_forward_cuda", [&] { - grid_sampler_2d_forward_kernel_cuda<scalar_t> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - count, - getTensorInfo<scalar_t, int>(input), - getTensorInfo<scalar_t, int>(grid), - getTensorInfo<scalar_t, int>(output), - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), - align_corners); - }); - } - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_forward_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto output = at::empty({N, input.size(1), D, H, W}, input.options()); - int count = static_cast<int>(N * D * H * W); - if (count > 0) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_forward_cuda", [&] { - grid_sampler_3d_forward_kernel_cuda<scalar_t> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - count, - getTensorInfo<scalar_t, int>(input), - getTensorInfo<scalar_t, int>(grid), - getTensorInfo<scalar_t, int>(output), - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), - align_corners); - }); - } - return output; -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners) { - auto N = input.size(0); - auto H = grid.size(1); - auto W = grid.size(2); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int count = static_cast<int>(N * H * W); - if (count > 0) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] { - grid_sampler_2d_backward_kernel_cuda<scalar_t> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - count, - getTensorInfo<scalar_t, int>(grad_output), - getTensorInfo<scalar_t, int>(input), - getTensorInfo<scalar_t, int>(grid), - getTensorInfo<scalar_t, int>(grad_input), - getTensorInfo<scalar_t, int>(grad_grid), - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), - align_corners); - }); - } - return std::make_tuple(grad_input, grad_grid); -} - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
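The dispatch functions above hand the work to the hand-written forward and backward kernels; the usual way such a backward is validated is a double-precision gradcheck against the forward. A sketch of that check using the native operator (sizes are arbitrary; any of the three padding modes could be substituted):

import torch
import torch.nn.functional as F
from torch.autograd import gradcheck

inp = torch.randn(1, 1, 5, 5, dtype=torch.float64, requires_grad=True)
grid = (torch.rand(1, 3, 3, 2, dtype=torch.float64) * 2 - 1).requires_grad_()

ok = gradcheck(
    lambda x, g: F.grid_sample(x, g, mode='bilinear',
                               padding_mode='zeros', align_corners=False),
    (inp, grid))
print(ok)  # True when analytic and numeric gradients agree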
-std::tuple<Tensor, Tensor> -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - auto N = input.size(0); - auto D = grid.size(1); - auto H = grid.size(2); - auto W = grid.size(3); - auto grad_input = at::zeros_like(input); - auto grad_grid = at::empty_like(grid); - int count = static_cast<int>(N * D * H * W); - if (count > 0) { - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] { - grid_sampler_3d_backward_kernel_cuda<scalar_t> - <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>( - count, - getTensorInfo<scalar_t, int>(grad_output), - getTensorInfo<scalar_t, int>(input), - getTensorInfo<scalar_t, int>(grid), - getTensorInfo<scalar_t, int>(grad_input), - getTensorInfo<scalar_t, int>(grad_grid), - static_cast<GridSamplerInterpolation>(interpolation_mode), - static_cast<GridSamplerPadding>(padding_mode), - align_corners); - }); - } - return std::make_tuple(grad_input, grad_grid); -} - -} // namespace mmdetection diff --git a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh b/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh deleted file mode 100644 index a84fa7c076ecd8302aacddf6c350196cc5ce964e..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh +++ /dev/null @@ -1,233 +0,0 @@ -// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/GridSampler.cuh - -#include <ATen/ATen.h> -#include <ATen/NativeFunctions.h> -#include <ATen/cuda/CUDAApplyUtils.cuh> -#include <THC/THCAtomics.cuh> - -namespace mmdetection { - -namespace detail { - - enum class GridSamplerInterpolation {Bilinear, Nearest}; - enum class GridSamplerPadding {Zeros, Border, Reflection}; - -} // namespace detail - -using detail::GridSamplerInterpolation; -using detail::GridSamplerPadding; - -// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value, -// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5). -// if align_corners: -1 and +1 get sent to the centers of the corner pixels -// -1 --> 0 -// +1 --> (size - 1) -// scale_factor = (size - 1) / 2 -// if not align_corners: -1 and +1 get sent to the image edges -// -1 --> -0.5 -// +1 --> (size - 1) + 0.5 == size - 0.5 -// scale_factor = size / 2 -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - return ((coord + 1.f) / 2) * (size - 1); - } else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - return ((coord + 1.f) * size - 1) / 2; - } -} - -// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize -// except that it also returns the `d output / d input` via pointer argument -// `grad_in`. -// This is useful in the backward pass of grid_sampler. 
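Unlike the CPU path earlier in this diff, the CUDA dispatchers use AT_DISPATCH_FLOATING_TYPES_AND_HALF, so they also cover half precision. A small sketch, guarded on CUDA availability and assuming the native operator accepts fp16 inputs the same way these deleted kernels did:

import torch
import torch.nn.functional as F

if torch.cuda.is_available():
    feat = torch.randn(2, 3, 16, 16, device='cuda', dtype=torch.float16)
    grid = torch.rand(2, 8, 8, 2, device='cuda', dtype=torch.float16) * 2 - 1
    out = F.grid_sample(feat, grid, mode='bilinear',
                        padding_mode='zeros', align_corners=False)
    print(out.dtype, out.shape)  # torch.float16 torch.Size([2, 3, 8, 8])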
-template <typename scalar_t> -static __forceinline__ __device__ -scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size, - bool align_corners, scalar_t *grad_in) { - if (align_corners) { - // unnormalize coord from [-1, 1] to [0, size - 1] - *grad_in = static_cast<scalar_t>(size - 1) / 2; - return ((coord + 1.f) / 2) * (size - 1); - } else { - // unnormalize coord from [-1, 1] to [-0.5, size - 0.5] - *grad_in = static_cast<scalar_t>(size) / 2; - return ((coord + 1.f) * size - 1) / 2; - } -} - -// Clips coordinates to between 0 and clip_limit - 1 -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t clip_coordinates(scalar_t in, int clip_limit) { - return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0))); -} - -// clip_coordinates_set_grad works similarly to clip_coordinates except that -// it also returns the `d output / d input` via pointer argument `grad_in`. -// This is useful in the backward pass of grid_sampler. -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) { - if (in < static_cast<scalar_t>(0)) { - *grad_in = static_cast<scalar_t>(0); - return static_cast<scalar_t>(0); - } else { - scalar_t max = static_cast<scalar_t>(clip_limit - 1); - if (in > max) { - *grad_in = static_cast<scalar_t>(0); - return max; - } else { - *grad_in = static_cast<scalar_t>(1); - return in; - } - } -} - -// Reflects coordinates until they fall between low and high (inclusive). -// The bounds are passed as twice their value so that half-integer values -// can be represented as ints. -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) { - if (twice_low == twice_high) { - return static_cast<scalar_t>(0); - } - scalar_t min = static_cast<scalar_t>(twice_low) / 2; - scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; - in = ::fabs(in - min); - // `fmod` returns same sign as `in`, which is positive after the `fabs` above. - scalar_t extra = ::fmod(in, span); - int flips = static_cast<int>(::floor(in / span)); - if (flips % 2 == 0) { - return extra + min; - } else { - return span - extra + min; - } -} - -// reflect_coordinates_set_grad works similarly to reflect_coordinates except -// that it also returns the `d output / d input` via pointer argument -// `grad_in`. -// This is useful in the backward pass of grid_sampler. -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high, - scalar_t *grad_in) { - if (twice_low == twice_high) { - *grad_in = static_cast<scalar_t>(0); - return static_cast<scalar_t>(0); - } - int grad_in_mult_; - scalar_t min = static_cast<scalar_t>(twice_low) / 2; - scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2; - in = in - min; - if (in < static_cast<scalar_t>(0)) { - grad_in_mult_ = -1; - in = -in; - } else { - grad_in_mult_ = 1; - } - // `fmod` returns same sign as `in`, which is positive after the `if` above. 
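clip_coordinates and reflect_coordinates above are what padding_mode='border' and padding_mode='reflection' select at the Python level, while 'zeros' skips coordinate adjustment and relies on the bounds checks in the kernels. A tiny sketch that resolves one deliberately out-of-range sample point under each mode (the printed values are whatever this input yields; only the padding mode differs):

import torch
import torch.nn.functional as F

inp = torch.arange(16., dtype=torch.float32).view(1, 1, 4, 4)
# One sample point deliberately outside [-1, 1] in x; last dim is (x, y).
grid = torch.tensor([[[[1.5, 0.0]]]])  # shape (1, 1, 1, 2)

for mode in ('zeros', 'border', 'reflection'):
    out = F.grid_sample(inp, grid, mode='bilinear',
                        padding_mode=mode, align_corners=True)
    print(mode, out.item())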
- scalar_t extra = ::fmod(in, span); - int flips = static_cast<int>(::floor(in / span)); - if (flips % 2 == 0) { - *grad_in = static_cast<scalar_t>(grad_in_mult_); - return extra + min; - } else { - *grad_in = static_cast<scalar_t>(-grad_in_mult_); - return span - extra + min; - } -} - -// Computes the pixel source index value for a grid coordinate -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t grid_sampler_compute_source_index( - scalar_t coord, - int size, - GridSamplerPadding padding_mode, - bool align_corners) { - coord = grid_sampler_unnormalize(coord, size, align_corners); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates(coord, size); - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates(coord, 0, 2*(size - 1)); - } else { - coord = reflect_coordinates(coord, -1, 2*size - 1); - // when align_corners=False, reflection does not auto clip coords - coord = clip_coordinates(coord, size); - } - } - return coord; -} - -// grid_sampler_compute_source_index_set_grad works similarly to -// grid_sampler_compute_source_index except that it also returns the -// `d output / d input` via pointer argument `grad_in`. -// This is useful in the backward pass of grid_sampler. -template <typename scalar_t> -static __forceinline__ __device__ -scalar_t grid_sampler_compute_source_index_set_grad( - scalar_t coord, - int size, - GridSamplerPadding padding_mode, - bool align_corners, - scalar_t *grad_in) { - scalar_t grad_clip, grad_refl; - coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in); - if (padding_mode == GridSamplerPadding::Border) { - // clip coordinates to image borders - coord = clip_coordinates_set_grad(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_clip; - } else if (padding_mode == GridSamplerPadding::Reflection) { - // reflect coordinates by image borders - if (align_corners) { - coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl); - *grad_in = (*grad_in) * grad_refl; - } else { - coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl); - // when align_corners=False, reflection does not auto clip coords - coord = clip_coordinates_set_grad(coord, size, &grad_clip); - *grad_in = (*grad_in) * grad_refl * grad_clip; - } - } - return coord; -} - -static __forceinline__ __device__ -bool within_bounds_2d(int h, int w, int H, int W) { - return h >= 0 && h < H && w >= 0 && w < W; -} - -static __forceinline__ __device__ -bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { - return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W; -} - -template<typename scalar_t> -static __forceinline__ __device__ -void safe_add_2d(scalar_t *data, int h, int w, - int sH, int sW, int H, int W, - scalar_t delta) { - if (within_bounds_2d(h, w, H, W)) { - atomicAdd(data + h * sH + w * sW, delta); - } -} - -template<typename scalar_t> -static __forceinline__ __device__ -void safe_add_3d(scalar_t *data, int d, int h, int w, - int sD, int sH, int sW, int D, int H, int W, - scalar_t delta) { - if (within_bounds_3d(d, h, w, D, H, W)) { - atomicAdd(data + d * sD + h * sH + w * sW, delta); - } -} - -} // namespace at::mmdetection diff --git a/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp b/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp deleted file mode 100644 index 
7e76a7aab80b738efd5a33317c2b5bb0e3ea5d00..0000000000000000000000000000000000000000 --- a/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#include <torch/extension.h> -#include <ATen/DeviceGuard.h> - -namespace mmdetection { - -using namespace at; - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_forward_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_forward_cpu(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); - -#ifdef WITH_CUDA -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_2d_forward_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -Tensor grid_sampler_3d_forward_cuda(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners); - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. -std::tuple<Tensor, Tensor> -grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode, - bool align_corners); -#endif - -// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ]. 
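With this extension removed (its build entries are also dropped from setup.py further below), callers are expected to rely on the native PyTorch ops instead. A minimal sketch of the replacement call path, assuming torch>=1.3 so that F.affine_grid and F.grid_sample accept align_corners directly; the tensors are only illustrative:

import torch
import torch.nn.functional as F

inp = torch.rand(1, 1, 28, 28)                        # N x C x H x W
theta = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]])  # identity affine, N x 2 x 3
grid = F.affine_grid(theta, list(inp.shape), align_corners=False)  # N x H x W x 2
out = F.grid_sample(inp, grid, mode='bilinear',
                    padding_mode='zeros', align_corners=False)
# an identity grid samples the input back (up to float rounding)
assert torch.allclose(out, inp, atol=1e-5)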
-Tensor grid_sampler_forward(const Tensor& input, const Tensor& grid, - int64_t interpolation_mode, int64_t padding_mode, - bool align_corners) { - if (input.dim() == 4) { - if (input.type().is_cuda()) { -#ifdef WITH_CUDA - return grid_sampler_2d_forward_cuda(input, grid, interpolation_mode, - padding_mode, align_corners); -#else - AT_ERROR("grid_sampler is not compiled with GPU support"); -#endif - } - return grid_sampler_2d_forward_cpu(input, grid, interpolation_mode, - padding_mode, align_corners); - } else { - if (input.type().is_cuda()) { -#ifdef WITH_CUDA - return grid_sampler_3d_forward_cuda(input, grid, interpolation_mode, - padding_mode, align_corners); -#else - AT_ERROR("grid_sampler is not compiled with GPU support"); -#endif - } - return grid_sampler_3d_forward_cpu(input, grid, interpolation_mode, - padding_mode, align_corners); - } -} - -std::tuple<Tensor, Tensor> -grid_sampler_backward(const Tensor& grad_output, const Tensor& input, - const Tensor& grid, int64_t interpolation_mode, - int64_t padding_mode, bool align_corners) { - if (input.dim() == 4) { - if (input.type().is_cuda()) { -#ifdef WITH_CUDA - return grid_sampler_2d_backward_cuda(grad_output, input, grid, - interpolation_mode, padding_mode, align_corners); -#else - AT_ERROR("grid_sampler is not compiled with GPU support"); -#endif - } - return grid_sampler_2d_backward_cpu(grad_output, input, grid, - interpolation_mode, padding_mode, align_corners); - } else { - if (input.type().is_cuda()) { -#ifdef WITH_CUDA - return grid_sampler_3d_backward_cuda(grad_output, input, grid, - interpolation_mode, padding_mode, align_corners); -#else - AT_ERROR("grid_sampler is not compiled with GPU support"); -#endif - } - return grid_sampler_3d_backward_cpu(grad_output, input, grid, - interpolation_mode, padding_mode, align_corners); - } -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("grid_sampler_forward_cuda", &grid_sampler_forward, "grid_sampler_forward"); - m.def("grid_sampler_backward_cuda", &grid_sampler_backward, "grid_sampler_backward"); -} - -} // namespace mmdetection diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp index b2850d916a4862e7c231c3075466796bacb1c952..84bd7c279132c1343e4d80dda50523861bea542c 100644 --- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp +++ b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp @@ -17,9 +17,9 @@ int MaskedCol2imForwardLaucher(const at::Tensor col, const int height, const at::Tensor mask_w_idx, const int mask_cnt, at::Tensor im); -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ - AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu index 81c785bbe41461fa8a4d380dbbef60dbe677cf6a..b8323592f528a714d88417f606abaa564c6c744d 100644 --- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu +++ b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu @@ -59,10 +59,10 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height, AT_DISPATCH_FLOATING_TYPES_AND_HALF( bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] { - const scalar_t *bottom_data_ = 
bottom_data.data<scalar_t>(); - const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>(); - const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>(); - scalar_t *top_data_ = top_data.data<scalar_t>(); + const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>(); + const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>(); + const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>(); + scalar_t *top_data_ = top_data.data_ptr<scalar_t>(); MaskedIm2colForward<scalar_t> <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream() >>>( @@ -99,10 +99,10 @@ int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height, AT_DISPATCH_FLOATING_TYPES_AND_HALF( bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] { - const scalar_t *bottom_data_ = bottom_data.data<scalar_t>(); - const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>(); - const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>(); - scalar_t *top_data_ = top_data.data<scalar_t>(); + const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>(); + const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>(); + const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>(); + scalar_t *top_data_ = top_data.data_ptr<scalar_t>(); MaskedCol2imForward<scalar_t> <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp b/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp index 5bf60be580edc682e2f451ce92a17b888a7fa10e..39058ad77552966092dfbf729cc8c8fb14c98a06 100644 --- a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp +++ b/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp @@ -19,7 +19,7 @@ int masked_im2col_forward(const at::Tensor im, const at::Tensor mask_h_idx, const at::Tensor mask_w_idx, const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, at::Tensor col) { - if (im.type().is_cuda()) { + if (im.device().is_cuda()) { #ifdef WITH_CUDA return masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, kernel_h, kernel_w, pad_h, pad_w, col); @@ -34,7 +34,7 @@ int masked_col2im_forward(const at::Tensor col, const at::Tensor mask_h_idx, const at::Tensor mask_w_idx, int height, int width, int channels, at::Tensor im) { - if (col.type().is_cuda()) { + if (col.device().is_cuda()) { #ifdef WITH_CUDA return masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, height, width, channels, im); diff --git a/mmdet/ops/nms/src/cpu/nms_cpu.cpp b/mmdet/ops/nms/src/cpu/nms_cpu.cpp index 4d11abec7e69bf46711115a62daebebb95c54e9a..aa652ea396c9533ec8b2bcd3b076b9496041c4d1 100644 --- a/mmdet/ops/nms/src/cpu/nms_cpu.cpp +++ b/mmdet/ops/nms/src/cpu/nms_cpu.cpp @@ -6,7 +6,7 @@ template <typename scalar_t> at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { - AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); @@ -26,13 +26,13 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) { at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); - auto suppressed = suppressed_t.data<uint8_t>(); - auto order = order_t.data<int64_t>(); - auto x1 = x1_t.data<scalar_t>(); - auto y1 = y1_t.data<scalar_t>(); - auto x2 = x2_t.data<scalar_t>(); - auto y2 = y2_t.data<scalar_t>(); - auto areas = areas_t.data<scalar_t>(); + auto suppressed = suppressed_t.data_ptr<uint8_t>(); + 
auto order = order_t.data_ptr<int64_t>(); + auto x1 = x1_t.data_ptr<scalar_t>(); + auto y1 = y1_t.data_ptr<scalar_t>(); + auto x2 = x2_t.data_ptr<scalar_t>(); + auto y2 = y2_t.data_ptr<scalar_t>(); + auto areas = areas_t.data_ptr<scalar_t>(); for (int64_t _i = 0; _i < ndets; _i++) { auto i = order[_i]; @@ -73,7 +73,7 @@ template <typename scalar_t> at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold, const unsigned char method, const float sigma, const float min_score) { - AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); + AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor"); if (dets.numel() == 0) { return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); @@ -88,16 +88,16 @@ at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold, at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t); auto ndets = dets.size(0); - auto x1 = x1_t.data<scalar_t>(); - auto y1 = y1_t.data<scalar_t>(); - auto x2 = x2_t.data<scalar_t>(); - auto y2 = y2_t.data<scalar_t>(); - auto scores = scores_t.data<scalar_t>(); - auto areas = areas_t.data<scalar_t>(); + auto x1 = x1_t.data_ptr<scalar_t>(); + auto y1 = y1_t.data_ptr<scalar_t>(); + auto x2 = x2_t.data_ptr<scalar_t>(); + auto y2 = y2_t.data_ptr<scalar_t>(); + auto scores = scores_t.data_ptr<scalar_t>(); + auto areas = areas_t.data_ptr<scalar_t>(); int64_t pos = 0; at::Tensor inds_t = at::arange(ndets, dets.options()); - auto inds = inds_t.data<scalar_t>(); + auto inds = inds_t.data_ptr<scalar_t>(); for (int64_t i = 0; i < ndets; i++) { auto max_score = scores[i]; diff --git a/mmdet/ops/nms/src/cuda/nms_cuda.cpp b/mmdet/ops/nms/src/cuda/nms_cuda.cpp index 61ca93a273c4075ca1ea20adfb549c7cb5f8e1a6..d46b81669041eb998660bba5f48d0775de586c7c 100644 --- a/mmdet/ops/nms/src/cuda/nms_cuda.cpp +++ b/mmdet/ops/nms/src/cuda/nms_cuda.cpp @@ -1,7 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
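For reference, the greedy scheme implemented by nms_cpu_kernel above can be sketched in a few lines of NumPy. This is an illustration only, assuming the usual rule of suppressing a box whose IoU with an already kept box exceeds the threshold; box areas follow the (x2 - x1) * (y2 - y1) convention visible in this file:

import numpy as np

def nms_ref(dets, thresh):
    # dets: (N, 5) array of [x1, y1, x2, y2, score]; returns kept indices.
    x1, y1, x2, y2, scores = dets.T
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]          # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(xx2 - xx1, 0) * np.maximum(yy2 - yy1, 0)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= thresh]    # keep only boxes below the threshold
    return keep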
#include <torch/extension.h> -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh); diff --git a/mmdet/ops/nms/src/cuda/nms_kernel.cu b/mmdet/ops/nms/src/cuda/nms_kernel.cu index 4a0800f52076ebced136bf99ae3eaa0a6dd8b944..bb6d18abcfa597a4d159580b59c30e82718924d3 100644 --- a/mmdet/ops/nms/src/cuda/nms_kernel.cu +++ b/mmdet/ops/nms/src/cuda/nms_kernel.cu @@ -74,7 +74,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { at::DeviceGuard guard(boxes.device()); using scalar_t = float; - AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor"); auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); @@ -83,7 +83,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock); - scalar_t* boxes_dev = boxes_sorted.data<scalar_t>(); + scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>(); THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState @@ -114,7 +114,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) { memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU)); - int64_t* keep_out = keep.data<int64_t>(); + int64_t* keep_out = keep.data_ptr<int64_t>(); int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { diff --git a/mmdet/ops/nms/src/nms_ext.cpp b/mmdet/ops/nms/src/nms_ext.cpp index 6d95303a315043defb6e48e145caa5b09a241c0d..6c311f2652d6cc097bf5f135c231936929c3d713 100644 --- a/mmdet/ops/nms/src/nms_ext.cpp +++ b/mmdet/ops/nms/src/nms_ext.cpp @@ -13,7 +13,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const float threshold); #endif at::Tensor nms(const at::Tensor& dets, const float threshold){ - if (dets.type().is_cuda()) { + if (dets.device().is_cuda()) { #ifdef WITH_CUDA return nms_cuda(dets, threshold); #else @@ -26,7 +26,7 @@ at::Tensor nms(const at::Tensor& dets, const float threshold){ at::Tensor soft_nms(const at::Tensor& dets, const float threshold, const unsigned char method, const float sigma, const float min_score) { - if (dets.type().is_cuda()) { + if (dets.device().is_cuda()) { AT_ERROR("soft_nms is not implemented on GPU"); } return soft_nms_cpu(dets, threshold, method, sigma, min_score); diff --git a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp b/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp index 2c6b557da24eb19837c8ae8299f1da29dd0e8b80..9e01fe17da0b0693ad874a5e1ad5dbb397817dc5 100644 --- a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp +++ b/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp @@ -357,11 +357,11 @@ at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input, if (output.numel() == 0) return output; - AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "ROIAlign_forward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] { ROIAlignForward<scalar_t>( - output_size, input.contiguous().data<scalar_t>(), spatial_scale, + output_size, input.contiguous().data_ptr<scalar_t>(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, - rois.contiguous().data<scalar_t>(), 
output.data<scalar_t>(), aligned); + rois.contiguous().data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), aligned); }); return output; } @@ -393,11 +393,11 @@ at::Tensor ROIAlignBackwardV2CPULaucher( int h_stride = grad.stride(2); int w_stride = grad.stride(3); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "ROIAlign_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] { ROIAlignBackward<scalar_t>( - grad.numel(), grad.contiguous().data<scalar_t>(), spatial_scale, + grad.numel(), grad.contiguous().data_ptr<scalar_t>(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, - grad_input.data<scalar_t>(), rois.contiguous().data<scalar_t>(), + grad_input.data_ptr<scalar_t>(), rois.contiguous().data_ptr<scalar_t>(), n_stride, c_stride, h_stride, w_stride, aligned); }); return grad_input; diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu index 113fc11047564daff2d701d1459ad0b7ee89b767..7afa33229d84fa04f746fc3477c83dfc19ee01f8 100644 --- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu +++ b/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu @@ -125,9 +125,9 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, const int output_size = num_rois * pooled_height * pooled_width * channels; AT_DISPATCH_FLOATING_TYPES_AND_HALF( features.scalar_type(), "ROIAlignLaucherForward", ([&] { - const scalar_t *bottom_data = features.data<scalar_t>(); - const scalar_t *rois_data = rois.data<scalar_t>(); - scalar_t *top_data = output.data<scalar_t>(); + const scalar_t *bottom_data = features.data_ptr<scalar_t>(); + const scalar_t *rois_data = rois.data_ptr<scalar_t>(); + scalar_t *top_data = output.data_ptr<scalar_t>(); ROIAlignForwardV1<scalar_t> <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, @@ -263,9 +263,9 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, AT_DISPATCH_FLOATING_TYPES_AND_HALF( top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data<scalar_t>(); - const scalar_t *rois_data = rois.data<scalar_t>(); - scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); + const scalar_t *top_diff = top_grad.data_ptr<scalar_t>(); + const scalar_t *rois_data = rois.data_ptr<scalar_t>(); + scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>(); if (sizeof(scalar_t) == sizeof(double)) { fprintf(stderr, "double is not supported\n"); exit(-1); diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu index 9a2f71509334156ba11966923aea0767b0e983a4..0189323cd1ead8a932d358ce79477c66e6c93e5d 100644 --- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu +++ b/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu @@ -297,9 +297,9 @@ at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { RoIAlignForwardV2<scalar_t><<<grid, block, 0, stream>>>( - output_size, input.contiguous().data<scalar_t>(), spatial_scale, + output_size, input.contiguous().data_ptr<scalar_t>(), spatial_scale, channels, height, width, pooled_height, pooled_width, sampling_ratio, - rois.contiguous().data<scalar_t>(), output.data<scalar_t>(), aligned); + rois.contiguous().data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), aligned); }); cudaDeviceSynchronize(); AT_CUDA_CHECK(cudaGetLastError()); @@ -338,10 +338,10 @@ at::Tensor ROIAlignBackwardV2Laucher( 
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { RoIAlignBackwardFeatureV2<scalar_t><<<grid, block, 0, stream>>>( - grad.numel(), grad.contiguous().data<scalar_t>(), num_rois, + grad.numel(), grad.contiguous().data_ptr<scalar_t>(), num_rois, spatial_scale, channels, height, width, pooled_height, pooled_width, - sampling_ratio, grad_input.data<scalar_t>(), - rois.contiguous().data<scalar_t>(), aligned); + sampling_ratio, grad_input.data_ptr<scalar_t>(), + rois.contiguous().data_ptr<scalar_t>(), aligned); }); AT_CUDA_CHECK(cudaGetLastError()); return grad_input; diff --git a/mmdet/ops/roi_align/src/roi_align_ext.cpp b/mmdet/ops/roi_align/src/roi_align_ext.cpp index f01351a8f16c6989ff9916ba06ac5890dbb3fcc8..18add01bba22424343eab57c8263753d7c93498c 100644 --- a/mmdet/ops/roi_align/src/roi_align_ext.cpp +++ b/mmdet/ops/roi_align/src/roi_align_ext.cpp @@ -46,9 +46,9 @@ at::Tensor ROIAlignBackwardV2CPULaucher( const int channels, const int height, const int width, const int sampling_ratio, bool aligned); -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ - AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) @@ -56,7 +56,7 @@ at::Tensor ROIAlignBackwardV2CPULaucher( int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, int sample_num, at::Tensor output) { - if (features.type().is_cuda()) { + if (features.device().is_cuda()) { #ifdef WITH_CUDA CHECK_INPUT(features); CHECK_INPUT(rois); @@ -91,7 +91,7 @@ int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height, int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, int sample_num, at::Tensor bottom_grad) { - if (top_grad.type().is_cuda()) { + if (top_grad.device().is_cuda()) { #ifdef WITH_CUDA CHECK_INPUT(top_grad); CHECK_INPUT(rois); @@ -129,7 +129,7 @@ inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input, const int pooled_height, const int pooled_width, const int sampling_ratio, bool aligned) { - if (input.type().is_cuda()) { + if (input.device().is_cuda()) { #ifdef WITH_CUDA return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned); @@ -146,7 +146,7 @@ inline at::Tensor ROIAlign_backwardV2( const int pooled_height, const int pooled_width, const int batch_size, const int channels, const int height, const int width, const int sampling_ratio, bool aligned) { - if (grad.type().is_cuda()) { + if (grad.device().is_cuda()) { #ifdef WITH_CUDA return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, diff --git a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu index 2e34ff0a10f2a9fc350eb2b658cdca678d1642ee..88fab97fbb4c7b965558158b6c5cbd00b86de97a 100644 --- a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu +++ b/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu @@ -88,10 +88,10 @@ int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, AT_DISPATCH_FLOATING_TYPES_AND_HALF( features.scalar_type(), "ROIPoolLaucherForward", ([&] { - const scalar_t *bottom_data = features.data<scalar_t>(); - const scalar_t *rois_data = 
rois.data<scalar_t>(); - scalar_t *top_data = output.data<scalar_t>(); - int *argmax_data = argmax.data<int>(); + const scalar_t *bottom_data = features.data_ptr<scalar_t>(); + const scalar_t *rois_data = rois.data_ptr<scalar_t>(); + scalar_t *top_data = output.data_ptr<scalar_t>(); + int *argmax_data = argmax.data_ptr<int>(); ROIPoolForward<scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( @@ -132,10 +132,10 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const int output_size = num_rois * pooled_h * pooled_w * channels; AT_DISPATCH_FLOATING_TYPES_AND_HALF( top_grad.scalar_type(), "ROIPoolLaucherBackward", ([&] { - const scalar_t *top_diff = top_grad.data<scalar_t>(); - const scalar_t *rois_data = rois.data<scalar_t>(); - const int *argmax_data = argmax.data<int>(); - scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); + const scalar_t *top_diff = top_grad.data_ptr<scalar_t>(); + const scalar_t *rois_data = rois.data_ptr<scalar_t>(); + const int *argmax_data = argmax.data_ptr<int>(); + scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>(); if (sizeof(scalar_t) == sizeof(double)) { fprintf(stderr, "double is not supported\n"); exit(-1); diff --git a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp b/mmdet/ops/roi_pool/src/roi_pool_ext.cpp index af7bd8553c3ff0e9753c07bb9dcdabc46edbb4a2..27d6b8a5d07c6ff685653905bd802b8cd277cb13 100644 --- a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp +++ b/mmdet/ops/roi_pool/src/roi_pool_ext.cpp @@ -18,9 +18,9 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const int pooled_w, at::Tensor bottom_grad); #endif -#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ - AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") + TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) @@ -29,7 +29,7 @@ int roi_pooling_forward(at::Tensor features, at::Tensor rois, int pooled_height, int pooled_width, float spatial_scale, at::Tensor output, at::Tensor argmax) { - if (features.type().is_cuda()) { + if (features.device().is_cuda()) { #ifdef WITH_CUDA CHECK_INPUT(features); CHECK_INPUT(rois); @@ -64,7 +64,7 @@ int roi_pooling_forward(at::Tensor features, at::Tensor rois, int roi_pooling_backward(at::Tensor top_grad, at::Tensor rois, at::Tensor argmax, float spatial_scale, at::Tensor bottom_grad) { - if (top_grad.type().is_cuda()) { + if (top_grad.device().is_cuda()) { #ifdef WITH_CUDA CHECK_INPUT(top_grad); CHECK_INPUT(rois); diff --git a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu b/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu index 5101a113effcfea16f01426456d1cba9cf0aa2f4..797dcf355ebadcc17bb754614aa8f68698b4c773 100644 --- a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu +++ b/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu @@ -100,8 +100,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, const at::Tensor &targets, const int num_classes, const float gamma, const float alpha) { - AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); - AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor"); 
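The launchers in this file dispatch the sigmoid focal loss kernels; the quantity they compute corresponds to the standard RetinaNet-style sigmoid focal loss. A hedged Python sketch for reference, assuming a 1-based target convention with 0 meaning background (an assumption about this extension, not something shown in the hunk); sigmoid_focal_loss_ref is an illustrative name:

import torch

def sigmoid_focal_loss_ref(logits, targets, gamma=2.0, alpha=0.25):
    # logits: (N, num_classes); targets: (N,) int64 labels in [0, num_classes],
    # where 0 is taken to mean background (no positive class).
    p = torch.sigmoid(logits)
    onehot = torch.zeros_like(logits)
    pos = targets > 0
    onehot[pos, targets[pos] - 1] = 1.0
    log_p = torch.log(p.clamp(min=1e-12))
    log_1p = torch.log((1 - p).clamp(min=1e-12))
    loss_pos = -alpha * (1 - p) ** gamma * log_p
    loss_neg = -(1 - alpha) * p ** gamma * log_1p
    return onehot * loss_pos + (1 - onehot) * loss_neg

loss = sigmoid_focal_loss_ref(torch.randn(8, 80), torch.randint(0, 81, (8,)))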
AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); const int num_samples = logits.size(0); @@ -121,9 +121,9 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits, AT_DISPATCH_FLOATING_TYPES_AND_HALF( logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<scalar_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>( - losses_size, logits.contiguous().data<scalar_t>(), - targets.contiguous().data<int64_t>(), num_classes, gamma, alpha, - num_samples, losses.data<scalar_t>()); + losses_size, logits.contiguous().data_ptr<scalar_t>(), + targets.contiguous().data_ptr<int64_t>(), num_classes, gamma, alpha, + num_samples, losses.data_ptr<scalar_t>()); }); THCudaCheck(cudaGetLastError()); return losses; @@ -135,9 +135,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, const int num_classes, const float gamma, const float alpha) { - AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor"); - AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); - AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor"); + AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor"); + AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor"); + AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); @@ -160,10 +160,10 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits, AT_DISPATCH_FLOATING_TYPES_AND_HALF( logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>( - d_logits_size, logits.contiguous().data<scalar_t>(), - targets.contiguous().data<int64_t>(), - d_losses.contiguous().data<scalar_t>(), num_classes, gamma, alpha, - num_samples, d_logits.data<scalar_t>()); + d_logits_size, logits.contiguous().data_ptr<scalar_t>(), + targets.contiguous().data_ptr<int64_t>(), + d_losses.contiguous().data_ptr<scalar_t>(), num_classes, gamma, alpha, + num_samples, d_logits.data_ptr<scalar_t>()); }); THCudaCheck(cudaGetLastError()); diff --git a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp b/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp index faf2e7872975cacf0953ba3095f80bba85e75003..3d66f3f8ff8f402290c247e489a2cd3fb012dd43 100644 --- a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp +++ b/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp @@ -20,7 +20,7 @@ at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits, const at::Tensor &targets, const int num_classes, const float gamma, const float alpha) { - if (logits.type().is_cuda()) { + if (logits.device().is_cuda()) { #ifdef WITH_CUDA at::DeviceGuard guard(logits.device()); return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma, @@ -37,7 +37,7 @@ at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits, const at::Tensor &d_losses, const int num_classes, const float gamma, const float alpha) { - if (logits.type().is_cuda()) { + if (logits.device().is_cuda()) { #ifdef WITH_CUDA at::DeviceGuard guard(logits.device()); return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses, diff --git a/setup.py b/setup.py index 14af9d1bce63322faebf28a5751f554b02eff009..e70a53110a674ea06a915f2dd5f435c06cd5dde3 100755 --- a/setup.py +++ b/setup.py @@ -282,19 +282,6 @@ if __name__ == '__main__': 'src/cuda/masked_conv2d_cuda.cpp', 'src/cuda/masked_conv2d_kernel.cu' ]), - make_cuda_ext( - 
name='affine_grid_ext', - module='mmdet.ops.affine_grid', - sources=[ - 'src/affine_grid_ext.cpp', 'src/cpu/affine_grid_cpu.cpp' - ]), - make_cuda_ext( - name='grid_sampler_ext', - module='mmdet.ops.grid_sampler', - sources=[ - 'src/grid_sampler_ext.cpp', 'src/cpu/grid_sampler_cpu.cpp' - ], - sources_cuda=['src/cuda/grid_sampler_cuda.cu']), make_cuda_ext( name='carafe_ext', module='mmdet.ops.carafe',