diff --git a/.travis.yml b/.travis.yml
index e14af9c0bcd1b58adefae850430adfea445beb0f..27afe1fd6f5e63a4ec618e2278d54d9d86f265ad 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,11 +2,19 @@ dist: bionic  # ubuntu 18.04
 language: python
 
 python:
-  - "3.5"
   - "3.6"
   - "3.7"
 
-env: CUDA=10.1.105-1 CUDA_SHORT=10.1 UBUNTU_VERSION=ubuntu1804 FORCE_CUDA=1
+env:
+  global:
+    - CUDA=10.1.105-1
+    - CUDA_SHORT=10.1
+    - UBUNTU_VERSION=ubuntu1804
+    - FORCE_CUDA=1
+  matrix:
+    - TORCH=1.3.1 TORCHVISION=0.4.2 CUDA_ARCH=6.0
+    - TORCH=1.5.0 TORCHVISION=0.6.0 CUDA_ARCH=7.0
+
 cache: pip
 
 # Ref to CUDA installation in Travis: https://github.com/jeremad/cuda-travis
@@ -25,7 +33,7 @@ before_install:
 
 install:
   - pip install Pillow==6.2.2  # remove this line when torchvision>=0.5
-  - pip install torch==1.2 torchvision==0.4.0  # TODO: fix CI for pytorch>1.2
+  - pip install torch==${TORCH} torchvision==${TORCHVISION}
   - pip install "git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI"
   - pip install -r requirements.txt
 
@@ -36,7 +44,7 @@ before_script:
 
 script:
   - python setup.py check -m -s
-  - python setup.py build_ext --inplace
+  - TORCH_CUDA_ARCH_LIST="${CUDA_ARCH}" python setup.py build_ext --inplace
   - coverage run --branch --source mmdet -m py.test -v --xdoctest-modules tests mmdet
 
 after_success:
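
Note: Travis workers have no GPU, so the extension build cannot probe a device for its compute capability. Exporting TORCH_CUDA_ARCH_LIST (from the new CUDA_ARCH matrix variable) gives torch's extension builder an explicit architecture instead. A minimal, hedged sketch of that fallback (the script and values are illustrative, not part of this patch):

    # with no visible GPU, torch.utils.cpp_extension falls back to the
    # TORCH_CUDA_ARCH_LIST environment variable when choosing -gencode flags
    import os
    os.environ.setdefault('TORCH_CUDA_ARCH_LIST', '6.0')  # value from the matrix
    import torch.utils.cpp_extension as cpp_ext
    print(cpp_ext.CUDA_HOME)  # sanity check: the CUDA toolkit the CI installed
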
diff --git a/mmdet/models/mask_heads/fcn_mask_head.py b/mmdet/models/mask_heads/fcn_mask_head.py
index 30090d52bfc614adf50debed8ac7d7493446ed75..62849a5824fa6432623043f305fa270295fcb8a9 100644
--- a/mmdet/models/mask_heads/fcn_mask_head.py
+++ b/mmdet/models/mask_heads/fcn_mask_head.py
@@ -2,12 +2,12 @@ import numpy as np
 import pycocotools.mask as mask_util
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch.nn.modules.utils import _pair
 
 from mmdet.core import auto_fp16, force_fp32, mask_target
 from mmdet.ops import Conv2d, ConvModule, build_upsample_layer
 from mmdet.ops.carafe import CARAFEPack
-from mmdet.ops.grid_sampler import grid_sample
 from ..builder import HEADS, build_loss
 
 BYTES_PER_FLOAT = 4
@@ -302,7 +302,7 @@ def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
     gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
     grid = torch.stack([gx, gy], dim=3)
 
-    img_masks = grid_sample(
+    img_masks = F.grid_sample(
         masks.to(dtype=torch.float32), grid, align_corners=False)
 
     if skip_empty:
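
With the CI floor raised to torch >= 1.3, F.grid_sample accepts align_corners directly, so the self-compiled mmdet.ops.grid_sampler shim is no longer needed here. A small, self-contained sketch of the call site's shape contract (tensor sizes are illustrative):

    import torch
    import torch.nn.functional as F

    masks = torch.rand(3, 1, 28, 28)          # N x 1 x H x W mask crops
    grid = torch.rand(3, 64, 64, 2) * 2 - 1   # N x out_H x out_W x 2, in [-1, 1]
    img_masks = F.grid_sample(masks.to(torch.float32), grid, align_corners=False)
    print(img_masks.shape)                    # torch.Size([3, 1, 64, 64])
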
diff --git a/mmdet/ops/affine_grid/__init__.py b/mmdet/ops/affine_grid/__init__.py
deleted file mode 100644
index 8530ade33843475b8bb8b90bae9636323939b78d..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .affine_grid import affine_grid
-
-__all__ = ['affine_grid']
diff --git a/mmdet/ops/affine_grid/affine_grid.py b/mmdet/ops/affine_grid/affine_grid.py
deleted file mode 100644
index 7c24fa7991e04b22abaa08ac15a4fecd2941bf79..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/affine_grid.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.autograd import Function
-from torch.autograd.function import once_differentiable
-
-from . import affine_grid_ext
-
-
-class _AffineGridGenerator(Function):
-
-    @staticmethod
-    def forward(ctx, theta, size, align_corners):
-
-        ctx.save_for_backward(theta)
-        ctx.size = size
-        ctx.align_corners = align_corners
-
-        func = affine_grid_ext.affine_grid_generator_forward
-
-        output = func(theta, size, align_corners)
-
-        return output
-
-    @staticmethod
-    @once_differentiable
-    def backward(ctx, grad_output):
-        theta = ctx.saved_tensors
-        size = ctx.size
-        align_corners = ctx.align_corners
-
-        func = affine_grid_ext.affine_grid_generator_backward
-
-        grad_input = func(grad_output, theta, size, align_corners)
-
-        return grad_input, None, None
-
-
-def affine_grid(theta, size, align_corners=False):
-    if torch.__version__ >= '1.3':
-        return F.affine_grid(theta, size, align_corners)
-    elif align_corners:
-        return F.affine_grid(theta, size)
-    else:
-        # enforce floating point dtype on theta
-        if not theta.is_floating_point():
-            raise ValueError(
-                'Expected theta to have floating point type, but got {}'.
-                format(theta.dtype))
-        # check that shapes and sizes match
-        if len(size) == 4:
-            if theta.dim() != 3 or theta.size(-2) != 2 or theta.size(-1) != 3:
-                raise ValueError(
-                    'Expected a batch of 2D affine matrices of shape Nx2x3 '
-                    'for size {}. Got {}.'.format(size, theta.shape))
-        elif len(size) == 5:
-            if theta.dim() != 3 or theta.size(-2) != 3 or theta.size(-1) != 4:
-                raise ValueError(
-                    'Expected a batch of 3D affine matrices of shape Nx3x4 '
-                    'for size {}. Got {}.'.format(size, theta.shape))
-        else:
-            raise NotImplementedError(
-                'affine_grid only supports 4D and 5D sizes, '
-                'for 2D and 3D affine transforms, respectively. '
-                'Got size {}.'.format(size))
-        if min(size) <= 0:
-            raise ValueError(
-                'Expected non-zero, positive output size. Got {}'.format(size))
-        return _AffineGridGenerator.apply(theta, size, align_corners)
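
This shim existed only to emulate align_corners=False on torch < 1.3; with the version floor raised, F.affine_grid covers both branches. A hedged sketch of the direct replacement (an identity transform, sizes illustrative):

    import torch
    import torch.nn.functional as F

    theta = torch.eye(2, 3).unsqueeze(0)      # N x 2 x 3, as validated above
    grid = F.affine_grid(theta, size=(1, 1, 4, 4), align_corners=False)
    print(grid.shape)                         # torch.Size([1, 4, 4, 2])
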
diff --git a/mmdet/ops/affine_grid/src/affine_grid_ext.cpp b/mmdet/ops/affine_grid/src/affine_grid_ext.cpp
deleted file mode 100644
index cc5c80d780cf85f80e113d1b8857ed829fda9c47..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/src/affine_grid_ext.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AffineGridGenerator.cpp
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <torch/extension.h>
-
-namespace mmdetection {
-
-using namespace at;
-
-Tensor affine_grid_generator_forward(const Tensor &theta, IntArrayRef size,
-                                     bool align_corners);
-
-Tensor affine_grid_generator_backward(const Tensor &grad, IntArrayRef size,
-                                      bool align_corners);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-m.def("affine_grid_generator_forward", &affine_grid_generator_forward,
-"affine_grid_generator_forward");
-m.def("affine_grid_generator_backward", &affine_grid_generator_backward,
-"affine_grid_generator_backward");
-}
-
-}  // namespace mmdetection
diff --git a/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp b/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp
deleted file mode 100644
index 51434604fd6e91f821ea65893190086a040fe7b7..0000000000000000000000000000000000000000
--- a/mmdet/ops/affine_grid/src/cpu/affine_grid_cpu.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AffineGridGenerator.cpp
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <torch/extension.h>
-
-namespace mmdetection {
-
-using namespace at;
-
-at::Tensor linspace_from_neg_one(const Tensor& grid, int64_t num_steps,
-                                 bool align_corners) {
-  if (num_steps <= 1) {
-    return at::tensor(0, grid.options());
-  }
-  auto range = at::linspace(-1, 1, num_steps, grid.options());
-  if (!align_corners) {
-    range = range * (num_steps - 1) / num_steps;
-  }
-  return range;
-}
-
-Tensor make_base_grid_4D(const Tensor& theta, int64_t N, int64_t C, int64_t H,
-                         int64_t W, bool align_corners) {
-  auto base_grid = at::empty({N, H, W, 3}, theta.options());
-
-  base_grid.select(-1, 0).copy_(linspace_from_neg_one(theta, W, align_corners));
-  base_grid.select(-1, 1).copy_(
-      linspace_from_neg_one(theta, H, align_corners).unsqueeze_(-1));
-  base_grid.select(-1, 2).fill_(1);
-
-  return base_grid;
-}
-
-Tensor make_base_grid_5D(const Tensor& theta, int64_t N, int64_t C, int64_t D,
-                         int64_t H, int64_t W, bool align_corners) {
-  auto base_grid = at::empty({N, D, H, W, 4}, theta.options());
-
-  base_grid.select(-1, 0).copy_(linspace_from_neg_one(theta, W, align_corners));
-  base_grid.select(-1, 1).copy_(
-      linspace_from_neg_one(theta, H, align_corners).unsqueeze_(-1));
-  base_grid.select(-1, 2).copy_(linspace_from_neg_one(theta, D, align_corners)
-                                    .unsqueeze_(-1)
-                                    .unsqueeze_(-1));
-  base_grid.select(-1, 3).fill_(1);
-
-  return base_grid;
-}
-
-Tensor affine_grid_generator_4D_forward(const Tensor& theta, int64_t N,
-                                        int64_t C, int64_t H, int64_t W,
-                                        bool align_corners) {
-  Tensor base_grid = make_base_grid_4D(theta, N, C, H, W, align_corners);
-  auto grid = base_grid.view({N, H * W, 3}).bmm(theta.transpose(1, 2));
-  return grid.view({N, H, W, 2});
-}
-
-Tensor affine_grid_generator_5D_forward(const Tensor& theta, int64_t N,
-                                        int64_t C, int64_t D, int64_t H,
-                                        int64_t W, bool align_corners) {
-  Tensor base_grid = make_base_grid_5D(theta, N, C, D, H, W, align_corners);
-  auto grid = base_grid.view({N, D * H * W, 4}).bmm(theta.transpose(1, 2));
-  return grid.view({N, D, H, W, 3});
-}
-
-Tensor affine_grid_generator_forward(const Tensor& theta, IntArrayRef size,
-                                     bool align_corners) {
-  if (size.size() == 4) {
-    return affine_grid_generator_4D_forward(theta, size[0], size[1], size[2],
-                                            size[3], align_corners);
-  } else {
-    return affine_grid_generator_5D_forward(theta, size[0], size[1], size[2],
-                                            size[3], size[4], align_corners);
-  }
-}
-
-Tensor affine_grid_generator_4D_backward(const Tensor& grad_grid, int64_t N,
-                                         int64_t C, int64_t H, int64_t W,
-                                         bool align_corners) {
-  auto base_grid = make_base_grid_4D(grad_grid, N, C, H, W, align_corners);
-  AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, H, W, 2}));
-  auto grad_theta = base_grid.view({N, H * W, 3})
-                        .transpose(1, 2)
-                        .bmm(grad_grid.view({N, H * W, 2}));
-  return grad_theta.transpose(1, 2);
-}
-
-Tensor affine_grid_generator_5D_backward(const Tensor& grad_grid, int64_t N,
-                                         int64_t C, int64_t D, int64_t H,
-                                         int64_t W, bool align_corners) {
-  auto base_grid = make_base_grid_5D(grad_grid, N, C, D, H, W, align_corners);
-  AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, D, H, W, 3}));
-  auto grad_theta = base_grid.view({N, D * H * W, 4})
-                        .transpose(1, 2)
-                        .bmm(grad_grid.view({N, D * H * W, 3}));
-  return grad_theta.transpose(1, 2);
-}
-
-Tensor affine_grid_generator_backward(const Tensor& grad, IntArrayRef size,
-                                      bool align_corners) {
-  if (size.size() == 4) {
-    return affine_grid_generator_4D_backward(grad, size[0], size[1], size[2],
-                                             size[3], align_corners);
-  } else {
-    return affine_grid_generator_5D_backward(grad, size[0], size[1], size[2],
-                                             size[3], size[4], align_corners);
-  }
-}
-}  // namespace mmdetection
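
The deleted backward computed grad_theta as base_grid^T . grad_grid (transposed); autograd through F.affine_grid now supplies the same gradient, so no custom Function is required. A quick shape-level sanity sketch:

    import torch
    import torch.nn.functional as F

    theta = torch.eye(2, 3).unsqueeze(0).requires_grad_()
    grid = F.affine_grid(theta, size=(1, 1, 4, 4), align_corners=False)
    grid.sum().backward()                     # equivalent to grad_grid of ones
    print(theta.grad.shape)                   # torch.Size([1, 2, 3])
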
diff --git a/mmdet/ops/carafe/src/carafe_ext.cpp b/mmdet/ops/carafe/src/carafe_ext.cpp
index 5bee3dafc114aaaf8db749794e9c2e9b842dbf57..7998ac2cd9a8aa6d0add9244048718caead68a4e 100644
--- a/mmdet/ops/carafe/src/carafe_ext.cpp
+++ b/mmdet/ops/carafe/src/carafe_ext.cpp
@@ -22,7 +22,7 @@ int carafe_forward(at::Tensor features, at::Tensor rfeatures,
                    at::Tensor masks, at::Tensor rmasks, int kernel_size,
                    int group_size, int scale_factor, at::Tensor routput,
                    at::Tensor output) {
-  if (features.type().is_cuda()) {
+  if (features.device().is_cuda()) {
 #ifdef WITH_CUDA
     return carafe_forward_cuda(features, rfeatures, masks, rmasks, kernel_size,
                                group_size, scale_factor, routput, output);
@@ -39,7 +39,7 @@ int carafe_backward(at::Tensor top_grad, at::Tensor rfeatures,
                     at::Tensor rbottom_grad_hs, at::Tensor rbottom_grad,
                     at::Tensor rmask_grad, at::Tensor bottom_grad,
                     at::Tensor mask_grad) {
-  if (top_grad.type().is_cuda()) {
+  if (top_grad.device().is_cuda()) {
 #ifdef WITH_CUDA
     return carafe_backward_cuda(top_grad, rfeatures, masks, kernel_size,
         group_size, scale_factor, rtop_grad, rbottom_grad_hs, rbottom_grad,
diff --git a/mmdet/ops/carafe/src/carafe_naive_ext.cpp b/mmdet/ops/carafe/src/carafe_naive_ext.cpp
index 06fe912ad6d502d9cd7387227ae3f4b4a9095c1e..357b8625df8fb18589829bbded26315f9070e743 100644
--- a/mmdet/ops/carafe/src/carafe_naive_ext.cpp
+++ b/mmdet/ops/carafe/src/carafe_naive_ext.cpp
@@ -18,7 +18,7 @@ int carafe_naive_backward_cuda(at::Tensor top_grad, at::Tensor features,
 int carafe_naive_forward(at::Tensor features, at::Tensor masks,
                          int kernel_size, int group_size, int scale_factor,
                          at::Tensor output) {
-  if (features.type().is_cuda()) {
+  if (features.device().is_cuda()) {
 #ifdef WITH_CUDA
     return carafe_naive_forward_cuda(features, masks, kernel_size,
         group_size, scale_factor, output);
@@ -33,7 +33,7 @@ int carafe_naive_backward(at::Tensor top_grad, at::Tensor features,
                                at::Tensor masks, int kernel_size,
                                int group_size, int scale_factor,
                                at::Tensor bottom_grad, at::Tensor mask_grad) {
-  if (top_grad.type().is_cuda()) {
+  if (top_grad.device().is_cuda()) {
 #ifdef WITH_CUDA
     return carafe_naive_backward_cuda(top_grad, features, masks, kernel_size,
         group_size, scale_factor, bottom_grad, mask_grad);
diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp
index 28d890f545166a6c751d801ec03e0111d8da16bf..59b536c027c9407dcb86e2d14099d8d210ddabcb 100644
--- a/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp
+++ b/mmdet/ops/carafe/src/cuda/carafe_cuda.cpp
@@ -24,9 +24,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
                           at::Tensor rmask_grad, at::Tensor bottom_grad,
                           at::Tensor mask_grad);
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
diff --git a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu
index da62755067bbaaa174cfed886789b4f807a2aab6..3a02a20f847734f83567bf80f277a22b67e78c0f 100644
--- a/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu
+++ b/mmdet/ops/carafe/src/cuda/carafe_cuda_kernel.cu
@@ -156,9 +156,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks,
   // one warp per pixel
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "NCHW2NHWC_Feature", ([&] {
-        const scalar_t *bottom_data = features.data<scalar_t>();
-        scalar_t *top_data = rfeatures.data<scalar_t>();
+      features.scalar_type(), "NCHW2NHWC_Feature", ([&] {
+        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+        scalar_t *top_data = rfeatures.data_ptr<scalar_t>();
         const int dh = divideUP(channels, kTileDim);
         const int dw = divideUP(input_height * input_width, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
@@ -167,9 +167,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks,
                 bottom_data, top_data);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "NCHW2NHWC_Masks", ([&] {
-        const scalar_t *bottom_data = masks.data<scalar_t>();
-        scalar_t *top_data = rmasks.data<scalar_t>();
+      features.scalar_type(), "NCHW2NHWC_Masks", ([&] {
+        const scalar_t *bottom_data = masks.data_ptr<scalar_t>();
+        scalar_t *top_data = rmasks.data_ptr<scalar_t>();
         const int dh = divideUP(mask_channels, kTileDim);
         const int dw = divideUP(output_height * output_width, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
@@ -178,12 +178,12 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks,
                 bottom_data, top_data);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "CARAFELaucherForward", ([&] {
+      features.scalar_type(), "CARAFELaucherForward", ([&] {
         const int num_kernels =
             batch_size * output_height * output_width * THREADS_PER_PIXEL;
-        const scalar_t *bottom_data = rfeatures.data<scalar_t>();
-        const scalar_t *bottom_masks = rmasks.data<scalar_t>();
-        scalar_t *top_data = routput.data<scalar_t>();
+        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
+        const scalar_t *bottom_masks = rmasks.data_ptr<scalar_t>();
+        scalar_t *top_data = routput.data_ptr<scalar_t>();
 
         CARAFEForward<scalar_t>
             <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK),
@@ -193,9 +193,9 @@ int CARAFEForwardLaucher(const at::Tensor features, const at::Tensor masks,
                 output_height, output_width, mask_channels, top_data);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "NHWC2NCHW", ([&] {
-        const scalar_t *bottom_data = routput.data<scalar_t>();
-        scalar_t *top_data = output.data<scalar_t>();
+      features.scalar_type(), "NHWC2NCHW", ([&] {
+        const scalar_t *bottom_data = routput.data_ptr<scalar_t>();
+        scalar_t *top_data = output.data_ptr<scalar_t>();
         const int dh = divideUP(output_height * output_width, kTileDim);
         const int dw = divideUP(channels, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
@@ -388,9 +388,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
                           at::Tensor mask_grad) {
   cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "NCHW2NHWC_Top_Grad", ([&] {
-        const scalar_t *bottom_data = top_grad.data<scalar_t>();
-        scalar_t *top_data = rtop_grad.data<scalar_t>();
+      top_grad.scalar_type(), "NCHW2NHWC_Top_Grad", ([&] {
+        const scalar_t *bottom_data = top_grad.data_ptr<scalar_t>();
+        scalar_t *top_data = rtop_grad.data_ptr<scalar_t>();
         const int dh = divideUP(channels, kTileDim);
         const int dw = divideUP(output_height * output_width, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
@@ -400,12 +400,12 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
       }));
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "CARAFELaucherBackward_Feature", ([&] {
+      top_grad.scalar_type(), "CARAFELaucherBackward_Feature", ([&] {
         const int num_kernels =
             batch_size * output_height * output_width * THREADS_PER_PIXEL;
-        const scalar_t *top_diff = rtop_grad.data<scalar_t>();
-        const scalar_t *bottom_masks = masks.data<scalar_t>();
-        scalar_t *bottom_diff = rbottom_grad_hs.data<scalar_t>();
+        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
+        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
+        scalar_t *bottom_diff = rbottom_grad_hs.data_ptr<scalar_t>();
 
         CARAFEBackward_Feature<scalar_t>
             <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK),
@@ -415,11 +415,11 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
                 output_height, output_width, mask_channels, bottom_diff);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "FeatureSum", ([&] {
+      top_grad.scalar_type(), "FeatureSum", ([&] {
         const int num_kernels =
             batch_size * input_height * input_width * THREADS_PER_PIXEL;
-        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data<scalar_t>();
-        scalar_t *bottom_diff = rbottom_grad.data<scalar_t>();
+        const scalar_t *bottom_diff_hs = rbottom_grad_hs.data_ptr<scalar_t>();
+        scalar_t *bottom_diff = rbottom_grad.data_ptr<scalar_t>();
 
         FeatureSum<scalar_t>
             <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK),
@@ -428,9 +428,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
                 input_height, input_width, bottom_diff);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "NHWC2NCHW_Bottom_Grad", ([&] {
-        const scalar_t *bottom_data = rbottom_grad.data<scalar_t>();
-        scalar_t *top_data = bottom_grad.data<scalar_t>();
+      top_grad.scalar_type(), "NHWC2NCHW_Bottom_Grad", ([&] {
+        const scalar_t *bottom_data = rbottom_grad.data_ptr<scalar_t>();
+        scalar_t *top_data = bottom_grad.data_ptr<scalar_t>();
         const int dh = divideUP(input_height * input_width, kTileDim);
         const int dw = divideUP(channels, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
@@ -440,12 +440,12 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
       }));
 
   AT_DISPATCH_FLOATING_TYPES(
-      top_grad.type(), "CARAFELaucherBackward_Mask", ([&] {
+      top_grad.scalar_type(), "CARAFELaucherBackward_Mask", ([&] {
         const int num_kernels = batch_size * output_height * output_width *
                                 mask_channels * WARP_SIZE;
-        const scalar_t *top_diff = rtop_grad.data<scalar_t>();
-        const scalar_t *bottom_data = rfeatures.data<scalar_t>();
-        scalar_t *mask_diff = rmask_grad.data<scalar_t>();
+        const scalar_t *top_diff = rtop_grad.data_ptr<scalar_t>();
+        const scalar_t *bottom_data = rfeatures.data_ptr<scalar_t>();
+        scalar_t *mask_diff = rmask_grad.data_ptr<scalar_t>();
 
         CARAFEBackward_Mask<scalar_t>
             <<<at::cuda::ATenCeilDiv(num_kernels, THREADS_PER_BLOCK),
@@ -455,9 +455,9 @@ int CARAFEBackwardLaucher(const at::Tensor top_grad, const at::Tensor rfeatures,
                 output_height, output_width, mask_channels, mask_diff);
       }));
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "NHWC2NCHW_Mask_Grad", ([&] {
-        const scalar_t *bottom_data = rmask_grad.data<scalar_t>();
-        scalar_t *top_data = mask_grad.data<scalar_t>();
+      top_grad.scalar_type(), "NHWC2NCHW_Mask_Grad", ([&] {
+        const scalar_t *bottom_data = rmask_grad.data_ptr<scalar_t>();
+        scalar_t *top_data = mask_grad.data_ptr<scalar_t>();
         const int dh = divideUP(output_height * output_width, kTileDim);
         const int dw = divideUP(mask_channels, kTileDim);
         BatchTranspose2DCUDAKernel<scalar_t>
diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp
index 611f1d114710f6b276455ed58cecaa48a7a634b5..394afd3ad06f88943ae50b022278d55343a1ad62 100644
--- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp
+++ b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda.cpp
@@ -18,9 +18,9 @@ int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad,
                                const int height, const int width,
                                at::Tensor bottom_grad, at::Tensor mask_grad);
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDA tensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
diff --git a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu
index 3edbae7948128441a2fc4263f7cda22d13a989fc..9cf9855a71c5e58c2ca3e4e369c5ded70a52e8fd 100644
--- a/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu
+++ b/mmdet/ops/carafe/src/cuda/carafe_naive_cuda_kernel.cu
@@ -76,10 +76,10 @@ int CARAFENAIVEForwardLaucher(const at::Tensor features, const at::Tensor masks,
                               const int width, at::Tensor output) {
   const int output_size = batch_size * channels * height * width;
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      features.type(), "CARAFENAIVELaucherForward", ([&] {
-        const scalar_t *bottom_data = features.data<scalar_t>();
-        const scalar_t *bottom_masks = masks.data<scalar_t>();
-        scalar_t *top_data = output.data<scalar_t>();
+      features.scalar_type(), "CARAFENAIVELaucherForward", ([&] {
+        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
+        scalar_t *top_data = output.data_ptr<scalar_t>();
 
         CARAFENAIVEForward<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
@@ -152,12 +152,12 @@ int CARAFENAIVEBackwardLaucher(const at::Tensor top_grad,
   const int output_size = batch_size * channels * height * width;
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
-      top_grad.type(), "CARAFENAIVELaucherBackward", ([&] {
-        const scalar_t *top_diff = top_grad.data<scalar_t>();
-        const scalar_t *bottom_data = features.data<scalar_t>();
-        const scalar_t *bottom_masks = masks.data<scalar_t>();
-        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
-        scalar_t *mask_diff = mask_grad.data<scalar_t>();
+      top_grad.scalar_type(), "CARAFENAIVELaucherBackward", ([&] {
+        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
+        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+        const scalar_t *bottom_masks = masks.data_ptr<scalar_t>();
+        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
+        scalar_t *mask_diff = mask_grad.data_ptr<scalar_t>();
 
         CARAFENAIVEBackward<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp
index 8601eb3b276a4cf2105c04bca35e25b4addd8d84..5d9424908ed2dbd4ac3cdb98d13e09287a4d2f2d 100644
--- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp
+++ b/mmdet/ops/dcn/src/cuda/deform_conv_cuda.cpp
@@ -63,26 +63,26 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
                  at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
                  int padW, int dilationH, int dilationW, int group,
                  int deformable_group) {
-  AT_CHECK(weight.ndimension() == 4,
+  TORCH_CHECK(weight.ndimension() == 4,
            "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
            "but got: %s",
            weight.ndimension());
 
-  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
 
-  AT_CHECK(kW > 0 && kH > 0,
+  TORCH_CHECK(kW > 0 && kH > 0,
            "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
            kW);
 
-  AT_CHECK((weight.size(2) == kH && weight.size(3) == kW),
+  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
            "kernel size should be consistent with weight, ",
            "but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
            kW, weight.size(2), weight.size(3));
 
-  AT_CHECK(dW > 0 && dH > 0,
+  TORCH_CHECK(dW > 0 && dH > 0,
            "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
 
-  AT_CHECK(
+  TORCH_CHECK(
       dilationW > 0 && dilationH > 0,
       "dilation should be greater than 0, but got dilationH: %d dilationW: %d",
       dilationH, dilationW);
@@ -98,7 +98,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
     dimw++;
   }
 
-  AT_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
+  TORCH_CHECK(ndim == 3 || ndim == 4, "3D or 4D input tensor expected but got: %s",
            ndim);
 
   long nInputPlane = weight.size(1) * group;
@@ -110,7 +110,7 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
   long outputWidth =
       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
 
-  AT_CHECK(nInputPlane % deformable_group == 0,
+  TORCH_CHECK(nInputPlane % deformable_group == 0,
            "input channels must divide deformable group size");
 
   if (outputWidth < 1 || outputHeight < 1)
@@ -120,27 +120,27 @@ void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
         nInputPlane, inputHeight, inputWidth, nOutputPlane, outputHeight,
         outputWidth);
 
-  AT_CHECK(input.size(1) == nInputPlane,
+  TORCH_CHECK(input.size(1) == nInputPlane,
            "invalid number of input planes, expected: %d, but got: %d",
            nInputPlane, input.size(1));
 
-  AT_CHECK((inputHeight >= kH && inputWidth >= kW),
+  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
            "input image is smaller than kernel");
 
-  AT_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
+  TORCH_CHECK((offset.size(2) == outputHeight && offset.size(3) == outputWidth),
            "invalid spatial size of offset, expected height: %d width: %d, but "
            "got height: %d width: %d",
            outputHeight, outputWidth, offset.size(2), offset.size(3));
 
-  AT_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
+  TORCH_CHECK((offset.size(1) == deformable_group * 2 * kH * kW),
            "invalid number of channels of offset");
 
   if (gradOutput != NULL) {
-    AT_CHECK(gradOutput->size(dimf) == nOutputPlane,
+    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
              "invalid number of gradOutput planes, expected: %d, but got: %d",
              nOutputPlane, gradOutput->size(dimf));
 
-    AT_CHECK((gradOutput->size(dimh) == outputHeight &&
+    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
               gradOutput->size(dimw) == outputWidth),
              "invalid size of gradOutput, expected height: %d width: %d , but "
              "got height: %d width: %d",
@@ -191,7 +191,7 @@ int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
   long outputHeight =
       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
 
-  AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
 
   output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
                         outputHeight, outputWidth});
@@ -298,7 +298,7 @@ int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
   long outputHeight =
       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
 
-  AT_CHECK((offset.size(0) == batchSize), 3, "invalid batch size of offset");
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
   gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
   columns = at::zeros(
       {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@@ -414,7 +414,7 @@ int deform_conv_backward_parameters_cuda(
   long outputHeight =
       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
 
-  AT_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
 
   columns = at::zeros(
       {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
@@ -494,8 +494,8 @@ void modulated_deform_conv_cuda_forward(
     const int pad_h, const int pad_w, const int dilation_h,
     const int dilation_w, const int group, const int deformable_group,
     const bool with_bias) {
-  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
-  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
   at::DeviceGuard guard(input.device());
 
   const int batch = input.size(0);
@@ -576,8 +576,8 @@ void modulated_deform_conv_cuda_backward(
     int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
     int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
     const bool with_bias) {
-  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
-  AT_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
   at::DeviceGuard guard(input.device());
 
   const int batch = input.size(0);
diff --git a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu
index e7a26f2e830846f80272bcd8c5ce0def34593c95..98752dccf8c58817ca1a952554dd3f33188a2d34 100644
--- a/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu
+++ b/mmdet/ops/dcn/src/cuda/deform_conv_cuda_kernel.cu
@@ -258,9 +258,9 @@ void deformable_im2col(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
-        const scalar_t *data_im_ = data_im.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        scalar_t *data_col_ = data_col.data<scalar_t>();
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
 
         deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
@@ -352,9 +352,9 @@ void deformable_col2im(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
-        const scalar_t *data_col_ = data_col.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        scalar_t *grad_im_ = grad_im.data<scalar_t>();
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
 
         deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
@@ -450,10 +450,10 @@ void deformable_col2im_coord(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
-        const scalar_t *data_col_ = data_col.data<scalar_t>();
-        const scalar_t *data_im_ = data_im.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
 
         deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
@@ -780,10 +780,10 @@ void modulated_deformable_im2col_cuda(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_im.scalar_type(), "modulated_deformable_im2col_gpu", ([&] {
-        const scalar_t *data_im_ = data_im.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        const scalar_t *data_mask_ = data_mask.data<scalar_t>();
-        scalar_t *data_col_ = data_col.data<scalar_t>();
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
 
         modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
@@ -812,10 +812,10 @@ void modulated_deformable_col2im_cuda(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_col.scalar_type(), "modulated_deformable_col2im_gpu", ([&] {
-        const scalar_t *data_col_ = data_col.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        const scalar_t *data_mask_ = data_mask.data<scalar_t>();
-        scalar_t *grad_im_ = grad_im.data<scalar_t>();
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
 
         modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
@@ -845,12 +845,12 @@ void modulated_deformable_col2im_coord_cuda(
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data_col.scalar_type(), "modulated_deformable_col2im_coord_gpu", ([&] {
-        const scalar_t *data_col_ = data_col.data<scalar_t>();
-        const scalar_t *data_im_ = data_im.data<scalar_t>();
-        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
-        const scalar_t *data_mask_ = data_mask.data<scalar_t>();
-        scalar_t *grad_offset_ = grad_offset.data<scalar_t>();
-        scalar_t *grad_mask_ = grad_mask.data<scalar_t>();
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        const scalar_t *data_mask_ = data_mask.data_ptr<scalar_t>();
+        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
+        scalar_t *grad_mask_ = grad_mask.data_ptr<scalar_t>();
 
         modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             num_kernels, data_col_, data_im_, data_offset_, data_mask_, channels, height_im, width_im,
diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp b/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp
index d7ed3f639ea0a6d6f7f71b3d3bcdd23a54c77499..3c09f998029714bdd3542788bc6da6e4f48e9d0b 100644
--- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp
+++ b/mmdet/ops/dcn/src/cuda/deform_pool_cuda.cpp
@@ -33,7 +33,7 @@ void deform_psroi_pooling_cuda_forward(
     at::Tensor top_count, const int no_trans, const float spatial_scale,
     const int output_dim, const int group_size, const int pooled_size,
     const int part_size, const int sample_per_part, const float trans_std) {
-  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
   at::DeviceGuard guard(input.device());
 
   const int batch = input.size(0);
@@ -59,8 +59,8 @@ void deform_psroi_pooling_cuda_backward(
     const int no_trans, const float spatial_scale, const int output_dim,
     const int group_size, const int pooled_size, const int part_size,
     const int sample_per_part, const float trans_std) {
-  AT_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
-  AT_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
   at::DeviceGuard guard(input.device());
 
   const int batch = input.size(0);
diff --git a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu b/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu
index 05b00d4be618353b404540469bf6118902651ca2..18e3a048d3f48ce6ce86162a354aeaa29ce001a6 100644
--- a/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu
+++ b/mmdet/ops/dcn/src/cuda/deform_pool_cuda_kernel.cu
@@ -290,11 +290,11 @@ void DeformablePSROIPoolForward(const at::Tensor data,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       data.scalar_type(), "deformable_psroi_pool_forward", ([&] {
-        const scalar_t *bottom_data = data.data<scalar_t>();
-        const scalar_t *bottom_rois = bbox.data<scalar_t>();
-        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
-        scalar_t *top_data = out.data<scalar_t>();
-        scalar_t *top_count_data = top_count.data<scalar_t>();
+        const scalar_t *bottom_data = data.data_ptr<scalar_t>();
+        const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
+        const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
+        scalar_t *top_data = out.data_ptr<scalar_t>();
+        scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
 
         DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             count, bottom_data, (scalar_t)spatial_scale, channels, height, width, pooled_height, pooled_width,
@@ -341,13 +341,13 @@ void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       out_grad.scalar_type(), "deformable_psroi_pool_backward_acc", ([&] {
-        const scalar_t *top_diff = out_grad.data<scalar_t>();
-        const scalar_t *bottom_data = data.data<scalar_t>();
-        const scalar_t *bottom_rois = bbox.data<scalar_t>();
-        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
-        scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
-        scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
-        const scalar_t *top_count_data = top_count.data<scalar_t>();
+        const scalar_t *top_diff = out_grad.data_ptr<scalar_t>();
+        const scalar_t *bottom_data = data.data_ptr<scalar_t>();
+        const scalar_t *bottom_rois = bbox.data_ptr<scalar_t>();
+        const scalar_t *bottom_trans = no_trans ? NULL : trans.data_ptr<scalar_t>();
+        scalar_t *bottom_data_diff = in_grad.data_ptr<scalar_t>();
+        scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data_ptr<scalar_t>();
+        const scalar_t *top_count_data = top_count.data_ptr<scalar_t>();
 
         DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
             count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
diff --git a/mmdet/ops/dcn/src/deform_conv_ext.cpp b/mmdet/ops/dcn/src/deform_conv_ext.cpp
index 2beaeffcbb0a41364f4440a201f39a0b6d30ffd8..fac60162b699438bf17d32a7660564b45c3b9745 100644
--- a/mmdet/ops/dcn/src/deform_conv_ext.cpp
+++ b/mmdet/ops/dcn/src/deform_conv_ext.cpp
@@ -54,7 +54,7 @@ int deform_conv_forward(at::Tensor input, at::Tensor weight,
                              int kH, int dW, int dH, int padW, int padH,
                              int dilationW, int dilationH, int group,
                              int deformable_group, int im2col_step) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return deform_conv_forward_cuda(input, weight, offset, output, columns,
         ones, kW, kH, dW, dH, padW, padH, dilationW, dilationH, group,
@@ -73,7 +73,7 @@ int deform_conv_backward_input(at::Tensor input, at::Tensor offset,
                                     int dH, int padW, int padH, int dilationW,
                                     int dilationH, int group,
                                     int deformable_group, int im2col_step) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return deform_conv_backward_input_cuda(input, offset, gradOutput,
         gradInput, gradOffset, weight, columns, kW, kH, dW, dH, padW, padH,
@@ -91,7 +91,7 @@ int deform_conv_backward_parameters(
     at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
     int padW, int padH, int dilationW, int dilationH, int group,
     int deformable_group, float scale, int im2col_step) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return deform_conv_backward_parameters_cuda(input, offset, gradOutput,
         gradWeight, columns, ones, kW, kH, dW, dH, padW, padH, dilationW,
@@ -110,7 +110,7 @@ void modulated_deform_conv_forward(
     const int pad_h, const int pad_w, const int dilation_h,
     const int dilation_w, const int group, const int deformable_group,
     const bool with_bias) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return modulated_deform_conv_cuda_forward(input, weight, bias, ones,
         offset, mask, output, columns, kernel_h, kernel_w, stride_h,
@@ -131,7 +131,7 @@ void modulated_deform_conv_backward(
     int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
     int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
     const bool with_bias) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return modulated_deform_conv_cuda_backward(input, weight, bias, ones,
         offset, mask, columns, grad_input, grad_weight, grad_bias, grad_offset,
diff --git a/mmdet/ops/dcn/src/deform_pool_ext.cpp b/mmdet/ops/dcn/src/deform_pool_ext.cpp
index f590fabec5f4e998d1090c730c325b92ba01ba81..877064828d5b999f4cd4c3f239540862c16de268 100644
--- a/mmdet/ops/dcn/src/deform_pool_ext.cpp
+++ b/mmdet/ops/dcn/src/deform_pool_ext.cpp
@@ -31,7 +31,7 @@ void deform_psroi_pooling_forward(
     at::Tensor top_count, const int no_trans, const float spatial_scale,
     const int output_dim, const int group_size, const int pooled_size,
     const int part_size, const int sample_per_part, const float trans_std) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return deform_psroi_pooling_cuda_forward(input, bbox, trans, out, top_count,
         no_trans, spatial_scale, output_dim, group_size, pooled_size,
@@ -49,7 +49,7 @@ void deform_psroi_pooling_backward(
     const int no_trans, const float spatial_scale, const int output_dim,
     const int group_size, const int pooled_size, const int part_size,
     const int sample_per_part, const float trans_std) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return deform_psroi_pooling_cuda_backward(out_grad, input, bbox, trans,
         top_count, input_grad, trans_grad, no_trans, spatial_scale,
diff --git a/mmdet/ops/grid_sampler/__init__.py b/mmdet/ops/grid_sampler/__init__.py
deleted file mode 100644
index 868617a6b3f42049d8b78253bf639f07e47ec981..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from .grid_sampler import grid_sample
-
-__all__ = ['grid_sample']
diff --git a/mmdet/ops/grid_sampler/grid_sampler.py b/mmdet/ops/grid_sampler/grid_sampler.py
deleted file mode 100644
index b5c59aa4906966f62f1ac584c8a38dae549b0e2d..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/grid_sampler.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import torch
-import torch.nn.functional as F
-from torch.autograd import Function
-from torch.autograd.function import once_differentiable
-
-from . import grid_sampler_ext
-
-
-class _GridSampler(Function):
-
-    @staticmethod
-    def forward(ctx, input, grid, mode_enum, padding_mode_enum, align_corners):
-
-        ctx.save_for_backward(input, grid)
-        ctx.mode_enum = mode_enum
-        ctx.padding_mode_enum = padding_mode_enum
-        ctx.align_corners = align_corners
-
-        output = grid_sampler_ext.grid_sampler_forward(input, grid, mode_enum,
-                                                       padding_mode_enum,
-                                                       align_corners)
-
-        return output
-
-    @staticmethod
-    @once_differentiable
-    def backward(ctx, grad_output):
-        input, grid = ctx.saved_tensors
-        mode_enum = ctx.mode_enum
-        padding_mode_enum = ctx.padding_mode_enum
-        align_corners = ctx.align_corners
-
-        grad_input, grad_grid = grid_sampler_ext.grid_sampler_backward(
-            grad_output, input, grid, mode_enum, padding_mode_enum,
-            align_corners)
-
-        return grad_input, grad_grid, None, None, None
-
-
-def grid_sample(input,
-                grid,
-                mode='bilinear',
-                padding_mode='zeros',
-                align_corners=False):
-    if torch.__version__ >= '1.3':
-        return F.grid_sample(input, grid, mode, padding_mode, align_corners)
-    elif align_corners:
-        return F.grid_sample(input, grid, mode, padding_mode)
-    else:
-
-        # use self-compiled grid_sampler to support align_corners=False
-
-        assert mode in ['bilinear', 'nearest'], \
-            'expected mode to be bilinear or nearest, but got: {}'.format(mode)
-
-        assert padding_mode in ['zeros', 'border', 'reflection'], \
-            'expected padding_mode to be zeros, border, or reflection, ' \
-            'but got: {}'.format(padding_mode)
-
-        if mode == 'bilinear':
-            mode_enum = 0
-        else:
-            mode_enum = 1
-
-        if padding_mode == 'zeros':
-            padding_mode_enum = 0
-        elif padding_mode == 'border':
-            padding_mode_enum = 1
-        else:
-            padding_mode_enum = 2
-
-        # shape check
-        assert input.device == grid.device, \
-            'expected input and grid to be on same device, ' \
-            'but input is on {} and grid is on {}'.format(
-                input.device, grid.device)
-        assert input.dtype == grid.dtype, \
-            'expected input and grid to have the same dtype, ' \
-            'but input has {} and grid has {}'.format(
-                input.dtype, grid.dtype)
-        assert input.dim() == 4 or input.dim() == 5, \
-            'expected 4D or 5D input and grid with same number of dimensions' \
-            'but got input with sizes {} and grid with sizes {}'.format(
-                input.size(), grid.size())
-        assert input.size(0) == grid.size(0), \
-            'expected input and grid to have the same batch size, ' \
-            'but got input with sizes {} and grid with sizes {}'.format(
-                input.size(), grid.size())
-        assert grid.size(-1) == input.dim() - 2, \
-            'expected grid to have size {} in last {} dimension, ' \
-            'but got grid with sizes '.format(
-                input.dim() - 2, grid.size())
-        for i in range(2, input.dim()):
-            assert input.size(i) > 0, \
-                'expected input to have non-empty spatial dimensions, ' \
-                'but input has sizes {} with dimension {} being empty'.format(
-                    input.sizes(), i)
-
-        return _GridSampler.apply(input, grid, mode_enum, padding_mode_enum,
-                                  align_corners)
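
As with affine_grid, this fallback (and its hand-built mode/padding enums: bilinear/nearest -> 0/1, zeros/border/reflection -> 0/1/2) is superseded by calling F.grid_sample directly on torch >= 1.3. A hedged sketch of the keyword form that replaces the enum plumbing:

    import torch
    import torch.nn.functional as F

    x = torch.rand(1, 3, 8, 8)
    grid = torch.rand(1, 8, 8, 2) * 2 - 1
    out = F.grid_sample(x, grid, mode='nearest', padding_mode='border',
                        align_corners=False)
    print(out.shape)                          # torch.Size([1, 3, 8, 8])
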
diff --git a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp b/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp
deleted file mode 100644
index cf1776ed1d7de573c97dd3254ef08d2352ca4377..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.cpp
+++ /dev/null
@@ -1,692 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.cpp
-
-#include <torch/extension.h>
-#include "grid_sampler_cpu.h"
-#include <ATen/ATen.h>
-#include <ATen/Device.h>
-#include <ATen/NativeFunctions.h>
-#include <c10/core/Layout.h>
-#include <c10/util/Exception.h>
-
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-namespace mmdetection {
-
-using namespace at;
-using mmdetection::detail::GridSamplerInterpolation;
-using mmdetection::detail::GridSamplerPadding;
-
-namespace {
-
-  template<typename scalar_t>
-  Tensor grid_sampler_2d_forward_cpu_impl(const Tensor& input, const Tensor& grid,
-                                          GridSamplerInterpolation interpolation_mode,
-                                          GridSamplerPadding padding_mode,
-                                          bool align_corners) {
-    int64_t N = input.size(0);
-    int64_t C = input.size(1);
-    int64_t inp_H = input.size(2);
-    int64_t inp_W = input.size(3);
-    int64_t out_H = grid.size(1);
-    int64_t out_W = grid.size(2);
-    auto output = at::empty({N, C, out_H, out_W}, input.options());
-    int64_t inp_sN = input.stride(0);
-    int64_t inp_sC = input.stride(1);
-    int64_t inp_sH = input.stride(2);
-    int64_t inp_sW = input.stride(3);
-    int64_t grid_sN = grid.stride(0);
-    int64_t grid_sH = grid.stride(1);
-    int64_t grid_sW = grid.stride(2);
-    int64_t grid_sCoor = grid.stride(3);
-    int64_t out_sN = output.stride(0);
-    int64_t out_sC = output.stride(1);
-    int64_t out_sH = output.stride(2);
-    int64_t out_sW = output.stride(3);
-    scalar_t *inp_ptr = input.data<scalar_t>();
-    scalar_t *out_ptr = output.data<scalar_t>();
-    scalar_t *grid_ptr = grid.data<scalar_t>();
-    // loop over each output pixel
-    #ifdef _OPENMP
-    #pragma omp parallel for
-    #endif
-    for (int64_t n = 0; n < N; ++n) {
-        scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
-        scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
-          for (int64_t h = 0; h < out_H; ++h) {
-            for (int64_t w = 0; w < out_W; ++w) {
-              // get the corresponding input x, y, z co-ordinates from grid
-              scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
-              scalar_t ix = *grid_ptr_NHW;
-              scalar_t iy = grid_ptr_NHW[grid_sCoor];
-
-              ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
-              iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
-
-              if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-                // get corner pixel values from (x, y, z)
-                // for 4d, we used north-east-south-west
-                // for 5d, we add top-bottom
-                int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
-                int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
-
-                int64_t ix_ne = ix_nw + 1;
-                int64_t iy_ne = iy_nw;
-
-                int64_t ix_sw = ix_nw;
-                int64_t iy_sw = iy_nw + 1;
-
-                int64_t ix_se = ix_nw + 1;
-                int64_t iy_se = iy_nw + 1;
-
-                // get surfaces to each neighbor:
-                scalar_t nw = (ix_se - ix)    * (iy_se - iy)   ;
-                scalar_t ne = (ix    - ix_sw) * (iy_sw - iy)   ;
-                scalar_t sw = (ix_ne - ix)    * (iy    - iy_ne);
-                scalar_t se = (ix    - ix_nw) * (iy    - iy_nw);
-
-                // calculate bilinear weighted pixel value and set output pixel
-                scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-                  //   (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne
-                  // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse
-                  // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne
-                  // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse
-                  *out_ptr_NCHW = static_cast<scalar_t>(0);
-                  if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-                    *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
-                  }
-                  if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-                    *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
-                  }
-                  if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-                    *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
-                  }
-                  if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-                    *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
-                  }
-                }
-              } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-                int64_t ix_nearest = static_cast<int64_t>(std::round(ix));
-                int64_t iy_nearest = static_cast<int64_t>(std::round(iy));
-
-                // assign nearest neighbor pixel value to output pixel
-                scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                for (int c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) {
-                  if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
-                    *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
-                  } else {
-                    *out_ptr_NCHW = static_cast<scalar_t>(0);
-                  }
-                }
-              }
-            }
-          }
-      }
-
-    return output;
-  }
-
-  template<typename scalar_t>
-  Tensor grid_sampler_3d_forward_cpu_impl(const Tensor& input, const Tensor& grid,
-                                  GridSamplerInterpolation interpolation_mode,
-                                  GridSamplerPadding padding_mode,
-                                  bool align_corners) {
-    int64_t N = input.size(0);
-    int64_t C = input.size(1);
-    int64_t inp_D = input.size(2);
-    int64_t inp_H = input.size(3);
-    int64_t inp_W = input.size(4);
-    int64_t out_D = grid.size(1);
-    int64_t out_H = grid.size(2);
-    int64_t out_W = grid.size(3);
-    auto output = at::empty({N, C, out_D, out_H, out_W}, input.options());
-    int64_t inp_sN = input.stride(0);
-    int64_t inp_sC = input.stride(1);
-    int64_t inp_sD = input.stride(2);
-    int64_t inp_sH = input.stride(3);
-    int64_t inp_sW = input.stride(4);
-    int64_t grid_sN = grid.stride(0);
-    int64_t grid_sD = grid.stride(1);
-    int64_t grid_sH = grid.stride(2);
-    int64_t grid_sW = grid.stride(3);
-    int64_t grid_sCoor = grid.stride(4);
-    int64_t out_sN = output.stride(0);
-    int64_t out_sC = output.stride(1);
-    int64_t out_sD = output.stride(2);
-    int64_t out_sH = output.stride(3);
-    int64_t out_sW = output.stride(4);
-    scalar_t *inp_ptr = input.data<scalar_t>();
-    scalar_t *out_ptr = output.data<scalar_t>();
-    scalar_t *grid_ptr = grid.data<scalar_t>();
-    // loop over each output pixel
-    #ifdef _OPENMP
-    #pragma omp parallel for
-    #endif
-    for (int64_t n = 0; n < N; ++n) {
-        scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
-        scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
-        for (int64_t d = 0; d < out_D; ++d) {
-          for (int64_t h = 0; h < out_H; ++h) {
-            for (int64_t w = 0; w < out_W; ++w) {
-              // get the corresponding input x, y, z co-ordinates from grid
-              scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
-              scalar_t ix = *grid_ptr_NDHW;
-              scalar_t iy = grid_ptr_NDHW[grid_sCoor];
-              scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor];
-
-              ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
-              iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
-              iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners);
-
-              if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-                // get corner pixel values from (x, y, z)
-                // for 4d, we use north-east-south-west
-                // for 5d, we add top-bottom
-                int64_t ix_tnw = static_cast<int64_t>(std::floor(ix));
-                int64_t iy_tnw = static_cast<int64_t>(std::floor(iy));
-                int64_t iz_tnw = static_cast<int64_t>(std::floor(iz));
-
-                int64_t ix_tne = ix_tnw + 1;
-                int64_t iy_tne = iy_tnw;
-                int64_t iz_tne = iz_tnw;
-
-                int64_t ix_tsw = ix_tnw;
-                int64_t iy_tsw = iy_tnw + 1;
-                int64_t iz_tsw = iz_tnw;
-
-                int64_t ix_tse = ix_tnw + 1;
-                int64_t iy_tse = iy_tnw + 1;
-                int64_t iz_tse = iz_tnw;
-
-                int64_t ix_bnw = ix_tnw;
-                int64_t iy_bnw = iy_tnw;
-                int64_t iz_bnw = iz_tnw + 1;
-
-                int64_t ix_bne = ix_tnw + 1;
-                int64_t iy_bne = iy_tnw;
-                int64_t iz_bne = iz_tnw + 1;
-
-                int64_t ix_bsw = ix_tnw;
-                int64_t iy_bsw = iy_tnw + 1;
-                int64_t iz_bsw = iz_tnw + 1;
-
-                int64_t ix_bse = ix_tnw + 1;
-                int64_t iy_bse = iy_tnw + 1;
-                int64_t iz_bse = iz_tnw + 1;
-
-                // get surfaces to each neighbor:
-                scalar_t tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
-                scalar_t tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
-                scalar_t tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
-                scalar_t tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
-                scalar_t bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
-                scalar_t bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
-                scalar_t bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
-                scalar_t bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
-
-                // calculate bilinear weighted pixel value and set output pixel
-                scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) {
-                  //   (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne
-                  // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse
-                  // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne
-                  // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse
-                  *out_ptr_NCDHW = static_cast<scalar_t>(0);
-                  if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
-                  }
-                  if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
-                  }
-                  if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
-                  }
-                  if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
-                  }
-                  if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
-                  }
-                  if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
-                  }
-                  if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
-                  }
-                  if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
-                  }
-                }
-              } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-                int64_t ix_nearest = static_cast<int64_t>(std::round(ix));
-                int64_t iy_nearest = static_cast<int64_t>(std::round(iy));
-                int64_t iz_nearest = static_cast<int64_t>(std::round(iz));
-
-                // assign nearest neighbor pixel value to output pixel
-                scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                for (int c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) {
-                  if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) {
-                    *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW];
-                  } else {
-                    *out_ptr_NCDHW = static_cast<scalar_t>(0);
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    return output;
-  }
-
-  template<typename scalar_t>
-  std::tuple<Tensor, Tensor>
-  grid_sampler_2d_backward_cpu_impl(const Tensor& grad_output,
-                                    const Tensor& input, const Tensor& grid,
-                                    GridSamplerInterpolation interpolation_mode,
-                                    GridSamplerPadding padding_mode,
-                                    bool align_corners) {
-    auto grad_input = at::zeros_like(input);
-    auto grad_grid = at::empty_like(grid);
-    // If interpolation mode is Nearest, then grad_grid is not filled in the
-    // loop below.
-    if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-      grad_grid.zero_();
-    }
-    int64_t N = input.size(0);
-    int64_t C = input.size(1);
-    int64_t inp_H = input.size(2);
-    int64_t inp_W = input.size(3);
-    int64_t out_H = grid.size(1);
-    int64_t out_W = grid.size(2);
-    int64_t inp_sN = input.stride(0);
-    int64_t inp_sC = input.stride(1);
-    int64_t inp_sH = input.stride(2);
-    int64_t inp_sW = input.stride(3);
-    int64_t grid_sN = grid.stride(0);
-    int64_t grid_sH = grid.stride(1);
-    int64_t grid_sW = grid.stride(2);
-    int64_t grid_sCoor = grid.stride(3);
-    int64_t gOut_sN = grad_output.stride(0);
-    int64_t gOut_sC = grad_output.stride(1);
-    int64_t gOut_sH = grad_output.stride(2);
-    int64_t gOut_sW = grad_output.stride(3);
-    int64_t gInp_sN = grad_input.stride(0);
-    int64_t gInp_sC = grad_input.stride(1);
-    int64_t gInp_sH = grad_input.stride(2);
-    int64_t gInp_sW = grad_input.stride(3);
-    int64_t gGrid_sN = grad_grid.stride(0);
-    int64_t gGrid_sW = grad_grid.stride(2);
-    scalar_t *inp_ptr = input.data<scalar_t>();
-    scalar_t *grid_ptr = grid.data<scalar_t>();
-    scalar_t *gOut_ptr = grad_output.data<scalar_t>();
-    scalar_t *gInp_ptr = grad_input.data<scalar_t>();
-    scalar_t *gGrid_ptr = grad_grid.data<scalar_t>();
-    // loop over each output pixel
-    #ifdef _OPENMP
-    #pragma omp parallel for
-    #endif
-    for (int64_t n = 0; n < N; ++n) {
-        scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
-        scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
-        scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN;
-          for (int64_t h = 0; h < out_H; ++h) {
-            for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) {
-              // get the corresponding input x, y co-ordinates from grid
-              scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW;
-              scalar_t ix = *grid_ptr_NHW;
-              scalar_t iy = grid_ptr_NHW[grid_sCoor];
-
-              // multipliers for gradients on ix and iy
-              scalar_t gix_mult, giy_mult;
-              ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult);
-              iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult);
-
-              if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-                // get NE, NW, SE, SW pixel values from (x, y)
-                int64_t ix_nw = static_cast<int64_t>(std::floor(ix));
-                int64_t iy_nw = static_cast<int64_t>(std::floor(iy));
-
-                int64_t ix_ne = ix_nw + 1;
-                int64_t iy_ne = iy_nw;
-
-                int64_t ix_sw = ix_nw;
-                int64_t iy_sw = iy_nw + 1;
-
-                int64_t ix_se = ix_nw + 1;
-                int64_t iy_se = iy_nw + 1;
-
-                // get surfaces to each neighbor:
-                scalar_t nw = (ix_se - ix)    * (iy_se - iy)   ;
-                scalar_t ne = (ix    - ix_sw) * (iy_sw - iy)   ;
-                scalar_t sw = (ix_ne - ix)    * (iy    - iy_ne);
-                scalar_t se = (ix    - ix_nw) * (iy    - iy_nw);
-
-                scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0);
-                scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW;
-                scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                // accumulate grad_input and grad_grid contributions over all channels
-                for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) {
-                  scalar_t gOut = *gOut_ptr_NCHW;
-
-                  // calculate and set grad_input
-                  safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut);
-                  safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut);
-                  safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut);
-                  safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut);
-
-                  // calculate grad_grid
-                  if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-                    scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW];
-                    gix -= nw_val * (iy_se - iy) * gOut;
-                    giy -= nw_val * (ix_se - ix) * gOut;
-                  }
-                  if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-                    scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW];
-                    gix += ne_val * (iy_sw - iy)    * gOut;
-                    giy -= ne_val * (ix    - ix_sw) * gOut;
-                  }
-                  if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-                    scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW];
-                    gix -= sw_val * (iy - iy_ne) * gOut;
-                    giy += sw_val * (ix_ne - ix) * gOut;
-                  }
-                  if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-                    scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW];
-                    gix += se_val * (iy - iy_nw)    * gOut;
-                    giy += se_val * (ix    - ix_nw) * gOut;
-                  }
-                }
-
-                // assuming grad_grid is contiguous
-                gGrid_ptr_NHW[0] = gix_mult * gix;
-                gGrid_ptr_NHW[1] = giy_mult * giy;
-              } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-                int64_t ix_nearest = static_cast<int64_t>(std::round(ix));
-                int64_t iy_nearest = static_cast<int64_t>(std::round(iy));
-
-                // propagate grad_output to the nearest neighbor input pixel
-                scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW;
-                scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN;
-                for (int c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) {
-                  // calculate and set grad_input
-                  safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest,
-                              gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW);
-                }
-              }
-            }
-          }
-      }
-    return std::make_tuple(grad_input, grad_grid);
-  }
-
-  template<typename scalar_t>
-  std::tuple<Tensor, Tensor>
-  grid_sampler_3d_backward_cpu_impl(const Tensor& grad_output,
-                                    const Tensor& input, const Tensor& grid,
-                                    GridSamplerInterpolation interpolation_mode,
-                                    GridSamplerPadding padding_mode,
-                                    bool align_corners) {
-    auto grad_input = at::zeros_like(input);
-    auto grad_grid = at::empty_like(grid);
-    // If interpolation mode is Nearest, then grad_grid is not filled in the
-    // loop below.
-    if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-      grad_grid.zero_();
-    }
-    int64_t N = input.size(0);
-    int64_t C = input.size(1);
-    int64_t inp_D = input.size(2);
-    int64_t inp_H = input.size(3);
-    int64_t inp_W = input.size(4);
-    int64_t out_D = grid.size(1);
-    int64_t out_H = grid.size(2);
-    int64_t out_W = grid.size(3);
-    int64_t inp_sN = input.stride(0);
-    int64_t inp_sC = input.stride(1);
-    int64_t inp_sD = input.stride(2);
-    int64_t inp_sH = input.stride(3);
-    int64_t inp_sW = input.stride(4);
-    int64_t grid_sN = grid.stride(0);
-    int64_t grid_sD = grid.stride(1);
-    int64_t grid_sH = grid.stride(2);
-    int64_t grid_sW = grid.stride(3);
-    int64_t grid_sCoor = grid.stride(4);
-    int64_t gOut_sN = grad_output.stride(0);
-    int64_t gOut_sC = grad_output.stride(1);
-    int64_t gOut_sD = grad_output.stride(2);
-    int64_t gOut_sH = grad_output.stride(3);
-    int64_t gOut_sW = grad_output.stride(4);
-    int64_t gInp_sN = grad_input.stride(0);
-    int64_t gInp_sC = grad_input.stride(1);
-    int64_t gInp_sD = grad_input.stride(2);
-    int64_t gInp_sH = grad_input.stride(3);
-    int64_t gInp_sW = grad_input.stride(4);
-    int64_t gGrid_sN = grad_grid.stride(0);
-    int64_t gGrid_sW = grad_grid.stride(3);
-    scalar_t *inp_ptr = input.data<scalar_t>();
-    scalar_t *grid_ptr = grid.data<scalar_t>();
-    scalar_t *gOut_ptr = grad_output.data<scalar_t>();
-    scalar_t *gInp_ptr = grad_input.data<scalar_t>();
-    scalar_t *gGrid_ptr = grad_grid.data<scalar_t>();
-    // loop over each output pixel
-    #ifdef _OPENMP
-    #pragma omp parallel for
-    #endif
-    for (int64_t n = 0; n < N; ++n) {
-        scalar_t *grid_ptr_N = grid_ptr + n * grid_sN;
-        scalar_t *inp_ptr_N = inp_ptr + n * inp_sN;
-        scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN;
-        for (int64_t d = 0; d < out_D; ++d) {
-          for (int64_t h = 0; h < out_H; ++h) {
-            for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) {
-              // get the corresponding input x, y, z co-ordinates from grid
-              scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW;
-              scalar_t ix = *grid_ptr_NDHW;
-              scalar_t iy = grid_ptr_NDHW[grid_sCoor];
-              scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor];
-
-              // multipliers for gradients on ix, iy, and iz
-              scalar_t gix_mult, giy_mult, giz_mult;
-              ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult);
-              iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult);
-              iz = grid_sampler_compute_source_index_set_grad(iz, inp_D, padding_mode, align_corners, &giz_mult);
-
-              if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-                // get corner pixel values from (x, y, z)
-                // for 4d, we used north-east-south-west
-                // for 5d, we add top-bottom
-                int64_t ix_tnw = static_cast<int64_t>(std::floor(ix));
-                int64_t iy_tnw = static_cast<int64_t>(std::floor(iy));
-                int64_t iz_tnw = static_cast<int64_t>(std::floor(iz));
-
-                int64_t ix_tne = ix_tnw + 1;
-                int64_t iy_tne = iy_tnw;
-                int64_t iz_tne = iz_tnw;
-
-                int64_t ix_tsw = ix_tnw;
-                int64_t iy_tsw = iy_tnw + 1;
-                int64_t iz_tsw = iz_tnw;
-
-                int64_t ix_tse = ix_tnw + 1;
-                int64_t iy_tse = iy_tnw + 1;
-                int64_t iz_tse = iz_tnw;
-
-                int64_t ix_bnw = ix_tnw;
-                int64_t iy_bnw = iy_tnw;
-                int64_t iz_bnw = iz_tnw + 1;
-
-                int64_t ix_bne = ix_tnw + 1;
-                int64_t iy_bne = iy_tnw;
-                int64_t iz_bne = iz_tnw + 1;
-
-                int64_t ix_bsw = ix_tnw;
-                int64_t iy_bsw = iy_tnw + 1;
-                int64_t iz_bsw = iz_tnw + 1;
-
-                int64_t ix_bse = ix_tnw + 1;
-                int64_t iy_bse = iy_tnw + 1;
-                int64_t iz_bse = iz_tnw + 1;
-
-                // get surfaces to each neighbor:
-                scalar_t tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
-                scalar_t tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
-                scalar_t tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
-                scalar_t tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
-                scalar_t bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
-                scalar_t bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
-                scalar_t bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
-                scalar_t bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
-
-                scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0), giz = static_cast<scalar_t>(0);
-                scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
-                scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN;
-                scalar_t *inp_ptr_NC = inp_ptr_N;
-                // accumulate grad_input and grad_grid contributions over all channels
-                for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) {
-                  scalar_t gOut = *gOut_ptr_NCDHW;
-
-                  // calculate and set grad_input
-                  safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut);
-                  safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut);
-
-                  // calculate grad_grid
-                  if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
-                    scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW];
-                    gix -= tnw_val * (iy_bse - iy)    * (iz_bse - iz)    * gOut;
-                    giy -= tnw_val * (ix_bse - ix)    * (iz_bse - iz)    * gOut;
-                    giz -= tnw_val * (ix_bse - ix)    * (iy_bse - iy)    * gOut;
-                  }
-                  if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
-                    scalar_t tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW];
-                    gix += tne_val * (iy_bsw - iy)    * (iz_bsw - iz)    * gOut;
-                    giy -= tne_val * (ix    - ix_bsw) * (iz_bsw - iz)    * gOut;
-                    giz -= tne_val * (ix    - ix_bsw) * (iy_bsw - iy)    * gOut;
-                  }
-                  if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
-                    scalar_t tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW];
-                    gix -= tsw_val * (iy - iy_bne)    * (iz_bne - iz)    * gOut;
-                    giy += tsw_val * (ix_bne - ix)    * (iz_bne - iz)    * gOut;
-                    giz -= tsw_val * (ix_bne - ix)    * (iy    - iy_bne) * gOut;
-                  }
-                  if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
-                    scalar_t tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW];
-                    gix += tse_val * (iy - iy_bnw)    * (iz_bnw - iz)    * gOut;
-                    giy += tse_val * (ix    - ix_bnw) * (iz_bnw - iz)    * gOut;
-                    giz -= tse_val * (ix    - ix_bnw) * (iy    - iy_bnw) * gOut;
-                  }
-                  if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
-                    scalar_t bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW];
-                    gix -= bnw_val * (iy_tse - iy)    * (iz - iz_tse)    * gOut;
-                    giy -= bnw_val * (ix_tse - ix)    * (iz - iz_tse)    * gOut;
-                    giz += bnw_val * (ix_tse - ix)    * (iy_tse - iy)    * gOut;
-                  }
-                  if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
-                    scalar_t bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW];
-                    gix += bne_val * (iy_tsw - iy)    * (iz - iz_tsw)    * gOut;
-                    giy -= bne_val * (ix    - ix_tsw) * (iz - iz_tsw)    * gOut;
-                    giz += bne_val * (ix    - ix_tsw) * (iy_tsw - iy)    * gOut;
-                  }
-                  if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
-                    scalar_t bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW];
-                    gix -= bsw_val * (iy - iy_tne)    * (iz - iz_tne)    * gOut;
-                    giy += bsw_val * (ix_tne - ix)    * (iz - iz_tne)    * gOut;
-                    giz += bsw_val * (ix_tne - ix)    * (iy    - iy_tne) * gOut;
-                  }
-                  if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
-                    scalar_t bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW];
-                    gix += bse_val * (iy - iy_tnw)    * (iz - iz_tnw)    * gOut;
-                    giy += bse_val * (ix    - ix_tnw) * (iz - iz_tnw)    * gOut;
-                    giz += bse_val * (ix    - ix_tnw) * (iy    - iy_tnw) * gOut;
-                  }
-                }
-
-                // assuming grad_grid is contiguous
-                gGrid_ptr_NDHW[0] = gix_mult * gix;
-                gGrid_ptr_NDHW[1] = giy_mult * giy;
-                gGrid_ptr_NDHW[2] = giz_mult * giz;
-              } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-                int64_t ix_nearest = static_cast<int64_t>(std::round(ix));
-                int64_t iy_nearest = static_cast<int64_t>(std::round(iy));
-                int64_t iz_nearest = static_cast<int64_t>(std::round(iz));
-
-                // propagate grad_output to the nearest neighbor input pixel
-                scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
-                scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN;
-                for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) {
-                  // calculate and set grad_input
-                  safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest,
-                              gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW);
-                }
-              }
-            }
-          }
-        }
-      }
-    return std::make_tuple(grad_input, grad_grid);
-  }
-
-}  // namespace
-
-// No shape checking needed here. See NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_2d_forward_cpu(const Tensor& input, const Tensor& grid,
-                                   int64_t interpolation_mode, int64_t padding_mode,
-                                   bool align_corners) {
-    return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_forward_cpu", [&] {
-        return grid_sampler_2d_forward_cpu_impl<scalar_t>(
-                input, grid, static_cast<GridSamplerInterpolation>(interpolation_mode),
-                static_cast<GridSamplerPadding>(padding_mode), align_corners);
-    });
-}
-
-// No shape checking needed here. See NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_3d_forward_cpu(const Tensor& input, const Tensor& grid,
-                           int64_t interpolation_mode, int64_t padding_mode,
-                           bool align_corners) {
-  return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_forward_cpu", [&] {
-    return grid_sampler_3d_forward_cpu_impl<scalar_t>(
-      input, grid, static_cast<GridSamplerInterpolation>(interpolation_mode),
-      static_cast<GridSamplerPadding>(padding_mode), align_corners);
-  });
-}
-
-// No shape checking needed here. See NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid,
-                             int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
-    return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_backward_cpu", [&] {
-        return grid_sampler_2d_backward_cpu_impl<scalar_t>(
-                grad_output, input, grid,
-                static_cast<GridSamplerInterpolation>(interpolation_mode),
-                static_cast<GridSamplerPadding>(padding_mode), align_corners);
-    });
-}
-
-// No shape checking needed here. See NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input, const Tensor& grid,
-                             int64_t interpolation_mode, int64_t padding_mode, bool align_corners) {
-  return AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_3d_backward_cpu", [&] {
-    return grid_sampler_3d_backward_cpu_impl<scalar_t>(
-      grad_output, input, grid,
-      static_cast<GridSamplerInterpolation>(interpolation_mode),
-      static_cast<GridSamplerPadding>(padding_mode), align_corners);
-  });
-}
-
-}  // namespace mmdetection
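Note: everything this file implemented is available directly from ATen in the
PyTorch versions this PR targets. Below is a minimal sketch of the replacement
call, assuming PyTorch >= 1.3 (where at::grid_sampler accepts align_corners
and dispatches to the native 2D/3D CPU and CUDA kernels); the integer
arguments mirror the GridSamplerInterpolation / GridSamplerPadding enums in
the header deleted below, and grid_sample_via_aten is a hypothetical helper
name, not part of this repository:

    #include <ATen/ATen.h>

    // Forward sampling via the built-in operator; autograd supplies the
    // backward pass, so the *_backward_cpu_impl functions above need no
    // hand-written counterpart.
    at::Tensor grid_sample_via_aten(const at::Tensor& input,
                                    const at::Tensor& grid) {
      const int64_t interpolation_mode = 0;  // 0 = Bilinear, 1 = Nearest
      const int64_t padding_mode = 0;        // 0 = Zeros, 1 = Border, 2 = Reflection
      const bool align_corners = false;
      return at::grid_sampler(input, grid, interpolation_mode, padding_mode,
                              align_corners);
    }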
diff --git a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h b/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h
deleted file mode 100644
index 3c9ae45063bf212d715abcf54c0bdccdb23958fc..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/src/cpu/grid_sampler_cpu.h
+++ /dev/null
@@ -1,225 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/GridSampler.h
-
-#pragma once
-
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-
-namespace mmdetection {
-
-namespace detail {
-
-  enum class GridSamplerInterpolation {Bilinear, Nearest};
-  enum class GridSamplerPadding {Zeros, Border, Reflection};
-
-}  // namespace detail
-
-using detail::GridSamplerInterpolation;
-using detail::GridSamplerPadding;
-
-// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
-// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
-// if align_corners: -1 and +1 get sent to the centers of the corner pixels
-//     -1 --> 0
-//     +1 --> (size - 1)
-//     scale_factor = (size - 1) / 2
-// if not align_corners: -1 and +1 get sent to the image edges
-//     -1 --> -0.5
-//     +1 --> (size - 1) + 0.5 == size - 0.5
-//     scale_factor = size / 2
-template <typename scalar_t>
-static inline scalar_t grid_sampler_unnormalize(scalar_t coord, int64_t size,
-                                                bool align_corners) {
-  if (align_corners) {
-    // unnormalize coord from [-1, 1] to [0, size - 1]
-    return ((coord + 1) / 2) * (size - 1);
-  } else {
-    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
-    return ((coord + 1) * size - 1) / 2;
-  }
-}
-
-// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize
-// except that it also returns the `d output / d input` via pointer argument
-// `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static inline scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int64_t size,
-                                                         bool align_corners, scalar_t *grad_in) {
-  if (align_corners) {
-    // unnormalize coord from [-1, 1] to [0, size - 1]
-    *grad_in = static_cast<scalar_t>(size - 1) / 2;
-    return ((coord + 1) / 2) * (size - 1);
-  } else {
-    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
-    *grad_in = static_cast<scalar_t>(size) / 2;
-    return ((coord + 1) * size - 1) / 2;
-  }
-}
-
-// Clips coordinates to between 0 and clip_limit - 1
-template<typename scalar_t>
-static inline scalar_t clip_coordinates(scalar_t in, int64_t clip_limit) {
-  return std::min(static_cast<scalar_t>(clip_limit - 1), std::max(in, static_cast<scalar_t>(0)));
-}
-
-// clip_coordinates_set_grad works similarly to clip_coordinates except that
-// it also returns the `d output / d input` via pointer argument `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template<typename scalar_t>
-static inline scalar_t clip_coordinates_set_grad(scalar_t in, int64_t clip_limit,
-                                                 scalar_t *grad_in) {
-  if (in < static_cast<scalar_t>(0)) {
-    *grad_in = static_cast<scalar_t>(0);
-    return static_cast<scalar_t>(0);
-  } else {
-    scalar_t max = static_cast<scalar_t>(clip_limit - 1);
-    if (in > max) {
-      *grad_in = static_cast<scalar_t>(0);
-      return max;
-    } else {
-      *grad_in = static_cast<scalar_t>(1);
-      return in;
-    }
-  }
-}
-
-// Reflects coordinates until they fall between low and high (inclusive).
-// The bounds are passed as twice their value so that half-integer values
-// can be represented as ints.
-template<typename scalar_t>
-static inline scalar_t reflect_coordinates(scalar_t in, int64_t twice_low,
-                                           int64_t twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<scalar_t>(0);
-  }
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = std::fabs(in - min);
-  // `fmod` returns the same sign as `in`, which is positive after the `fabs` above.
-  scalar_t extra = std::fmod(in, span);
-  int flips = static_cast<int>(std::floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
-}
-
-// reflect_coordinates_set_grad works similarly to reflect_coordinates except
-// that it also returns the `d output / d input` via pointer argument
-// `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template<typename scalar_t>
-static inline scalar_t reflect_coordinates_set_grad(scalar_t in, int64_t twice_low,
-                                                    int64_t twice_high, scalar_t *grad_in) {
-  if (twice_low == twice_high) {
-    *grad_in = static_cast<scalar_t>(0);
-    return static_cast<scalar_t>(0);
-  }
-  int grad_in_mult_;
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = in - min;
-  if (in < static_cast<scalar_t>(0)) {
-    grad_in_mult_ = -1;
-    in = -in;
-  } else {
-    grad_in_mult_ = 1;
-  }
-  // `fmod` returns the same sign as `in`, which is positive after the `if` above.
-  scalar_t extra = std::fmod(in, span);
-  int flips = static_cast<int>(std::floor(in / span));
-  if (flips % 2 == 0) {
-    *grad_in = static_cast<scalar_t>(grad_in_mult_);
-    return extra + min;
-  } else {
-    *grad_in = static_cast<scalar_t>(-grad_in_mult_);
-    return span - extra + min;
-  }
-}
-
-// Computes the pixel source index value for a grid coordinate
-template <typename scalar_t>
-static inline scalar_t grid_sampler_compute_source_index(
-    scalar_t coord,
-    int64_t size,
-    GridSamplerPadding padding_mode,
-    bool align_corners) {
-  coord = grid_sampler_unnormalize(coord, size, align_corners);
-  if (padding_mode == GridSamplerPadding::Border) {
-    // clip coordinates to image borders
-    coord = clip_coordinates(coord, size);
-  } else if (padding_mode == GridSamplerPadding::Reflection) {
-    // reflect coordinates by image borders
-    if (align_corners) {
-      coord = reflect_coordinates(coord, 0, 2*(size - 1));
-    } else {
-      coord = reflect_coordinates(coord, -1, 2*size - 1);
-      // when align_corners=False, reflection does not auto clip coords
-      coord = clip_coordinates(coord, size);
-    }
-  }
-  return coord;
-}
-
-// grid_sampler_compute_source_index_set_grad works similarly to
-// grid_sampler_compute_source_index except that it also returns the
-// `d output / d input` via pointer argument `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static inline scalar_t grid_sampler_compute_source_index_set_grad(
-    scalar_t coord,
-    int64_t size,
-    GridSamplerPadding padding_mode,
-    bool align_corners,
-    scalar_t *grad_in) {
-  scalar_t grad_clip, grad_refl;
-  coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
-  if (padding_mode == GridSamplerPadding::Border) {
-    // clip coordinates to image borders
-    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
-    *grad_in = (*grad_in) * grad_clip;
-  } else if (padding_mode == GridSamplerPadding::Reflection) {
-    // reflect coordinates by image borders
-    if (align_corners) {
-      coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
-      *grad_in = (*grad_in) * grad_refl;
-    } else {
-      coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
-      // when align_corners=False, reflection does not auto clip coords
-      coord = clip_coordinates_set_grad(coord, size, &grad_clip);
-      *grad_in = (*grad_in) * grad_refl * grad_clip;
-    }
-  }
-  return coord;
-}
-
-static inline bool within_bounds_2d(int64_t h, int64_t w, int64_t H, int64_t W) {
-  return h >= 0 && h < H && w >= 0 && w < W;
-}
-
-static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, int64_t H, int64_t W) {
-  return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
-}
-
-template<typename scalar_t>
-static inline void safe_add_2d(scalar_t *data, int64_t h, int64_t w,
-                               int64_t sH, int64_t sW, int64_t H, int64_t W,
-                               scalar_t delta) {
-  if (within_bounds_2d(h, w, H, W)) {
-    data[h * sH + w * sW] += delta;
-  }
-}
-
-template<typename scalar_t>
-static inline void safe_add_3d(scalar_t *data, int64_t d, int64_t h, int64_t w,
-                               int64_t sD, int64_t sH, int64_t sW,
-                               int64_t D, int64_t H, int64_t W,
-                               scalar_t delta) {
-  if (within_bounds_3d(d, h, w, D, H, W)) {
-    data[d * sD + h * sH + w * sW] += delta;
-  }
-}
-
-}  // namespace mmdetection
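A worked example of the coordinate math defined in this header, for size = 4
and a normalized coordinate of +1: with align_corners=true,
((1 + 1) / 2) * (4 - 1) = 3, the center of the last pixel; with
align_corners=false, ((1 + 1) * 4 - 1) / 2 = 3.5 = size - 0.5, the outer edge
of the last pixel. For reflection padding with align_corners=false the bounds
are passed as twice_low = -1 and twice_high = 2*size - 1 = 7, so an
out-of-range unnormalized coordinate of 5.0 reflects as follows: min = -0.5,
span = 4, extra = fmod(5.5, 4) = 1.5, flips = floor(5.5 / 4) = 1; one (odd)
flip yields span - extra + min = 4 - 1.5 - 0.5 = 2.0, which the trailing
clip_coordinates call leaves unchanged since it already lies in [0, 3].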
diff --git a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu b/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu
deleted file mode 100644
index 2d747a0b897dda1b0a29998a1825832b6b5eb99c..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cu
+++ /dev/null
@@ -1,718 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/GridSampler.cu
-
-#include <ATen/ATen.h>
-#include "grid_sampler_cuda.cuh"
-#include <ATen/cuda/CUDAContext.h>
-#include <ATen/cuda/CUDAApplyUtils.cuh>
-#include <ATen/cuda/detail/TensorInfo.cuh>
-#include <ATen/cuda/detail/IndexUtils.cuh>
-#include <ATen/cuda/detail/KernelUtils.h>
-#include <c10/macros/Macros.h>
-
-namespace mmdetection {
-
-using namespace at::cuda::detail;
-
-using mmdetection::detail::GridSamplerInterpolation;
-using mmdetection::detail::GridSamplerPadding;
-
-namespace {
-  template <typename scalar_t>
-  C10_LAUNCH_BOUNDS_1(1024)
-  __global__ void grid_sampler_2d_forward_kernel_cuda(
-      const int nthreads,
-      TensorInfo<scalar_t, int> input,
-      TensorInfo<scalar_t, int> grid,
-      TensorInfo<scalar_t, int> output,
-      const GridSamplerInterpolation interpolation_mode,
-      const GridSamplerPadding padding_mode,
-      bool align_corners) {
-
-    int C = input.sizes[1];
-    int inp_H = input.sizes[2];
-    int inp_W = input.sizes[3];
-    int out_H = grid.sizes[1];
-    int out_W = grid.sizes[2];
-    int inp_sN = input.strides[0];
-    int inp_sC = input.strides[1];
-    int inp_sH = input.strides[2];
-    int inp_sW = input.strides[3];
-    int grid_sN = grid.strides[0];
-    int grid_sH = grid.strides[1];
-    int grid_sW = grid.strides[2];
-    int grid_sCoor = grid.strides[3];
-    int out_sN = output.strides[0];
-    int out_sC = output.strides[1];
-    int out_sH = output.strides[2];
-    int out_sW = output.strides[3];
-
-    CUDA_KERNEL_LOOP(index, nthreads) {
-      const int w = index % out_W;
-      const int h = (index / out_W) % out_H;
-      const int n = index / (out_H * out_W);
-      const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
-
-      // get the corresponding input x, y co-ordinates from grid
-      scalar_t ix = grid.data[grid_offset];
-      scalar_t iy = grid.data[grid_offset + grid_sCoor];
-
-      ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
-      iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
-
-      if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-        // get NE, NW, SE, SW pixel values from (x, y)
-        int ix_nw = static_cast<int>(::floor(ix));
-        int iy_nw = static_cast<int>(::floor(iy));
-        int ix_ne = ix_nw + 1;
-        int iy_ne = iy_nw;
-        int ix_sw = ix_nw;
-        int iy_sw = iy_nw + 1;
-        int ix_se = ix_nw + 1;
-        int iy_se = iy_nw + 1;
-
-        // get surfaces to each neighbor:
-        scalar_t nw = (ix_se - ix)    * (iy_se - iy);
-        scalar_t ne = (ix    - ix_sw) * (iy_sw - iy);
-        scalar_t sw = (ix_ne - ix)    * (iy    - iy_ne);
-        scalar_t se = (ix    - ix_nw) * (iy    - iy_nw);
-
-        // calculate bilinear weighted pixel value and set output pixel
-        auto inp_ptr_NC = input.data + n * inp_sN;
-        auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW;
-        for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
-          *out_ptr_NCHW = static_cast<scalar_t>(0);
-          if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-            *out_ptr_NCHW += inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW] * nw;
-          }
-          if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-            *out_ptr_NCHW += inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW] * ne;
-          }
-          if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-            *out_ptr_NCHW += inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW] * sw;
-          }
-          if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-            *out_ptr_NCHW += inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW] * se;
-          }
-        }
-      } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-        int ix_nearest = static_cast<int>(::round(ix));
-        int iy_nearest = static_cast<int>(::round(iy));
-
-        // assign nearest neighbor pixel value to output pixel
-        auto inp_ptr_NC = input.data + n * inp_sN;
-        auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW;
-        for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) {
-          if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) {
-            *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW];
-          } else {
-            *out_ptr_NCHW = static_cast<scalar_t>(0);
-          }
-        }
-      }
-    }
-  }
-
-  template <typename scalar_t>
-  C10_LAUNCH_BOUNDS_1(1024)
-  __global__ void grid_sampler_3d_forward_kernel_cuda(
-      const int nthreads,
-      TensorInfo<scalar_t, int> input,
-      TensorInfo<scalar_t, int> grid,
-      TensorInfo<scalar_t, int> output,
-      const GridSamplerInterpolation interpolation_mode,
-      const GridSamplerPadding padding_mode,
-      bool align_corners) {
-
-    int C = input.sizes[1];
-    int inp_D = input.sizes[2];
-    int inp_H = input.sizes[3];
-    int inp_W = input.sizes[4];
-    int out_D = grid.sizes[1];
-    int out_H = grid.sizes[2];
-    int out_W = grid.sizes[3];
-    int inp_sN = input.strides[0];
-    int inp_sC = input.strides[1];
-    int inp_sD = input.strides[2];
-    int inp_sH = input.strides[3];
-    int inp_sW = input.strides[4];
-    int grid_sN = grid.strides[0];
-    int grid_sD = grid.strides[1];
-    int grid_sH = grid.strides[2];
-    int grid_sW = grid.strides[3];
-    int grid_sCoor = grid.strides[4];
-    int out_sN = output.strides[0];
-    int out_sC = output.strides[1];
-    int out_sD = output.strides[2];
-    int out_sH = output.strides[3];
-    int out_sW = output.strides[4];
-
-    CUDA_KERNEL_LOOP(index, nthreads) {
-      const int w = index % out_W;
-      const int h = (index / out_W) % out_H;
-      const int d = (index / (out_H * out_W)) % out_D;
-      const int n = index / (out_D * out_H * out_W);
-      const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
-
-      // get the corresponding input x, y, z co-ordinates from grid
-      scalar_t ix = grid.data[grid_offset];
-      scalar_t iy = grid.data[grid_offset + grid_sCoor];
-      scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
-
-      ix = grid_sampler_compute_source_index(ix, inp_W, padding_mode, align_corners);
-      iy = grid_sampler_compute_source_index(iy, inp_H, padding_mode, align_corners);
-      iz = grid_sampler_compute_source_index(iz, inp_D, padding_mode, align_corners);
-
-      if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-        // get corner pixel values from (x, y, z)
-        // for 4d, we use north-east-south-west
-        // for 5d, we add top-bottom
-        int ix_tnw = static_cast<int>(::floor(ix));
-        int iy_tnw = static_cast<int>(::floor(iy));
-        int iz_tnw = static_cast<int>(::floor(iz));
-
-        int ix_tne = ix_tnw + 1;
-        int iy_tne = iy_tnw;
-        int iz_tne = iz_tnw;
-
-        int ix_tsw = ix_tnw;
-        int iy_tsw = iy_tnw + 1;
-        int iz_tsw = iz_tnw;
-
-        int ix_tse = ix_tnw + 1;
-        int iy_tse = iy_tnw + 1;
-        int iz_tse = iz_tnw;
-
-        int ix_bnw = ix_tnw;
-        int iy_bnw = iy_tnw;
-        int iz_bnw = iz_tnw + 1;
-
-        int ix_bne = ix_tnw + 1;
-        int iy_bne = iy_tnw;
-        int iz_bne = iz_tnw + 1;
-
-        int ix_bsw = ix_tnw;
-        int iy_bsw = iy_tnw + 1;
-        int iz_bsw = iz_tnw + 1;
-
-        int ix_bse = ix_tnw + 1;
-        int iy_bse = iy_tnw + 1;
-        int iz_bse = iz_tnw + 1;
-
-        // get surfaces to each neighbor:
-        scalar_t tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
-        scalar_t tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
-        scalar_t tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
-        scalar_t tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
-        scalar_t bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
-        scalar_t bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
-        scalar_t bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
-        scalar_t bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
-
-        auto inp_ptr_NC = input.data + n * inp_sN;
-        auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
-        for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
-          //   (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne
-          // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse
-          // + (c, iz_bnw, iy_bnw, ix_bnw) * bnw + (c, iz_bne, iy_bne, ix_bne) * bne
-          // + (c, iz_bsw, iy_bsw, ix_bsw) * bsw + (c, iz_bse, iy_bse, ix_bse) * bse
-          *out_ptr_NCDHW = static_cast<scalar_t>(0);
-          if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW] * tnw;
-          }
-          if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW] * tne;
-          }
-          if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW] * tsw;
-          }
-          if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW] * tse;
-          }
-          if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW] * bnw;
-          }
-          if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW] * bne;
-          }
-          if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW] * bsw;
-          }
-          if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW += inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW] * bse;
-          }
-        }
-      } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-        int ix_nearest = static_cast<int>(::round(ix));
-        int iy_nearest = static_cast<int>(::round(iy));
-        int iz_nearest = static_cast<int>(::round(iz));
-
-        // assign nearest neighbor pixel value to output pixel
-        auto inp_ptr_NC = input.data + n * inp_sN;
-        auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW;
-        for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) {
-          if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) {
-            *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW];
-          } else {
-            *out_ptr_NCDHW = static_cast<scalar_t>(0);
-          }
-        }
-      }
-    }
-  }
-
-  template <typename scalar_t>
-  C10_LAUNCH_BOUNDS_1(1024)
-  __global__ void grid_sampler_2d_backward_kernel_cuda(
-      const int nthreads,
-      TensorInfo<scalar_t, int> grad_output,
-      TensorInfo<scalar_t, int> input,
-      TensorInfo<scalar_t, int> grid,
-      TensorInfo<scalar_t, int> grad_input,  // initialized to zeros
-      TensorInfo<scalar_t, int> grad_grid,   // initialized to empty
-      const GridSamplerInterpolation interpolation_mode,
-      const GridSamplerPadding padding_mode,
-      bool align_corners) {
-
-    int C = input.sizes[1];
-    int inp_H = input.sizes[2];
-    int inp_W = input.sizes[3];
-    int out_H = grid.sizes[1];
-    int out_W = grid.sizes[2];
-    int inp_sN = input.strides[0];
-    int inp_sC = input.strides[1];
-    int inp_sH = input.strides[2];
-    int inp_sW = input.strides[3];
-    int grid_sN = grid.strides[0];
-    int grid_sH = grid.strides[1];
-    int grid_sW = grid.strides[2];
-    int grid_sCoor = grid.strides[3];
-    int gOut_sN = grad_output.strides[0];
-    int gOut_sC = grad_output.strides[1];
-    int gOut_sH = grad_output.strides[2];
-    int gOut_sW = grad_output.strides[3];
-    int gInp_sN = grad_input.strides[0];
-    int gInp_sC = grad_input.strides[1];
-    int gInp_sH = grad_input.strides[2];
-    int gInp_sW = grad_input.strides[3];
-    int gGrid_sW = grad_grid.strides[2];
-
-    CUDA_KERNEL_LOOP(index, nthreads) {
-      const int w = index % out_W;
-      const int h = (index / out_W) % out_H;
-      const int n = index / (out_H * out_W);
-      const int grid_offset = n * grid_sN + h * grid_sH + w * grid_sW;
-
-      // get the corresponding input x, y co-ordinates from grid
-      scalar_t ix = grid.data[grid_offset];
-      scalar_t iy = grid.data[grid_offset + grid_sCoor];
-
-      // multipliers for gradients on ix and iy
-      scalar_t gix_mult, giy_mult;
-      ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult);
-      iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult);
-
-      if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-        // get NE, NW, SE, SW pixel values from (x, y)
-        int ix_nw = static_cast<int>(::floor(ix));
-        int iy_nw = static_cast<int>(::floor(iy));
-        int ix_ne = ix_nw + 1;
-        int iy_ne = iy_nw;
-        int ix_sw = ix_nw;
-        int iy_sw = iy_nw + 1;
-        int ix_se = ix_nw + 1;
-        int iy_se = iy_nw + 1;
-
-        // get surfaces to each neighbor:
-        scalar_t nw = (ix_se - ix)    * (iy_se - iy);
-        scalar_t ne = (ix    - ix_sw) * (iy_sw - iy);
-        scalar_t sw = (ix_ne - ix)    * (iy    - iy_ne);
-        scalar_t se = (ix    - ix_nw) * (iy    - iy_nw);
-
-        scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0);
-        scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW;
-        scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN;
-        scalar_t *inp_ptr_NC = input.data + n * inp_sN;
-        for (int c = 0; c < C; ++c, inp_ptr_NC += inp_sC, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) {
-          scalar_t gOut = *gOut_ptr_NCHW;
-
-          // calculate and set grad_input
-          safe_add_2d(gInp_ptr_NC, iy_nw, ix_nw, gInp_sH, gInp_sW, inp_H, inp_W, nw * gOut);
-          safe_add_2d(gInp_ptr_NC, iy_ne, ix_ne, gInp_sH, gInp_sW, inp_H, inp_W, ne * gOut);
-          safe_add_2d(gInp_ptr_NC, iy_sw, ix_sw, gInp_sH, gInp_sW, inp_H, inp_W, sw * gOut);
-          safe_add_2d(gInp_ptr_NC, iy_se, ix_se, gInp_sH, gInp_sW, inp_H, inp_W, se * gOut);
-
-          // calculate grad_grid
-          if (within_bounds_2d(iy_nw, ix_nw, inp_H, inp_W)) {
-            scalar_t nw_val = inp_ptr_NC[iy_nw * inp_sH + ix_nw * inp_sW];
-            gix -= nw_val * (iy_se - iy) * gOut;
-            giy -= nw_val * (ix_se - ix) * gOut;
-          }
-          if (within_bounds_2d(iy_ne, ix_ne, inp_H, inp_W)) {
-            scalar_t ne_val = inp_ptr_NC[iy_ne * inp_sH + ix_ne * inp_sW];
-            gix += ne_val * (iy_sw - iy) * gOut;
-            giy -= ne_val * (ix - ix_sw) * gOut;
-          }
-          if (within_bounds_2d(iy_sw, ix_sw, inp_H, inp_W)) {
-            scalar_t sw_val = inp_ptr_NC[iy_sw * inp_sH + ix_sw * inp_sW];
-            gix -= sw_val * (iy - iy_ne) * gOut;
-            giy += sw_val * (ix_ne - ix) * gOut;
-          }
-          if (within_bounds_2d(iy_se, ix_se, inp_H, inp_W)) {
-            scalar_t se_val = inp_ptr_NC[iy_se * inp_sH + ix_se * inp_sW];
-            gix += se_val * (iy - iy_nw) * gOut;
-            giy += se_val * (ix - ix_nw) * gOut;
-          }
-        }
-
-        // assuming grad_grid is contiguous
-        // thus we can
-        //   1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
-        //   2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
-        scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
-        gGrid_ptr_NHW[0] = gix_mult * gix;
-        gGrid_ptr_NHW[1] = giy_mult * giy;
-      } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-        int ix_nearest = static_cast<int>(::round(ix));
-        int iy_nearest = static_cast<int>(::round(iy));
-
-        // assign nearest neighbor pixel value to output pixel
-        scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW;
-        scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN;
-        for (int c = 0; c < C; ++c, gInp_ptr_NC += gInp_sC, gOut_ptr_NCHW += gOut_sC) {
-          // calculate and set grad_input
-          safe_add_2d(gInp_ptr_NC, iy_nearest, ix_nearest, gInp_sH, gInp_sW, inp_H, inp_W, *gOut_ptr_NCHW);
-        }
-
-        // assuming grad_grid is contiguous
-        // thus we can
-        //   1. use index with gGrid_sW to directly compute gGrid_ptr_NHW
-        //   2. directly assign to gGrid_ptr_NHW[0], gGrid_ptr_NHW[1]
-        scalar_t *gGrid_ptr_NHW = grad_grid.data + index * gGrid_sW;
-        gGrid_ptr_NHW[0] = static_cast<scalar_t>(0);
-        gGrid_ptr_NHW[1] = static_cast<scalar_t>(0);
-      }
-    }
-  }
-
-  template <typename scalar_t>
-  C10_LAUNCH_BOUNDS_1(1024)
-  __global__ void grid_sampler_3d_backward_kernel_cuda(
-      const int nthreads,
-      TensorInfo<scalar_t, int> grad_output,
-      TensorInfo<scalar_t, int> input,
-      TensorInfo<scalar_t, int> grid,
-      TensorInfo<scalar_t, int> grad_input,  // initialized to zeros
-      TensorInfo<scalar_t, int> grad_grid,   // initialized to empty
-      const GridSamplerInterpolation interpolation_mode,
-      const GridSamplerPadding padding_mode,
-      bool align_corners) {
-
-    int C = input.sizes[1];
-    int inp_D = input.sizes[2];
-    int inp_H = input.sizes[3];
-    int inp_W = input.sizes[4];
-    int out_D = grid.sizes[1];
-    int out_H = grid.sizes[2];
-    int out_W = grid.sizes[3];
-    int inp_sN = input.strides[0];
-    int inp_sC = input.strides[1];
-    int inp_sD = input.strides[2];
-    int inp_sH = input.strides[3];
-    int inp_sW = input.strides[4];
-    int grid_sN = grid.strides[0];
-    int grid_sD = grid.strides[1];
-    int grid_sH = grid.strides[2];
-    int grid_sW = grid.strides[3];
-    int grid_sCoor = grid.strides[4];
-    int gOut_sN = grad_output.strides[0];
-    int gOut_sC = grad_output.strides[1];
-    int gOut_sD = grad_output.strides[2];
-    int gOut_sH = grad_output.strides[3];
-    int gOut_sW = grad_output.strides[4];
-    int gInp_sN = grad_input.strides[0];
-    int gInp_sC = grad_input.strides[1];
-    int gInp_sD = grad_input.strides[2];
-    int gInp_sH = grad_input.strides[3];
-    int gInp_sW = grad_input.strides[4];
-    int gGrid_sW = grad_grid.strides[3];
-
-    CUDA_KERNEL_LOOP(index, nthreads) {
-      const int w = index % out_W;
-      const int h = (index / out_W) % out_H;
-      const int d = (index / (out_H * out_W)) % out_D;
-      const int n = index / (out_D * out_H * out_W);
-      const int grid_offset = n * grid_sN + d * grid_sD + h * grid_sH + w * grid_sW;
-
-      // get the corresponding input x, y, z co-ordinates from grid
-      scalar_t ix = grid.data[grid_offset];
-      scalar_t iy = grid.data[grid_offset + grid_sCoor];
-      scalar_t iz = grid.data[grid_offset + 2 * grid_sCoor];
-
-      // multipliers for gradients on ix, iy, and iz
-      scalar_t gix_mult, giy_mult, giz_mult;
-      ix = grid_sampler_compute_source_index_set_grad(ix, inp_W, padding_mode, align_corners, &gix_mult);
-      iy = grid_sampler_compute_source_index_set_grad(iy, inp_H, padding_mode, align_corners, &giy_mult);
-      iz = grid_sampler_compute_source_index_set_grad(iz, inp_D, padding_mode, align_corners, &giz_mult);
-
-      if (interpolation_mode == GridSamplerInterpolation::Bilinear) {
-        // get corner pixel values from (x, y, z)
-        // for 4d, we use north-east-south-west
-        // for 5d, we add top-bottom
-        int ix_tnw = static_cast<int>(::floor(ix));
-        int iy_tnw = static_cast<int>(::floor(iy));
-        int iz_tnw = static_cast<int>(::floor(iz));
-
-        int ix_tne = ix_tnw + 1;
-        int iy_tne = iy_tnw;
-        int iz_tne = iz_tnw;
-
-        int ix_tsw = ix_tnw;
-        int iy_tsw = iy_tnw + 1;
-        int iz_tsw = iz_tnw;
-
-        int ix_tse = ix_tnw + 1;
-        int iy_tse = iy_tnw + 1;
-        int iz_tse = iz_tnw;
-
-        int ix_bnw = ix_tnw;
-        int iy_bnw = iy_tnw;
-        int iz_bnw = iz_tnw + 1;
-
-        int ix_bne = ix_tnw + 1;
-        int iy_bne = iy_tnw;
-        int iz_bne = iz_tnw + 1;
-
-        int ix_bsw = ix_tnw;
-        int iy_bsw = iy_tnw + 1;
-        int iz_bsw = iz_tnw + 1;
-
-        int ix_bse = ix_tnw + 1;
-        int iy_bse = iy_tnw + 1;
-        int iz_bse = iz_tnw + 1;
-
-        // get surfaces to each neighbor:
-        scalar_t tnw = (ix_bse - ix)    * (iy_bse - iy)    * (iz_bse - iz);
-        scalar_t tne = (ix    - ix_bsw) * (iy_bsw - iy)    * (iz_bsw - iz);
-        scalar_t tsw = (ix_bne - ix)    * (iy    - iy_bne) * (iz_bne - iz);
-        scalar_t tse = (ix    - ix_bnw) * (iy    - iy_bnw) * (iz_bnw - iz);
-        scalar_t bnw = (ix_tse - ix)    * (iy_tse - iy)    * (iz - iz_tse);
-        scalar_t bne = (ix    - ix_tsw) * (iy_tsw - iy)    * (iz - iz_tsw);
-        scalar_t bsw = (ix_tne - ix)    * (iy    - iy_tne) * (iz - iz_tne);
-        scalar_t bse = (ix    - ix_tnw) * (iy    - iy_tnw) * (iz - iz_tnw);
-
-        scalar_t gix = static_cast<scalar_t>(0), giy = static_cast<scalar_t>(0), giz = static_cast<scalar_t>(0);
-        scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
-        scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN;
-        scalar_t *inp_ptr_NC = input.data + n * inp_sN;
-        // calculate bilinear weighted pixel value and set output pixel
-        for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) {
-          scalar_t gOut = *gOut_ptr_NCDHW;
-
-          // calculate and set grad_input
-          safe_add_3d(gInp_ptr_NC, iz_tnw, iy_tnw, ix_tnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tnw * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_tne, iy_tne, ix_tne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tne * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_tsw, iy_tsw, ix_tsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tsw * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_tse, iy_tse, ix_tse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, tse * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_bnw, iy_bnw, ix_bnw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bnw * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_bne, iy_bne, ix_bne, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bne * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_bsw, iy_bsw, ix_bsw, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bsw * gOut);
-          safe_add_3d(gInp_ptr_NC, iz_bse, iy_bse, ix_bse, gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, bse * gOut);
-
-          // calculate grad_grid
-          if (within_bounds_3d(iz_tnw, iy_tnw, ix_tnw, inp_D, inp_H, inp_W)) {
-            scalar_t tnw_val = inp_ptr_NC[iz_tnw * inp_sD + iy_tnw * inp_sH + ix_tnw * inp_sW];
-            gix -= tnw_val * (iy_bse - iy)    * (iz_bse - iz)    * gOut;
-            giy -= tnw_val * (ix_bse - ix)    * (iz_bse - iz)    * gOut;
-            giz -= tnw_val * (ix_bse - ix)    * (iy_bse - iy)    * gOut;
-          }
-          if (within_bounds_3d(iz_tne, iy_tne, ix_tne, inp_D, inp_H, inp_W)) {
-            scalar_t tne_val = inp_ptr_NC[iz_tne * inp_sD + iy_tne * inp_sH + ix_tne * inp_sW];
-            gix += tne_val * (iy_bsw - iy)    * (iz_bsw - iz)    * gOut;
-            giy -= tne_val * (ix    - ix_bsw) * (iz_bsw - iz)    * gOut;
-            giz -= tne_val * (ix    - ix_bsw) * (iy_bsw - iy)    * gOut;
-          }
-          if (within_bounds_3d(iz_tsw, iy_tsw, ix_tsw, inp_D, inp_H, inp_W)) {
-            scalar_t tsw_val = inp_ptr_NC[iz_tsw * inp_sD + iy_tsw * inp_sH + ix_tsw * inp_sW];
-            gix -= tsw_val * (iy - iy_bne)    * (iz_bne - iz)    * gOut;
-            giy += tsw_val * (ix_bne - ix)    * (iz_bne - iz)    * gOut;
-            giz -= tsw_val * (ix_bne - ix)    * (iy    - iy_bne) * gOut;
-          }
-          if (within_bounds_3d(iz_tse, iy_tse, ix_tse, inp_D, inp_H, inp_W)) {
-            scalar_t tse_val = inp_ptr_NC[iz_tse * inp_sD + iy_tse * inp_sH + ix_tse * inp_sW];
-            gix += tse_val * (iy - iy_bnw)    * (iz_bnw - iz)    * gOut;
-            giy += tse_val * (ix    - ix_bnw) * (iz_bnw - iz)    * gOut;
-            giz -= tse_val * (ix    - ix_bnw) * (iy    - iy_bnw) * gOut;
-          }
-          if (within_bounds_3d(iz_bnw, iy_bnw, ix_bnw, inp_D, inp_H, inp_W)) {
-            scalar_t bnw_val = inp_ptr_NC[iz_bnw * inp_sD + iy_bnw * inp_sH + ix_bnw * inp_sW];
-            gix -= bnw_val * (iy_tse - iy)    * (iz - iz_tse)    * gOut;
-            giy -= bnw_val * (ix_tse - ix)    * (iz - iz_tse)    * gOut;
-            giz += bnw_val * (ix_tse - ix)    * (iy_tse - iy)    * gOut;
-          }
-          if (within_bounds_3d(iz_bne, iy_bne, ix_bne, inp_D, inp_H, inp_W)) {
-            scalar_t bne_val = inp_ptr_NC[iz_bne * inp_sD + iy_bne * inp_sH + ix_bne * inp_sW];
-            gix += bne_val * (iy_tsw - iy)    * (iz - iz_tsw)    * gOut;
-            giy -= bne_val * (ix    - ix_tsw) * (iz - iz_tsw)    * gOut;
-            giz += bne_val * (ix    - ix_tsw) * (iy_tsw - iy)    * gOut;
-          }
-          if (within_bounds_3d(iz_bsw, iy_bsw, ix_bsw, inp_D, inp_H, inp_W)) {
-            scalar_t bsw_val = inp_ptr_NC[iz_bsw * inp_sD + iy_bsw * inp_sH + ix_bsw * inp_sW];
-            gix -= bsw_val * (iy - iy_tne)    * (iz - iz_tne)    * gOut;
-            giy += bsw_val * (ix_tne - ix)    * (iz - iz_tne)    * gOut;
-            giz += bsw_val * (ix_tne - ix)    * (iy    - iy_tne) * gOut;
-          }
-          if (within_bounds_3d(iz_bse, iy_bse, ix_bse, inp_D, inp_H, inp_W)) {
-            scalar_t bse_val = inp_ptr_NC[iz_bse * inp_sD + iy_bse * inp_sH + ix_bse * inp_sW];
-            gix += bse_val * (iy - iy_tnw)    * (iz - iz_tnw)    * gOut;
-            giy += bse_val * (ix    - ix_tnw) * (iz - iz_tnw)    * gOut;
-            giz += bse_val * (ix    - ix_tnw) * (iy    - iy_tnw) * gOut;
-          }
-        }
-
-        // assuming grad_grid is contiguous
-        // thus we can
-        //   1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
-        //   2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
-        scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
-        gGrid_ptr_NDHW[0] = gix_mult * gix;
-        gGrid_ptr_NDHW[1] = giy_mult * giy;
-        gGrid_ptr_NDHW[2] = giz_mult * giz;
-      } else if (interpolation_mode == GridSamplerInterpolation::Nearest) {
-        int ix_nearest = static_cast<int>(::round(ix));
-        int iy_nearest = static_cast<int>(::round(iy));
-        int iz_nearest = static_cast<int>(::round(iz));
-
-        // assign nearest neighbor pixel value to output pixel
-        scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW;
-        scalar_t *gInp_ptr_NC = grad_input.data + n * gInp_sN;
-        for (int c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) {
-          // calculate and set grad_input
-          safe_add_3d(gInp_ptr_NC, iz_nearest, iy_nearest, ix_nearest,
-                      gInp_sD, gInp_sH, gInp_sW, inp_D, inp_H, inp_W, *gOut_ptr_NCDHW);
-        }
-
-        // assuming grad_grid is contiguous
-        // thus we can
-        //   1. use index with gGrid_sW to directly compute gGrid_ptr_NDHW
-        //   2. directly assign to gGrid_ptr_NDHW[0], gGrid_ptr_NDHW[1], gGrid_ptr_NDHW[2]
-        scalar_t *gGrid_ptr_NDHW = grad_grid.data + index * gGrid_sW;
-        gGrid_ptr_NDHW[0] = static_cast<scalar_t>(0);
-        gGrid_ptr_NDHW[1] = static_cast<scalar_t>(0);
-        gGrid_ptr_NDHW[2] = static_cast<scalar_t>(0);
-      }
-    }
-  }
-}  // namespace
-
-using namespace at;
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_2d_forward_cuda(const Tensor& input, const Tensor& grid,
-                            int64_t interpolation_mode, int64_t padding_mode,
-                            bool align_corners) {
-  auto N = input.size(0);
-  auto H = grid.size(1);
-  auto W = grid.size(2);
-  auto output = at::empty({N, input.size(1), H, W}, input.options());
-  int count = static_cast<int>(N * H * W);
-  if (count > 0) {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_forward_cuda", [&] {
-      grid_sampler_2d_forward_kernel_cuda<scalar_t>
-        <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
-          count,
-          getTensorInfo<scalar_t, int>(input),
-          getTensorInfo<scalar_t, int>(grid),
-          getTensorInfo<scalar_t, int>(output),
-          static_cast<GridSamplerInterpolation>(interpolation_mode),
-          static_cast<GridSamplerPadding>(padding_mode),
-          align_corners);
-    });
-  }
-  return output;
-}
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_3d_forward_cuda(const Tensor& input, const Tensor& grid,
-                            int64_t interpolation_mode, int64_t padding_mode,
-                            bool align_corners) {
-  auto N = input.size(0);
-  auto D = grid.size(1);
-  auto H = grid.size(2);
-  auto W = grid.size(3);
-  auto output = at::empty({N, input.size(1), D, H, W}, input.options());
-  int count = static_cast<int>(N * D * H * W);
-  if (count > 0) {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_forward_cuda", [&] {
-      grid_sampler_3d_forward_kernel_cuda<scalar_t>
-        <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
-          count,
-          getTensorInfo<scalar_t, int>(input),
-          getTensorInfo<scalar_t, int>(grid),
-          getTensorInfo<scalar_t, int>(output),
-          static_cast<GridSamplerInterpolation>(interpolation_mode),
-          static_cast<GridSamplerPadding>(padding_mode),
-          align_corners);
-    });
-  }
-  return output;
-}
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode,
-                              int64_t padding_mode, bool align_corners) {
-  auto N = input.size(0);
-  auto H = grid.size(1);
-  auto W = grid.size(2);
-  auto grad_input = at::zeros_like(input);
-  auto grad_grid = at::empty_like(grid);
-  int count = static_cast<int>(N * H * W);
-  if (count > 0) {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_2d_backward_cuda", [&] {
-      grid_sampler_2d_backward_kernel_cuda<scalar_t>
-        <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
-          count,
-          getTensorInfo<scalar_t, int>(grad_output),
-          getTensorInfo<scalar_t, int>(input),
-          getTensorInfo<scalar_t, int>(grid),
-          getTensorInfo<scalar_t, int>(grad_input),
-          getTensorInfo<scalar_t, int>(grad_grid),
-          static_cast<GridSamplerInterpolation>(interpolation_mode),
-          static_cast<GridSamplerPadding>(padding_mode),
-          align_corners);
-    });
-  }
-  return std::make_tuple(grad_input, grad_grid);
-}
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
-                              bool align_corners) {
-  auto N = input.size(0);
-  auto D = grid.size(1);
-  auto H = grid.size(2);
-  auto W = grid.size(3);
-  auto grad_input = at::zeros_like(input);
-  auto grad_grid = at::empty_like(grid);
-  int count = static_cast<int>(N * D * H * W);
-  if (count > 0) {
-    AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "grid_sampler_3d_backward_cuda", [&] {
-      grid_sampler_3d_backward_kernel_cuda<scalar_t>
-        <<<GET_BLOCKS(count), CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream()>>>(
-          count,
-          getTensorInfo<scalar_t, int>(grad_output),
-          getTensorInfo<scalar_t, int>(input),
-          getTensorInfo<scalar_t, int>(grid),
-          getTensorInfo<scalar_t, int>(grad_input),
-          getTensorInfo<scalar_t, int>(grad_grid),
-          static_cast<GridSamplerInterpolation>(interpolation_mode),
-          static_cast<GridSamplerPadding>(padding_mode),
-          align_corners);
-    });
-  }
-  return std::make_tuple(grad_input, grad_grid);
-}
-
-}  // namespace mmdetection
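
The bilinear branch of the 2D backward kernel deleted above packs two gradients into one pass: grad_input via atomic scatter (safe_add_2d) and grad_grid via the analytic derivative of the four corner weights. Below is a minimal single-point CPU sketch of that math, assuming zeros padding and a contiguous C x H x W input; all names are hypothetical, and the per-channel gOut is collapsed to one scalar for brevity.

```cpp
#include <cmath>
#include <vector>

struct PointGrad { float gix = 0.f, giy = 0.f; };

// Hypothetical reference: backward of one bilinear sample at (ix, iy).
PointGrad bilinear_backward_point(const std::vector<float>& input,
                                  std::vector<float>& grad_input,
                                  int C, int H, int W,
                                  float ix, float iy, float gOut) {
  const int x0 = static_cast<int>(std::floor(ix));
  const int y0 = static_cast<int>(std::floor(iy));
  const int xs[4] = {x0, x0 + 1, x0,     x0 + 1};  // nw, ne, sw, se
  const int ys[4] = {y0, y0,     y0 + 1, y0 + 1};
  // corner weights, identical to the nw/ne/sw/se surfaces in the kernel
  const float w[4]   = {(x0 + 1 - ix) * (y0 + 1 - iy), (ix - x0) * (y0 + 1 - iy),
                        (x0 + 1 - ix) * (iy - y0),     (ix - x0) * (iy - y0)};
  // d(weight)/d(ix) and d(weight)/d(iy) for each corner
  const float dwx[4] = {-(y0 + 1 - iy),  (y0 + 1 - iy), -(iy - y0),     (iy - y0)};
  const float dwy[4] = {-(x0 + 1 - ix), -(ix - x0),      (x0 + 1 - ix), (ix - x0)};
  PointGrad g;
  for (int c = 0; c < C; ++c) {
    for (int k = 0; k < 4; ++k) {
      if (xs[k] < 0 || xs[k] >= W || ys[k] < 0 || ys[k] >= H) continue;  // zeros padding
      const int off = (c * H + ys[k]) * W + xs[k];
      grad_input[off] += w[k] * gOut;       // what safe_add_2d does atomically
      g.gix += input[off] * dwx[k] * gOut;  // same sign pattern as the kernel's gix
      g.giy += input[off] * dwy[k] * gOut;
    }
  }
  return g;
}
```

The returned gix/giy still need the gix_mult/giy_mult factors produced by grid_sampler_compute_source_index_set_grad before being written to grad_grid, exactly as in the kernel's final two stores.
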
diff --git a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh b/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh
deleted file mode 100644
index a84fa7c076ecd8302aacddf6c350196cc5ce964e..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/src/cuda/grid_sampler_cuda.cuh
+++ /dev/null
@@ -1,233 +0,0 @@
-// Modified from https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cuda/GridSampler.cuh
-
-#include <ATen/ATen.h>
-#include <ATen/NativeFunctions.h>
-#include <ATen/cuda/CUDAApplyUtils.cuh>
-#include <THC/THCAtomics.cuh>
-
-namespace mmdetection {
-
-namespace detail {
-
-  enum class GridSamplerInterpolation {Bilinear, Nearest};
-  enum class GridSamplerPadding {Zeros, Border, Reflection};
-
-}  // namespace detail
-
-using detail::GridSamplerInterpolation;
-using detail::GridSamplerPadding;
-
-// Unnormalizes a coordinate from the -1 to +1 scale to its pixel index value,
-// where we view each pixel as an area between (idx - 0.5) and (idx + 0.5).
-// if align_corners: -1 and +1 get sent to the centers of the corner pixels
-//     -1 --> 0
-//     +1 --> (size - 1)
-//     scale_factor = (size - 1) / 2
-// if not align_corners: -1 and +1 get sent to the image edges
-//     -1 --> -0.5
-//     +1 --> (size - 1) + 0.5 == size - 0.5
-//     scale_factor = size / 2
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t grid_sampler_unnormalize(scalar_t coord, int size, bool align_corners) {
-  if (align_corners) {
-    // unnormalize coord from [-1, 1] to [0, size - 1]
-    return ((coord + 1.f) / 2) * (size - 1);
-  } else {
-    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
-    return ((coord + 1.f) * size - 1) / 2;
-  }
-}
-
-// grid_sampler_unnormalize_set_grad works the same as grid_sampler_unnormalize
-// except that it also returns the `d output / d input` via pointer argument
-// `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t grid_sampler_unnormalize_set_grad(scalar_t coord, int size,
-                                           bool align_corners, scalar_t *grad_in) {
-  if (align_corners) {
-    // unnormalize coord from [-1, 1] to [0, size - 1]
-    *grad_in = static_cast<scalar_t>(size - 1) / 2;
-    return ((coord + 1.f) / 2) * (size - 1);
-  } else {
-    // unnormalize coord from [-1, 1] to [-0.5, size - 0.5]
-    *grad_in = static_cast<scalar_t>(size) / 2;
-    return ((coord + 1.f) * size - 1) / 2;
-  }
-}
-
-// Clips coordinates to between 0 and clip_limit - 1
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t clip_coordinates(scalar_t in, int clip_limit) {
-  return ::min(static_cast<scalar_t>(clip_limit - 1), ::max(in, static_cast<scalar_t>(0)));
-}
-
-// clip_coordinates_set_grad works similarly to clip_coordinates except that
-// it also returns the `d output / d input` via pointer argument `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t clip_coordinates_set_grad(scalar_t in, int clip_limit, scalar_t *grad_in) {
-  if (in < static_cast<scalar_t>(0)) {
-    *grad_in = static_cast<scalar_t>(0);
-    return static_cast<scalar_t>(0);
-  } else {
-    scalar_t max = static_cast<scalar_t>(clip_limit - 1);
-    if (in > max) {
-      *grad_in = static_cast<scalar_t>(0);
-      return max;
-    } else {
-      *grad_in = static_cast<scalar_t>(1);
-      return in;
-    }
-  }
-}
-
-// Reflects coordinates until they fall between low and high (inclusive).
-// The bounds are passed as twice their value so that half-integer values
-// can be represented as ints.
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t reflect_coordinates(scalar_t in, int twice_low, int twice_high) {
-  if (twice_low == twice_high) {
-    return static_cast<scalar_t>(0);
-  }
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = ::fabs(in - min);
-  // `fmod` returns the same sign as `in`, which is positive after the `fabs` above.
-  scalar_t extra = ::fmod(in, span);
-  int flips = static_cast<int>(::floor(in / span));
-  if (flips % 2 == 0) {
-    return extra + min;
-  } else {
-    return span - extra + min;
-  }
-}
-
-// reflect_coordinates_set_grad works similarly to reflect_coordinates except
-// that it also returns the `d output / d input` via pointer argument
-// `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t reflect_coordinates_set_grad(scalar_t in, int twice_low, int twice_high,
-                                      scalar_t *grad_in) {
-  if (twice_low == twice_high) {
-    *grad_in = static_cast<scalar_t>(0);
-    return static_cast<scalar_t>(0);
-  }
-  int grad_in_mult_;
-  scalar_t min = static_cast<scalar_t>(twice_low) / 2;
-  scalar_t span = static_cast<scalar_t>(twice_high - twice_low) / 2;
-  in = in - min;
-  if (in < static_cast<scalar_t>(0)) {
-    grad_in_mult_ = -1;
-    in = -in;
-  } else {
-    grad_in_mult_ = 1;
-  }
-  // `fmod` returns the same sign as `in`, which is positive after the `if` above.
-  scalar_t extra = ::fmod(in, span);
-  int flips = static_cast<int>(::floor(in / span));
-  if (flips % 2 == 0) {
-    *grad_in = static_cast<scalar_t>(grad_in_mult_);
-    return extra + min;
-  } else {
-    *grad_in = static_cast<scalar_t>(-grad_in_mult_);
-    return span - extra + min;
-  }
-}
-
-// Computes the pixel source index value for a grid coordinate
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t grid_sampler_compute_source_index(
-    scalar_t coord,
-    int size,
-    GridSamplerPadding padding_mode,
-    bool align_corners) {
-  coord = grid_sampler_unnormalize(coord, size, align_corners);
-  if (padding_mode == GridSamplerPadding::Border) {
-    // clip coordinates to image borders
-    coord = clip_coordinates(coord, size);
-  } else if (padding_mode == GridSamplerPadding::Reflection) {
-    // reflect coordinates by image borders
-    if (align_corners) {
-      coord = reflect_coordinates(coord, 0, 2*(size - 1));
-    } else {
-      coord = reflect_coordinates(coord, -1, 2*size - 1);
-      // when align_corners=False, reflection does not auto clip coords
-      coord = clip_coordinates(coord, size);
-    }
-  }
-  return coord;
-}
-
-// grid_sampler_compute_source_index_set_grad works similarly to
-// grid_sampler_compute_source_index except that it also returns the
-// `d output / d input` via pointer argument `grad_in`.
-// This is useful in the backward pass of grid_sampler.
-template <typename scalar_t>
-static __forceinline__ __device__
-scalar_t grid_sampler_compute_source_index_set_grad(
-    scalar_t coord,
-    int size,
-    GridSamplerPadding padding_mode,
-    bool align_corners,
-    scalar_t *grad_in) {
-  scalar_t grad_clip, grad_refl;
-  coord = grid_sampler_unnormalize_set_grad(coord, size, align_corners, grad_in);
-  if (padding_mode == GridSamplerPadding::Border) {
-    // clip coordinates to image borders
-    coord = clip_coordinates_set_grad(coord, size, &grad_clip);
-    *grad_in = (*grad_in) * grad_clip;
-  } else if (padding_mode == GridSamplerPadding::Reflection) {
-    // reflect coordinates by image borders
-    if (align_corners) {
-      coord = reflect_coordinates_set_grad(coord, 0, 2*(size - 1), &grad_refl);
-      *grad_in = (*grad_in) * grad_refl;
-    } else {
-      coord = reflect_coordinates_set_grad(coord, -1, 2*size - 1, &grad_refl);
-      // when align_corners=False, reflection does not auto clip coords
-      coord = clip_coordinates_set_grad(coord, size, &grad_clip);
-      *grad_in = (*grad_in) * grad_refl * grad_clip;
-    }
-  }
-  return coord;
-}
-
-static __forceinline__ __device__
-bool within_bounds_2d(int h, int w, int H, int W) {
-  return h >= 0 && h < H && w >= 0 && w < W;
-}
-
-static __forceinline__ __device__
-bool within_bounds_3d(int d, int h, int w, int D, int H, int W) {
-  return d >= 0 && d < D && h >= 0 && h < H && w >= 0 && w < W;
-}
-
-template<typename scalar_t>
-static __forceinline__ __device__
-void safe_add_2d(scalar_t *data, int h, int w,
-                 int sH, int sW, int H, int W,
-                 scalar_t delta) {
-  if (within_bounds_2d(h, w, H, W)) {
-    atomicAdd(data + h * sH + w * sW, delta);
-  }
-}
-
-template<typename scalar_t>
-static __forceinline__ __device__
-void safe_add_3d(scalar_t *data, int d, int h, int w,
-                 int sD, int sH, int sW, int D, int H, int W,
-                 scalar_t delta) {
-  if (within_bounds_3d(d, h, w, D, H, W)) {
-    atomicAdd(data + d * sD + h * sH + w * sW, delta);
-  }
-}
-
-}  // namespace mmdetection
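
Of the coordinate helpers deleted above, reflect_coordinates is the least obvious: bounds arrive pre-doubled (twice_low, twice_high) so the half-integer edges of the align_corners=false case stay representable as ints. A host-side copy with two worked values (hypothetical driver, size-4 axis):

```cpp
#include <cmath>
#include <cstdio>

// Host copy of reflect_coordinates above; bounds are passed as 2*low, 2*high.
float reflect(float in, int twice_low, int twice_high) {
  if (twice_low == twice_high) return 0.f;
  const float lo   = twice_low / 2.f;
  const float span = (twice_high - twice_low) / 2.f;
  in = std::fabs(in - lo);
  const float extra = std::fmod(in, span);
  const int   flips = static_cast<int>(std::floor(in / span));
  return (flips % 2 == 0) ? extra + lo : span - extra + lo;
}

int main() {
  // align_corners=false, size 4: valid range [-0.5, 3.5] is passed as (-1, 7)
  std::printf("%.2f\n", reflect(4.25f, -1, 7));  // 2.75 -- bounced off the 3.5 edge
  std::printf("%.2f\n", reflect(-1.0f, -1, 7));  // 0.00 -- bounced off the -0.5 edge
  return 0;
}
```
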
diff --git a/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp b/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp
deleted file mode 100644
index 7e76a7aab80b738efd5a33317c2b5bb0e3ea5d00..0000000000000000000000000000000000000000
--- a/mmdet/ops/grid_sampler/src/grid_sampler_ext.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <torch/extension.h>
-#include <ATen/DeviceGuard.h>
-
-namespace mmdetection {
-
-using namespace at;
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_2d_forward_cpu(const Tensor& input, const Tensor& grid,
-                                    int64_t interpolation_mode, int64_t padding_mode,
-                                    bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_3d_forward_cpu(const Tensor& input, const Tensor& grid,
-                                    int64_t interpolation_mode, int64_t padding_mode,
-                                    bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_2d_backward_cpu(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode,
-                              int64_t padding_mode, bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_3d_backward_cpu(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
-                              bool align_corners);
-
-#ifdef WITH_CUDA
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_2d_forward_cuda(const Tensor& input, const Tensor& grid,
-                            int64_t interpolation_mode, int64_t padding_mode,
-                            bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_3d_forward_cuda(const Tensor& input, const Tensor& grid,
-                            int64_t interpolation_mode, int64_t padding_mode,
-                            bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_2d_backward_cuda(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode,
-                              int64_t padding_mode, bool align_corners);
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-std::tuple<Tensor, Tensor>
-grid_sampler_3d_backward_cuda(const Tensor& grad_output, const Tensor& input,
-                              const Tensor& grid, int64_t interpolation_mode, int64_t padding_mode,
-                              bool align_corners);
-#endif
-
-// No shape checking needed here. See # NOTE [ grid_sampler Native Functions ].
-Tensor grid_sampler_forward(const Tensor& input, const Tensor& grid,
-                               int64_t interpolation_mode, int64_t padding_mode,
-                               bool align_corners) {
-    if (input.dim() == 4) {
-        if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
-            return grid_sampler_2d_forward_cuda(input, grid, interpolation_mode,
-                padding_mode, align_corners);
-#else
-            AT_ERROR("grid_sampler is not compiled with GPU support");
-#endif
-        }
-        return grid_sampler_2d_forward_cpu(input, grid, interpolation_mode,
-                                           padding_mode, align_corners);
-    } else {
-        if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
-            return grid_sampler_3d_forward_cuda(input, grid, interpolation_mode,
-                padding_mode, align_corners);
-#else
-            AT_ERROR("grid_sampler is not compiled with GPU support");
-#endif
-        }
-        return grid_sampler_3d_forward_cpu(input, grid, interpolation_mode,
-                                           padding_mode, align_corners);
-    }
-}
-
-std::tuple<Tensor, Tensor>
-grid_sampler_backward(const Tensor& grad_output, const Tensor& input,
-                         const Tensor& grid, int64_t interpolation_mode,
-                         int64_t padding_mode, bool align_corners) {
-    if (input.dim() == 4) {
-        if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
-            return grid_sampler_2d_backward_cuda(grad_output, input, grid,
-                interpolation_mode,  padding_mode, align_corners);
-#else
-            AT_ERROR("grid_sampler is not compiled with GPU support");
-#endif
-        }
-        return grid_sampler_2d_backward_cpu(grad_output, input, grid,
-                                            interpolation_mode,  padding_mode, align_corners);
-    } else {
-        if (input.type().is_cuda()) {
-#ifdef WITH_CUDA
-            return grid_sampler_3d_backward_cuda(grad_output, input, grid,
-                interpolation_mode,  padding_mode, align_corners);
-#else
-            AT_ERROR("grid_sampler is not compiled with GPU support");
-#endif
-        }
-        return grid_sampler_3d_backward_cpu(grad_output, input, grid,
-                                            interpolation_mode,  padding_mode, align_corners);
-    }
-}
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("grid_sampler_forward_cuda", &grid_sampler_forward, "grid_sampler_forward");
-  m.def("grid_sampler_backward_cuda", &grid_sampler_backward, "grid_sampler_backward");
-}
-
-}  // namespace mmdetection
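
The deleted extension above also illustrates the wrapper idiom the remaining ops in this diff keep using: a device check routes to the CUDA or CPU kernel, with the CUDA branch compiled out unless WITH_CUDA is defined. A condensed sketch with a hypothetical op name, already using the device()-based check this diff migrates to:

```cpp
#include <torch/extension.h>

// Hypothetical op: CPU path defined inline; a real CUDA path would live in a .cu file.
at::Tensor my_op_cpu(const at::Tensor& x) { return x * 2; }
#ifdef WITH_CUDA
at::Tensor my_op_cuda(const at::Tensor& x);  // forward-declared, defined elsewhere
#endif

at::Tensor my_op(const at::Tensor& x) {
  if (x.device().is_cuda()) {  // replaces the deprecated x.type().is_cuda()
#ifdef WITH_CUDA
    return my_op_cuda(x);
#else
    AT_ERROR("my_op is not compiled with GPU support");
#endif
  }
  return my_op_cpu(x);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("my_op", &my_op, "my_op");
}
```
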
diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp
index b2850d916a4862e7c231c3075466796bacb1c952..84bd7c279132c1343e4d80dda50523861bea542c 100644
--- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp
+++ b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_cuda.cpp
@@ -17,9 +17,9 @@ int MaskedCol2imForwardLaucher(const at::Tensor col, const int height,
                                const at::Tensor mask_w_idx, const int mask_cnt,
                                at::Tensor im);
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
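
AT_CHECK was deprecated and later removed from the PyTorch C++ API; TORCH_CHECK is the drop-in replacement and, like its predecessor, concatenates any trailing arguments into the error message. A minimal sketch of the migrated macros applied to a hypothetical input:

```cpp
#include <torch/extension.h>

// TORCH_CHECK throws a c10::Error carrying the concatenated message on failure.
void check_input(const at::Tensor& x) {
  TORCH_CHECK(x.device().is_cuda(), "x must be a CUDA tensor");
  TORCH_CHECK(x.is_contiguous(), "x must be contiguous");
  TORCH_CHECK(x.dim() == 4, "expected 4-D NCHW input, got ", x.dim(), "-D");
}
```
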
diff --git a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu
index 81c785bbe41461fa8a4d380dbbef60dbe677cf6a..b8323592f528a714d88417f606abaa564c6c744d 100644
--- a/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu
+++ b/mmdet/ops/masked_conv/src/cuda/masked_conv2d_kernel.cu
@@ -59,10 +59,10 @@ int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       bottom_data.scalar_type(), "MaskedIm2colLaucherForward", ([&] {
-        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
-        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
-        scalar_t *top_data_ = top_data.data<scalar_t>();
+        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
+        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
         MaskedIm2colForward<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()
 >>>(
@@ -99,10 +99,10 @@ int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       bottom_data.scalar_type(), "MaskedCol2imLaucherForward", ([&] {
-        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
-        const int64_t *mask_h_idx_ = mask_h_idx.data<int64_t>();
-        const int64_t *mask_w_idx_ = mask_w_idx.data<int64_t>();
-        scalar_t *top_data_ = top_data.data<scalar_t>();
+        const scalar_t *bottom_data_ = bottom_data.data_ptr<scalar_t>();
+        const int64_t *mask_h_idx_ = mask_h_idx.data_ptr<int64_t>();
+        const int64_t *mask_w_idx_ = mask_w_idx.data_ptr<int64_t>();
+        scalar_t *top_data_ = top_data.data_ptr<scalar_t>();
 
         MaskedCol2imForward<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>(
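
The accessor change in this file is mechanical: Tensor::data<T>() is deprecated in favor of Tensor::data_ptr<T>(), which has identical semantics — a dtype-checked raw pointer to the first element. A short sketch of the before/after on a hypothetical float tensor:

```cpp
#include <torch/extension.h>

void fill_ones(at::Tensor t) {
  TORCH_CHECK(t.is_contiguous() && t.scalar_type() == at::kFloat);
  // deprecated spelling: float *p = t.data<float>();
  float *p = t.data_ptr<float>();  // same pointer, non-deprecated accessor
  for (int64_t i = 0; i < t.numel(); ++i) p[i] = 1.f;
}
```
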
diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp b/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp
index 5bf60be580edc682e2f451ce92a17b888a7fa10e..39058ad77552966092dfbf729cc8c8fb14c98a06 100644
--- a/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp
+++ b/mmdet/ops/masked_conv/src/masked_conv2d_ext.cpp
@@ -19,7 +19,7 @@ int masked_im2col_forward(const at::Tensor im, const at::Tensor mask_h_idx,
                                const at::Tensor mask_w_idx, const int kernel_h,
                                const int kernel_w, const int pad_h,
                                const int pad_w, at::Tensor col) {
-  if (im.type().is_cuda()) {
+  if (im.device().is_cuda()) {
 #ifdef WITH_CUDA
     return masked_im2col_forward_cuda(im, mask_h_idx, mask_w_idx, kernel_h,
       kernel_w, pad_h, pad_w, col);
@@ -34,7 +34,7 @@ int masked_col2im_forward(const at::Tensor col,
                                const at::Tensor mask_h_idx,
                                const at::Tensor mask_w_idx, int height,
                                int width, int channels, at::Tensor im) {
-  if (col.type().is_cuda()) {
+  if (col.device().is_cuda()) {
 #ifdef WITH_CUDA
     return masked_col2im_forward_cuda(col, mask_h_idx, mask_w_idx, height,
       width, channels, im);
diff --git a/mmdet/ops/nms/src/cpu/nms_cpu.cpp b/mmdet/ops/nms/src/cpu/nms_cpu.cpp
index 4d11abec7e69bf46711115a62daebebb95c54e9a..aa652ea396c9533ec8b2bcd3b076b9496041c4d1 100644
--- a/mmdet/ops/nms/src/cpu/nms_cpu.cpp
+++ b/mmdet/ops/nms/src/cpu/nms_cpu.cpp
@@ -6,7 +6,7 @@
 
 template <typename scalar_t>
 at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
-  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
+  AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
 
   if (dets.numel() == 0) {
     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
@@ -26,13 +26,13 @@ at::Tensor nms_cpu_kernel(const at::Tensor& dets, const float threshold) {
   at::Tensor suppressed_t =
       at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU));
 
-  auto suppressed = suppressed_t.data<uint8_t>();
-  auto order = order_t.data<int64_t>();
-  auto x1 = x1_t.data<scalar_t>();
-  auto y1 = y1_t.data<scalar_t>();
-  auto x2 = x2_t.data<scalar_t>();
-  auto y2 = y2_t.data<scalar_t>();
-  auto areas = areas_t.data<scalar_t>();
+  auto suppressed = suppressed_t.data_ptr<uint8_t>();
+  auto order = order_t.data_ptr<int64_t>();
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto areas = areas_t.data_ptr<scalar_t>();
 
   for (int64_t _i = 0; _i < ndets; _i++) {
     auto i = order[_i];
@@ -73,7 +73,7 @@ template <typename scalar_t>
 at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold,
                                const unsigned char method, const float sigma,
                                const float min_score) {
-  AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor");
+  AT_ASSERTM(!dets.device().is_cuda(), "dets must be a CPU tensor");
 
   if (dets.numel() == 0) {
     return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
@@ -88,16 +88,16 @@ at::Tensor soft_nms_cpu_kernel(const at::Tensor& dets, const float threshold,
   at::Tensor areas_t = (x2_t - x1_t) * (y2_t - y1_t);
 
   auto ndets = dets.size(0);
-  auto x1 = x1_t.data<scalar_t>();
-  auto y1 = y1_t.data<scalar_t>();
-  auto x2 = x2_t.data<scalar_t>();
-  auto y2 = y2_t.data<scalar_t>();
-  auto scores = scores_t.data<scalar_t>();
-  auto areas = areas_t.data<scalar_t>();
+  auto x1 = x1_t.data_ptr<scalar_t>();
+  auto y1 = y1_t.data_ptr<scalar_t>();
+  auto x2 = x2_t.data_ptr<scalar_t>();
+  auto y2 = y2_t.data_ptr<scalar_t>();
+  auto scores = scores_t.data_ptr<scalar_t>();
+  auto areas = areas_t.data_ptr<scalar_t>();
 
   int64_t pos = 0;
   at::Tensor inds_t = at::arange(ndets, dets.options());
-  auto inds = inds_t.data<scalar_t>();
+  auto inds = inds_t.data_ptr<scalar_t>();
 
   for (int64_t i = 0; i < ndets; i++) {
     auto max_score = scores[i];
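
The hunks above only touch accessors, but for orientation: the method/sigma/min_score parameters of soft_nms_cpu_kernel correspond to the standard Soft-NMS decay (Bodla et al.). The kernel body is elided from this diff, so the re-weighting rule below is an assumption about what it implements, not a quote of it:

```cpp
#include <cmath>

// Assumed Soft-NMS decay: `method` 1 = linear, 2 = gaussian, otherwise hard NMS.
float decay_weight(float iou, float threshold, unsigned char method, float sigma) {
  if (method == 1) return iou > threshold ? 1.f - iou : 1.f;  // linear
  if (method == 2) return std::exp(-(iou * iou) / sigma);     // gaussian
  return iou > threshold ? 0.f : 1.f;                         // hard suppression
}
// Each surviving box j gets scores[j] *= decay_weight(...); boxes whose score
// falls below min_score are pruned instead of being hard-suppressed.
```
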
diff --git a/mmdet/ops/nms/src/cuda/nms_cuda.cpp b/mmdet/ops/nms/src/cuda/nms_cuda.cpp
index 61ca93a273c4075ca1ea20adfb549c7cb5f8e1a6..d46b81669041eb998660bba5f48d0775de586c7c 100644
--- a/mmdet/ops/nms/src/cuda/nms_cuda.cpp
+++ b/mmdet/ops/nms/src/cuda/nms_cuda.cpp
@@ -1,7 +1,7 @@
 // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
 #include <torch/extension.h>
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
 
 at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh);
 
diff --git a/mmdet/ops/nms/src/cuda/nms_kernel.cu b/mmdet/ops/nms/src/cuda/nms_kernel.cu
index 4a0800f52076ebced136bf99ae3eaa0a6dd8b944..bb6d18abcfa597a4d159580b59c30e82718924d3 100644
--- a/mmdet/ops/nms/src/cuda/nms_kernel.cu
+++ b/mmdet/ops/nms/src/cuda/nms_kernel.cu
@@ -74,7 +74,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) {
   at::DeviceGuard guard(boxes.device());
 
   using scalar_t = float;
-  AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
+  AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
   auto scores = boxes.select(1, 4);
   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
   auto boxes_sorted = boxes.index_select(0, order_t);
@@ -83,7 +83,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) {
 
   const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
 
-  scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
+  scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();
 
   THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
 
@@ -114,7 +114,7 @@ at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) {
   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
 
   at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
-  int64_t* keep_out = keep.data<int64_t>();
+  int64_t* keep_out = keep.data_ptr<int64_t>();
 
   int num_to_keep = 0;
   for (int i = 0; i < boxes_num; i++) {
diff --git a/mmdet/ops/nms/src/nms_ext.cpp b/mmdet/ops/nms/src/nms_ext.cpp
index 6d95303a315043defb6e48e145caa5b09a241c0d..6c311f2652d6cc097bf5f135c231936929c3d713 100644
--- a/mmdet/ops/nms/src/nms_ext.cpp
+++ b/mmdet/ops/nms/src/nms_ext.cpp
@@ -13,7 +13,7 @@ at::Tensor nms_cuda(const at::Tensor& dets, const float threshold);
 #endif
 
 at::Tensor nms(const at::Tensor& dets, const float threshold){
-  if (dets.type().is_cuda()) {
+  if (dets.device().is_cuda()) {
 #ifdef WITH_CUDA
     return nms_cuda(dets, threshold);
 #else
@@ -26,7 +26,7 @@ at::Tensor nms(const at::Tensor& dets, const float threshold){
 at::Tensor soft_nms(const at::Tensor& dets, const float threshold,
                         const unsigned char method, const float sigma, const
                         float min_score) {
-  if (dets.type().is_cuda()) {
+  if (dets.device().is_cuda()) {
     AT_ERROR("soft_nms is not implemented on GPU");
   }
   return soft_nms_cpu(dets, threshold, method, sigma, min_score);
diff --git a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp b/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp
index 2c6b557da24eb19837c8ae8299f1da29dd0e8b80..9e01fe17da0b0693ad874a5e1ad5dbb397817dc5 100644
--- a/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp
+++ b/mmdet/ops/roi_align/src/cpu/roi_align_v2.cpp
@@ -357,11 +357,11 @@ at::Tensor ROIAlignForwardV2CPULaucher(const at::Tensor& input,
 
   if (output.numel() == 0) return output;
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "ROIAlign_forward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "ROIAlign_forward", [&] {
     ROIAlignForward<scalar_t>(
-        output_size, input.contiguous().data<scalar_t>(), spatial_scale,
+        output_size, input.contiguous().data_ptr<scalar_t>(), spatial_scale,
         channels, height, width, pooled_height, pooled_width, sampling_ratio,
-        rois.contiguous().data<scalar_t>(), output.data<scalar_t>(), aligned);
+        rois.contiguous().data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), aligned);
   });
   return output;
 }
@@ -393,11 +393,11 @@ at::Tensor ROIAlignBackwardV2CPULaucher(
   int h_stride = grad.stride(2);
   int w_stride = grad.stride(3);
 
-  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.type(), "ROIAlign_backward", [&] {
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad.scalar_type(), "ROIAlign_backward", [&] {
     ROIAlignBackward<scalar_t>(
-        grad.numel(), grad.contiguous().data<scalar_t>(), spatial_scale,
+        grad.numel(), grad.contiguous().data_ptr<scalar_t>(), spatial_scale,
         channels, height, width, pooled_height, pooled_width, sampling_ratio,
-        grad_input.data<scalar_t>(), rois.contiguous().data<scalar_t>(),
+        grad_input.data_ptr<scalar_t>(), rois.contiguous().data_ptr<scalar_t>(),
         n_stride, c_stride, h_stride, w_stride, aligned);
   });
   return grad_input;
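
The dispatch change in this file is the same deprecation story as the assertions: the AT_DISPATCH_* macros now take an at::ScalarType, so input.type() (a DeprecatedTypeProperties) gives way to input.scalar_type(). A self-contained sketch of the idiom with a hypothetical elementwise op:

```cpp
#include <torch/extension.h>

at::Tensor scale_by_two(const at::Tensor& input) {
  auto src = input.contiguous();
  auto out = at::empty_like(src);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(src.scalar_type(), "scale_by_two", [&] {
    // scalar_t is rebound to float / double / at::Half per dispatch branch
    const scalar_t* in_p = src.data_ptr<scalar_t>();
    scalar_t* out_p = out.data_ptr<scalar_t>();
    for (int64_t i = 0; i < src.numel(); ++i)
      out_p[i] = in_p[i] * static_cast<scalar_t>(2);
  });
  return out;
}
```
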
diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu
index 113fc11047564daff2d701d1459ad0b7ee89b767..7afa33229d84fa04f746fc3477c83dfc19ee01f8 100644
--- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu
+++ b/mmdet/ops/roi_align/src/cuda/roi_align_kernel.cu
@@ -125,9 +125,9 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
   const int output_size = num_rois * pooled_height * pooled_width * channels;
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       features.scalar_type(), "ROIAlignLaucherForward", ([&] {
-        const scalar_t *bottom_data = features.data<scalar_t>();
-        const scalar_t *rois_data = rois.data<scalar_t>();
-        scalar_t *top_data = output.data<scalar_t>();
+        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
+        scalar_t *top_data = output.data_ptr<scalar_t>();
 
         ROIAlignForwardV1<scalar_t>
             <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
@@ -263,9 +263,9 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       top_grad.scalar_type(), "ROIAlignLaucherBackward", ([&] {
-        const scalar_t *top_diff = top_grad.data<scalar_t>();
-        const scalar_t *rois_data = rois.data<scalar_t>();
-        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
+        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
+        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
+        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
         if (sizeof(scalar_t) == sizeof(double)) {
           fprintf(stderr, "double is not supported\n");
           exit(-1);
diff --git a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu b/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu
index 9a2f71509334156ba11966923aea0767b0e983a4..0189323cd1ead8a932d358ce79477c66e6c93e5d 100644
--- a/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu
+++ b/mmdet/ops/roi_align/src/cuda/roi_align_kernel_v2.cu
@@ -297,9 +297,9 @@ at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input,
 
   AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
     RoIAlignForwardV2<scalar_t><<<grid, block, 0, stream>>>(
-        output_size, input.contiguous().data<scalar_t>(), spatial_scale,
+        output_size, input.contiguous().data_ptr<scalar_t>(), spatial_scale,
         channels, height, width, pooled_height, pooled_width, sampling_ratio,
-        rois.contiguous().data<scalar_t>(), output.data<scalar_t>(), aligned);
+        rois.contiguous().data_ptr<scalar_t>(), output.data_ptr<scalar_t>(), aligned);
   });
   cudaDeviceSynchronize();
   AT_CUDA_CHECK(cudaGetLastError());
@@ -338,10 +338,10 @@ at::Tensor ROIAlignBackwardV2Laucher(
 
   AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
     RoIAlignBackwardFeatureV2<scalar_t><<<grid, block, 0, stream>>>(
-        grad.numel(), grad.contiguous().data<scalar_t>(), num_rois,
+        grad.numel(), grad.contiguous().data_ptr<scalar_t>(), num_rois,
         spatial_scale, channels, height, width, pooled_height, pooled_width,
-        sampling_ratio, grad_input.data<scalar_t>(),
-        rois.contiguous().data<scalar_t>(), aligned);
+        sampling_ratio, grad_input.data_ptr<scalar_t>(),
+        rois.contiguous().data_ptr<scalar_t>(), aligned);
   });
   AT_CUDA_CHECK(cudaGetLastError());
   return grad_input;
diff --git a/mmdet/ops/roi_align/src/roi_align_ext.cpp b/mmdet/ops/roi_align/src/roi_align_ext.cpp
index f01351a8f16c6989ff9916ba06ac5890dbb3fcc8..18add01bba22424343eab57c8263753d7c93498c 100644
--- a/mmdet/ops/roi_align/src/roi_align_ext.cpp
+++ b/mmdet/ops/roi_align/src/roi_align_ext.cpp
@@ -46,9 +46,9 @@ at::Tensor ROIAlignBackwardV2CPULaucher(
     const int channels, const int height, const int width,
     const int sampling_ratio, bool aligned);
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
@@ -56,7 +56,7 @@ at::Tensor ROIAlignBackwardV2CPULaucher(
 int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height,
                        int pooled_width, float spatial_scale, int sample_num,
                        at::Tensor output) {
-  if (features.type().is_cuda()) {
+  if (features.device().is_cuda()) {
 #ifdef WITH_CUDA
     CHECK_INPUT(features);
     CHECK_INPUT(rois);
@@ -91,7 +91,7 @@ int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height,
 int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height,
                         int pooled_width, float spatial_scale, int sample_num,
                         at::Tensor bottom_grad) {
-  if (top_grad.type().is_cuda()) {
+  if (top_grad.device().is_cuda()) {
 #ifdef WITH_CUDA
     CHECK_INPUT(top_grad);
     CHECK_INPUT(rois);
@@ -129,7 +129,7 @@ inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input,
                                      const int pooled_height,
                                      const int pooled_width,
                                      const int sampling_ratio, bool aligned) {
-  if (input.type().is_cuda()) {
+  if (input.device().is_cuda()) {
 #ifdef WITH_CUDA
     return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height,
                                     pooled_width, sampling_ratio, aligned);
@@ -146,7 +146,7 @@ inline at::Tensor ROIAlign_backwardV2(
     const int pooled_height, const int pooled_width, const int batch_size,
     const int channels, const int height, const int width,
     const int sampling_ratio, bool aligned) {
-  if (grad.type().is_cuda()) {
+  if (grad.device().is_cuda()) {
 #ifdef WITH_CUDA
     return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height,
                                      pooled_width, batch_size, channels, height,
diff --git a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu
index 2e34ff0a10f2a9fc350eb2b658cdca678d1642ee..88fab97fbb4c7b965558158b6c5cbd00b86de97a 100644
--- a/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu
+++ b/mmdet/ops/roi_pool/src/cuda/roi_pool_kernel.cu
@@ -88,10 +88,10 @@ int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
 
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       features.scalar_type(), "ROIPoolLaucherForward", ([&] {
-        const scalar_t *bottom_data = features.data<scalar_t>();
-        const scalar_t *rois_data = rois.data<scalar_t>();
-        scalar_t *top_data = output.data<scalar_t>();
-        int *argmax_data = argmax.data<int>();
+        const scalar_t *bottom_data = features.data_ptr<scalar_t>();
+        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
+        scalar_t *top_data = output.data_ptr<scalar_t>();
+        int *argmax_data = argmax.data_ptr<int>();
 
         ROIPoolForward<scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK,
                                    0, at::cuda::getCurrentCUDAStream()>>>(
@@ -132,10 +132,10 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
   const int output_size = num_rois * pooled_h * pooled_w * channels;
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       top_grad.scalar_type(), "ROIPoolLaucherBackward", ([&] {
-        const scalar_t *top_diff = top_grad.data<scalar_t>();
-        const scalar_t *rois_data = rois.data<scalar_t>();
-        const int *argmax_data = argmax.data<int>();
-        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
+        const scalar_t *top_diff = top_grad.data_ptr<scalar_t>();
+        const scalar_t *rois_data = rois.data_ptr<scalar_t>();
+        const int *argmax_data = argmax.data_ptr<int>();
+        scalar_t *bottom_diff = bottom_grad.data_ptr<scalar_t>();
         if (sizeof(scalar_t) == sizeof(double)) {
           fprintf(stderr, "double is not supported\n");
           exit(-1);
diff --git a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp b/mmdet/ops/roi_pool/src/roi_pool_ext.cpp
index af7bd8553c3ff0e9753c07bb9dcdabc46edbb4a2..27d6b8a5d07c6ff685653905bd802b8cd277cb13 100644
--- a/mmdet/ops/roi_pool/src/roi_pool_ext.cpp
+++ b/mmdet/ops/roi_pool/src/roi_pool_ext.cpp
@@ -18,9 +18,9 @@ int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
                            const int pooled_w, at::Tensor bottom_grad);
 #endif
 
-#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
+#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
 #define CHECK_CONTIGUOUS(x) \
-  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+  TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
 #define CHECK_INPUT(x) \
   CHECK_CUDA(x);       \
   CHECK_CONTIGUOUS(x)
@@ -29,7 +29,7 @@ int roi_pooling_forward(at::Tensor features, at::Tensor rois,
                              int pooled_height, int pooled_width,
                              float spatial_scale, at::Tensor output,
                              at::Tensor argmax) {
-  if (features.type().is_cuda()) {
+  if (features.device().is_cuda()) {
 #ifdef WITH_CUDA
     CHECK_INPUT(features);
     CHECK_INPUT(rois);
@@ -64,7 +64,7 @@ int roi_pooling_forward(at::Tensor features, at::Tensor rois,
 int roi_pooling_backward(at::Tensor top_grad, at::Tensor rois,
                               at::Tensor argmax, float spatial_scale,
                               at::Tensor bottom_grad) {
-  if (top_grad.type().is_cuda()) {
+  if (top_grad.device().is_cuda()) {
 #ifdef WITH_CUDA
     CHECK_INPUT(top_grad);
     CHECK_INPUT(rois);
diff --git a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu b/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu
index 5101a113effcfea16f01426456d1cba9cf0aa2f4..797dcf355ebadcc17bb754614aa8f68698b4c773 100644
--- a/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu
+++ b/mmdet/ops/sigmoid_focal_loss/src/cuda/sigmoid_focal_loss_cuda.cu
@@ -100,8 +100,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits,
                                          const at::Tensor &targets,
                                          const int num_classes,
                                          const float gamma, const float alpha) {
-  AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
-  AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
+  AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+  AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
   AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
 
   const int num_samples = logits.size(0);
@@ -121,9 +121,9 @@ at::Tensor SigmoidFocalLoss_forward_cuda(const at::Tensor &logits,
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       logits.scalar_type(), "SigmoidFocalLoss_forward", [&] {
         SigmoidFocalLossForward<scalar_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
-            losses_size, logits.contiguous().data<scalar_t>(),
-            targets.contiguous().data<int64_t>(), num_classes, gamma, alpha,
-            num_samples, losses.data<scalar_t>());
+            losses_size, logits.contiguous().data_ptr<scalar_t>(),
+            targets.contiguous().data_ptr<int64_t>(), num_classes, gamma, alpha,
+            num_samples, losses.data_ptr<scalar_t>());
       });
   THCudaCheck(cudaGetLastError());
   return losses;
@@ -135,9 +135,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits,
                                           const int num_classes,
                                           const float gamma,
                                           const float alpha) {
-  AT_ASSERTM(logits.type().is_cuda(), "logits must be a CUDA tensor");
-  AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor");
-  AT_ASSERTM(d_losses.type().is_cuda(), "d_losses must be a CUDA tensor");
+  AT_ASSERTM(logits.device().is_cuda(), "logits must be a CUDA tensor");
+  AT_ASSERTM(targets.device().is_cuda(), "targets must be a CUDA tensor");
+  AT_ASSERTM(d_losses.device().is_cuda(), "d_losses must be a CUDA tensor");
 
   AT_ASSERTM(logits.dim() == 2, "logits should be NxClass");
 
@@ -160,10 +160,10 @@ at::Tensor SigmoidFocalLoss_backward_cuda(const at::Tensor &logits,
   AT_DISPATCH_FLOATING_TYPES_AND_HALF(
       logits.scalar_type(), "SigmoidFocalLoss_backward", [&] {
         SigmoidFocalLossBackward<scalar_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
-            d_logits_size, logits.contiguous().data<scalar_t>(),
-            targets.contiguous().data<int64_t>(),
-            d_losses.contiguous().data<scalar_t>(), num_classes, gamma, alpha,
-            num_samples, d_logits.data<scalar_t>());
+            d_logits_size, logits.contiguous().data_ptr<scalar_t>(),
+            targets.contiguous().data_ptr<int64_t>(),
+            d_losses.contiguous().data_ptr<scalar_t>(), num_classes, gamma, alpha,
+            num_samples, d_logits.data_ptr<scalar_t>());
       });
 
   THCudaCheck(cudaGetLastError());
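
Tensor::data<T>() emits a deprecation warning on recent PyTorch; data_ptr<T>() is the supported accessor and returns the same typed pointer, so the substitutions above do not change behavior. A condensed sketch of the dispatch-and-launch pattern these kernels use -- the Scale kernel and scale_cuda() are placeholders, not the real focal-loss code:

    // scale_cuda.cu -- hypothetical kernel illustrating data_ptr<T>() usage.
    #include <ATen/ATen.h>
    #include <ATen/cuda/CUDAContext.h>

    template <typename scalar_t>
    __global__ void Scale(const int n, const scalar_t *in, scalar_t *out) {
      const int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = in[i] * scalar_t(2);
    }

    at::Tensor scale_cuda(const at::Tensor &input) {
      at::Tensor output = at::empty_like(input);
      const int n = static_cast<int>(input.numel());
      const dim3 block(512);
      const dim3 grid((n + block.x - 1) / block.x);
      // The dispatch macro instantiates the lambda once per dtype, binding
      // scalar_t to float, double, or at::Half as appropriate.
      AT_DISPATCH_FLOATING_TYPES_AND_HALF(
          input.scalar_type(), "scale_cuda", [&] {
            Scale<scalar_t><<<grid, block, 0, at::cuda::getCurrentCUDAStream()>>>(
                n, input.contiguous().data_ptr<scalar_t>(),
                output.data_ptr<scalar_t>());
          });
      cudaError_t err = cudaGetLastError();
      TORCH_CHECK(err == cudaSuccess, cudaGetErrorString(err));
      return output;
    }
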
diff --git a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp b/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp
index faf2e7872975cacf0953ba3095f80bba85e75003..3d66f3f8ff8f402290c247e489a2cd3fb012dd43 100644
--- a/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp
+++ b/mmdet/ops/sigmoid_focal_loss/src/sigmoid_focal_loss_ext.cpp
@@ -20,7 +20,7 @@ at::Tensor SigmoidFocalLoss_forward(const at::Tensor &logits,
                                     const at::Tensor &targets,
                                     const int num_classes, const float gamma,
                                     const float alpha) {
-  if (logits.type().is_cuda()) {
+  if (logits.device().is_cuda()) {
 #ifdef WITH_CUDA
     at::DeviceGuard guard(logits.device());
     return SigmoidFocalLoss_forward_cuda(logits, targets, num_classes, gamma,
@@ -37,7 +37,7 @@ at::Tensor SigmoidFocalLoss_backward(const at::Tensor &logits,
                                      const at::Tensor &d_losses,
                                      const int num_classes, const float gamma,
                                      const float alpha) {
-  if (logits.type().is_cuda()) {
+  if (logits.device().is_cuda()) {
 #ifdef WITH_CUDA
     at::DeviceGuard guard(logits.device());
     return SigmoidFocalLoss_backward_cuda(logits, targets, d_losses,
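
All of the ops touched in this patch share the same wrapper shape: branch on device().is_cuda(), compile the CUDA path only under WITH_CUDA, and pin the launch to the input's device with at::DeviceGuard. A condensed sketch under those assumptions (scale, scale_cuda, and scale_cpu are illustrative names, not functions from this repo):

    // scale_ext.cpp -- hypothetical CPU/CUDA dispatch wrapper.
    #include <torch/extension.h>

    #ifdef WITH_CUDA
    at::Tensor scale_cuda(const at::Tensor &input);  // defined in a .cu file
    #endif

    at::Tensor scale_cpu(const at::Tensor &input) { return input * 2; }

    at::Tensor scale(const at::Tensor &input) {
      if (input.device().is_cuda()) {
    #ifdef WITH_CUDA
        // Make kernel launches and allocations target input's device,
        // even when it is not the current CUDA device.
        at::DeviceGuard guard(input.device());
        return scale_cuda(input);
    #else
        AT_ERROR("scale is not compiled with GPU support");
    #endif
      }
      return scale_cpu(input);
    }

at::DeviceGuard is RAII: it switches the current CUDA device on construction and restores the previous one when the wrapper returns, which is why the guard is taken immediately before the CUDA call.
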
diff --git a/setup.py b/setup.py
index 14af9d1bce63322faebf28a5751f554b02eff009..e70a53110a674ea06a915f2dd5f435c06cd5dde3 100755
--- a/setup.py
+++ b/setup.py
@@ -282,19 +282,6 @@ if __name__ == '__main__':
                     'src/cuda/masked_conv2d_cuda.cpp',
                     'src/cuda/masked_conv2d_kernel.cu'
                 ]),
-            make_cuda_ext(
-                name='affine_grid_ext',
-                module='mmdet.ops.affine_grid',
-                sources=[
-                    'src/affine_grid_ext.cpp', 'src/cpu/affine_grid_cpu.cpp'
-                ]),
-            make_cuda_ext(
-                name='grid_sampler_ext',
-                module='mmdet.ops.grid_sampler',
-                sources=[
-                    'src/grid_sampler_ext.cpp', 'src/cpu/grid_sampler_cpu.cpp'
-                ],
-                sources_cuda=['src/cuda/grid_sampler_cuda.cu']),
             make_cuda_ext(
                 name='carafe_ext',
                 module='mmdet.ops.carafe',