From dd88ed65131da79bf60bfaf847236ad7c41a6dad Mon Sep 17 00:00:00 2001 From: Wenwei Zhang <40779233+ZwwWayne@users.noreply.github.com> Date: Sat, 14 Mar 2020 10:59:56 +0800 Subject: [PATCH] Support Better-Aligned ROIAlign Layer (#2068) * feature (roi_align_v2): use roi_align v2 to support aligned=True * fix (setup.py): fix install bug * recover configs to the original ones * remove cpu version for now * rename files * reformat (roi_align): clean code and reformat * support legacy roi_align * support legacy roi_align * reformat (roi_align): reformat c/cuda/python code * fix reformat request * add unsaved change * pass yapf * merge roi_align function * update note Co-authored-by: beansi <zhangwenwei@sensetime.com> --- mmdet/ops/roi_align/roi_align.py | 103 +++++- mmdet/ops/roi_align/src/roi_align_cuda.cpp | 68 +++- mmdet/ops/roi_align/src/roi_align_kernel.cu | 26 +- .../ops/roi_align/src/roi_align_kernel_v2.cu | 348 ++++++++++++++++++ setup.py | 6 +- 5 files changed, 508 insertions(+), 43 deletions(-) create mode 100644 mmdet/ops/roi_align/src/roi_align_kernel_v2.cu diff --git a/mmdet/ops/roi_align/roi_align.py b/mmdet/ops/roi_align/roi_align.py index a4cf2445..e28cb5f9 100644 --- a/mmdet/ops/roi_align/roi_align.py +++ b/mmdet/ops/roi_align/roi_align.py @@ -1,4 +1,4 @@ -import torch.nn as nn +from torch import nn from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair @@ -9,21 +9,35 @@ from . import roi_align_cuda class RoIAlignFunction(Function): @staticmethod - def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0): + def forward(ctx, + features, + rois, + out_size, + spatial_scale, + sample_num=0, + aligned=True): out_h, out_w = _pair(out_size) assert isinstance(out_h, int) and isinstance(out_w, int) ctx.spatial_scale = spatial_scale ctx.sample_num = sample_num ctx.save_for_backward(rois) ctx.feature_size = features.size() + ctx.aligned = aligned - batch_size, num_channels, data_height, data_width = features.size() - num_rois = rois.size(0) - - output = features.new_zeros(num_rois, num_channels, out_h, out_w) if features.is_cuda: - roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale, - sample_num, output) + if not aligned: + (batch_size, num_channels, data_height, + data_width) = features.size() + num_rois = rois.size(0) + + output = features.new_zeros(num_rois, num_channels, out_h, + out_w) + roi_align_cuda.forward_v1(features, rois, out_h, out_w, + spatial_scale, sample_num, output) + else: + output = roi_align_cuda.forward_v2(features, rois, + spatial_scale, out_h, out_w, + sample_num, aligned) else: raise NotImplementedError @@ -36,6 +50,7 @@ class RoIAlignFunction(Function): spatial_scale = ctx.spatial_scale sample_num = ctx.sample_num rois = ctx.saved_tensors[0] + aligned = ctx.aligned assert (feature_size is not None and grad_output.is_cuda) batch_size, num_channels, data_height, data_width = feature_size @@ -43,14 +58,19 @@ class RoIAlignFunction(Function): out_h = grad_output.size(2) grad_input = grad_rois = None - if ctx.needs_input_grad[0]: - grad_input = rois.new_zeros(batch_size, num_channels, data_height, - data_width) - roi_align_cuda.backward(grad_output.contiguous(), rois, out_h, - out_w, spatial_scale, sample_num, - grad_input) + if not aligned: + if ctx.needs_input_grad[0]: + grad_input = rois.new_zeros(batch_size, num_channels, + data_height, data_width) + roi_align_cuda.backward_v1(grad_output.contiguous(), rois, + out_h, out_w, spatial_scale, + sample_num, 
grad_input)
+        else:
+            grad_input = roi_align_cuda.backward_v2(
+                grad_output, rois, spatial_scale, out_h, out_w, batch_size,
+                num_channels, data_height, data_width, sample_num, aligned)
 
-        return grad_input, grad_rois, None, None, None
+        return grad_input, grad_rois, None, None, None, None
 
 
 roi_align = RoIAlignFunction.apply
 
 
@@ -62,26 +82,71 @@ class RoIAlign(nn.Module):
                  out_size,
                  spatial_scale,
                  sample_num=0,
-                 use_torchvision=False):
+                 use_torchvision=False,
+                 aligned=False):
+        """
+        Args:
+            out_size (tuple): (h, w)
+            spatial_scale (float): scale the input boxes by this number
+            sample_num (int): number of input samples to take for each
+                output sample. Use 2 for dense sampling in current models.
+            use_torchvision (bool): whether to use roi_align from torchvision
+            aligned (bool): if False, use the legacy implementation in
+                MMDetection. If True, align sampling points more precisely.
+
+        Note:
+            The implementation of RoIAlign when aligned=True is modified from
+            https://github.com/facebookresearch/detectron2/
+
+            The meaning of aligned=True:
+
+            Given a continuous coordinate c, its two neighboring pixel
+            indices (in our pixel model) are computed by floor(c - 0.5) and
+            ceil(c - 0.5). For example, c=1.3 has pixel neighbors with
+            discrete indices [0] and [1], which are sampled from the
+            underlying signal at continuous coordinates 0.5 and 1.5. But the
+            original roi_align (aligned=False) does not subtract the 0.5 when
+            computing neighboring pixel indices, so it uses pixels with a
+            slightly incorrect alignment (relative to our pixel model) when
+            performing bilinear interpolation.
+
+            With aligned=True, we first appropriately scale the ROI and then
+            shift it by -0.5 prior to calling roi_align. This produces the
+            correct neighbors.
+
+            The difference does not affect the model's performance when
+            ROIAlign is used together with conv layers.
+        """
         super(RoIAlign, self).__init__()
-
         self.out_size = _pair(out_size)
         self.spatial_scale = float(spatial_scale)
+        self.aligned = aligned
         self.sample_num = int(sample_num)
         self.use_torchvision = use_torchvision
+        assert not (use_torchvision and
+                    aligned), 'Torchvision does not support aligned RoIAlign'
 
     def forward(self, features, rois):
+        """
+        Args:
+            features: NCHW images
+            rois: Bx5 boxes. First column is the index into N. The other 4
+                columns are xyxy.
+ """ + assert rois.dim() == 2 and rois.size(1) == 5 + if self.use_torchvision: from torchvision.ops import roi_align as tv_roi_align return tv_roi_align(features, rois, self.out_size, self.spatial_scale, self.sample_num) else: return roi_align(features, rois, self.out_size, self.spatial_scale, - self.sample_num) + self.sample_num, self.aligned) def __repr__(self): format_str = self.__class__.__name__ format_str += '(out_size={}, spatial_scale={}, sample_num={}'.format( self.out_size, self.spatial_scale, self.sample_num) - format_str += ', use_torchvision={})'.format(self.use_torchvision) + format_str += ', use_torchvision={}, aligned={})'.format( + self.use_torchvision, self.aligned) return format_str diff --git a/mmdet/ops/roi_align/src/roi_align_cuda.cpp b/mmdet/ops/roi_align/src/roi_align_cuda.cpp index 829b3ac6..268f6907 100644 --- a/mmdet/ops/roi_align/src/roi_align_cuda.cpp +++ b/mmdet/ops/roi_align/src/roi_align_cuda.cpp @@ -5,6 +5,7 @@ #include <cmath> #include <vector> +#ifdef WITH_CUDA int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, const float spatial_scale, const int sample_num, const int channels, const int height, @@ -19,6 +20,20 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, const int pooled_height, const int pooled_width, at::Tensor bottom_grad); +at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, bool aligned); + +at::Tensor ROIAlignBackwardV2Laucher( + const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, + const int pooled_height, const int pooled_width, const int batch_size, + const int channels, const int height, const int width, + const int sampling_ratio, bool aligned); +#endif + #define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ") #define CHECK_CONTIGUOUS(x) \ AT_CHECK(x.is_contiguous(), #x, " must be contiguous ") @@ -26,10 +41,9 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) -int roi_align_forward_cuda(at::Tensor features, at::Tensor rois, - int pooled_height, int pooled_width, - float spatial_scale, int sample_num, - at::Tensor output) { +int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height, + int pooled_width, float spatial_scale, int sample_num, + at::Tensor output) { CHECK_INPUT(features); CHECK_INPUT(rois); CHECK_INPUT(output); @@ -55,10 +69,9 @@ int roi_align_forward_cuda(at::Tensor features, at::Tensor rois, return 1; } -int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois, - int pooled_height, int pooled_width, - float spatial_scale, int sample_num, - at::Tensor bottom_grad) { +int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height, + int pooled_width, float spatial_scale, int sample_num, + at::Tensor bottom_grad) { CHECK_INPUT(top_grad); CHECK_INPUT(rois); CHECK_INPUT(bottom_grad); @@ -83,7 +96,42 @@ int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois, return 1; } +// Interface for Python +inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, bool aligned) { + if (input.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height, + pooled_width, sampling_ratio, 
aligned); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } +} + +inline at::Tensor ROIAlign_backwardV2( + const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, + const int pooled_height, const int pooled_width, const int batch_size, + const int channels, const int height, const int width, + const int sampling_ratio, bool aligned) { + if (grad.type().is_cuda()) { +#ifdef WITH_CUDA + return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height, + pooled_width, batch_size, channels, height, + width, sampling_ratio, aligned); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } +} + PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)"); - m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)"); + m.def("forward_v1", &ROIAlign_forwardV1, "Roi_Align V1 forward (CUDA)"); + m.def("backward_v1", &ROIAlign_backwardV1, "Roi_Align V1 backward (CUDA)"); + m.def("forward_v2", &ROIAlign_forwardV2, "Roi_Align V2 forward (CUDA)"); + m.def("backward_v2", &ROIAlign_backwardV2, "Roi_Align V2 backward (CUDA)"); } diff --git a/mmdet/ops/roi_align/src/roi_align_kernel.cu b/mmdet/ops/roi_align/src/roi_align_kernel.cu index 3208b280..b2ac72e3 100644 --- a/mmdet/ops/roi_align/src/roi_align_kernel.cu +++ b/mmdet/ops/roi_align/src/roi_align_kernel.cu @@ -62,13 +62,11 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, } template <typename scalar_t> -__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data, - const scalar_t *bottom_rois, - const scalar_t spatial_scale, - const int sample_num, const int channels, - const int height, const int width, - const int pooled_height, const int pooled_width, - scalar_t *top_data) { +__global__ void ROIAlignForwardV1( + const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, scalar_t *top_data) { CUDA_1D_KERNEL_LOOP(index, nthreads) { // (n, c, ph, pw) is an element in the aligned output int pw = index % pooled_width; @@ -131,8 +129,9 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, const scalar_t *rois_data = rois.data<scalar_t>(); scalar_t *top_data = output.data<scalar_t>(); - ROIAlignForward<scalar_t> - <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( + ROIAlignForwardV1<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, + at::cuda::getCurrentCUDAStream()>>>( output_size, bottom_data, rois_data, scalar_t(spatial_scale), sample_num, channels, height, width, pooled_height, pooled_width, top_data); @@ -186,7 +185,7 @@ __device__ void bilinear_interpolate_gradient(const int height, const int width, } template <typename scalar_t> -__global__ void ROIAlignBackward( +__global__ void ROIAlignBackwardV1( const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois, const scalar_t spatial_scale, const int sample_num, const int channels, const int height, const int width, const int pooled_height, @@ -272,12 +271,13 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, exit(-1); } - ROIAlignBackward<scalar_t> - <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>( + ROIAlignBackwardV1<scalar_t> + <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, + 
at::cuda::getCurrentCUDAStream()>>>( output_size, top_diff, rois_data, spatial_scale, sample_num, channels, height, width, pooled_height, pooled_width, bottom_diff); })); THCudaCheck(cudaGetLastError()); return 1; -} +} \ No newline at end of file diff --git a/mmdet/ops/roi_align/src/roi_align_kernel_v2.cu b/mmdet/ops/roi_align/src/roi_align_kernel_v2.cu new file mode 100644 index 00000000..bc3dbeec --- /dev/null +++ b/mmdet/ops/roi_align/src/roi_align_kernel_v2.cu @@ -0,0 +1,348 @@ +// Modified from +// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +#include <ATen/ATen.h> +#include <ATen/cuda/CUDAContext.h> +#include <c10/cuda/CUDAGuard.h> +#include <ATen/cuda/CUDAApplyUtils.cuh> + +// TODO make it in a common file +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +template <typename T> +__device__ T bilinear_interpolate(const T* bottom_data, const int height, + const int width, T y, T x, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + return 0; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. - lx; + // do bilinear interpolation + T v1 = bottom_data[y_low * width + x_low]; + T v2 = bottom_data[y_low * width + x_high]; + T v3 = bottom_data[y_high * width + x_low]; + T v4 = bottom_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + return val; +} + +template <typename T> +__global__ void RoIAlignForwardV2( + const int nthreads, const T* bottom_data, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const int sampling_ratio, + const T* bottom_rois, T* top_data, bool aligned) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? 
(T)0.5 : (T)0.0; + T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); + T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); + + const T* offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + // When the grid is empty, output zeros. + const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4 + + T output_val = 0.; + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + + static_cast<T>(iy + .5f) * bin_size_h / + static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast<T>(ix + .5f) * bin_size_w / + static_cast<T>(roi_bin_grid_w); + + T val = bilinear_interpolate(offset_bottom_data, height, width, y, x, + index); + output_val += val; + } + } + output_val /= count; + + top_data[index] = output_val; + } +} + +template <typename T> +__device__ void bilinear_interpolate_gradient( + const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4, + int& x_low, int& x_high, int& y_low, int& y_high, + const int index /* index for debug only*/) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + // empty + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) y = 0; + if (x <= 0) x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (T)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (T)x_low; + } else { + x_high = x_low + 1; + } + + T ly = y - y_low; + T lx = x - x_low; + T hy = 1. - ly, hx = 1. 
- lx; + + // reference in forward + // T v1 = bottom_data[y_low * width + x_low]; + // T v2 = bottom_data[y_low * width + x_high]; + // T v3 = bottom_data[y_high * width + x_low]; + // T v4 = bottom_data[y_high * width + x_high]; + // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template <typename T> +__global__ void RoIAlignBackwardFeatureV2( + const int nthreads, const T* top_diff, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int sampling_ratio, T* bottom_diff, const T* bottom_rois, + bool aligned) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the pooled output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const T* offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + + // Do not use rounding; this implementation detail is critical + T offset = aligned ? (T)0.5 : (T)0.0; + T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset; + T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset; + T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset; + T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset; + + T roi_width = roi_end_w - roi_start_w; + T roi_height = roi_end_h - roi_start_h; + if (!aligned) { // for backward-compatibility only + roi_width = max(roi_width, (T)1.); + roi_height = max(roi_height, (T)1.); + } + T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height); + T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width); + + T* offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + + int top_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_top_diff = top_diff + top_offset; + const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw]; + + // We use roi_bin_grid to sample the grid and mimic integral + int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); // e.g., = 2 + int roi_bin_grid_w = + (sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width); + + // We do average (integral) pooling inside a bin + const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. 
= 4 + + for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1 + { + const T y = roi_start_h + ph * bin_size_h + + static_cast<T>(iy + .5f) * bin_size_h / + static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5 + for (int ix = 0; ix < roi_bin_grid_w; ix++) { + const T x = roi_start_w + pw * bin_size_w + + static_cast<T>(ix + .5f) * bin_size_w / + static_cast<T>(roi_bin_grid_w); + + T w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, + x_low, x_high, y_low, y_high, index); + + T g1 = top_diff_this_bin * w1 / count; + T g2 = top_diff_this_bin * w2 / count; + T g3 = top_diff_this_bin * w3 / count; + T g4 = top_diff_this_bin * w4 / count; + + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, + static_cast<T>(g1)); + atomicAdd(offset_bottom_diff + y_low * width + x_high, + static_cast<T>(g2)); + atomicAdd(offset_bottom_diff + y_high * width + x_low, + static_cast<T>(g3)); + atomicAdd(offset_bottom_diff + y_high * width + x_high, + static_cast<T>(g4)); + } // if + } // ix + } // iy + } // CUDA_1D_KERNEL_LOOP +} // RoIAlignBackward + +at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input, + const at::Tensor& rois, + const float spatial_scale, + const int pooled_height, + const int pooled_width, + const int sampling_ratio, bool aligned) { + AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2}; + + at::CheckedFrom c = "ROIAlign_forward_cuda"; + at::checkAllSameGPU(c, {input_t, rois_t}); + at::checkAllSameType(c, {input_t, rois_t}); + at::cuda::CUDAGuard device_guard(input.device()); + + auto num_rois = rois.size(0); + auto channels = input.size(1); + auto height = input.size(2); + auto width = input.size(3); + + auto output = at::empty({num_rois, channels, pooled_height, pooled_width}, + input.options()); + auto output_size = num_rois * pooled_height * pooled_width * channels; + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L)); + dim3 block(512); + + if (output.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return output; + } + + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + RoIAlignForwardV2<scalar_t><<<grid, block, 0, stream>>>( + output_size, input.contiguous().data<scalar_t>(), spatial_scale, + channels, height, width, pooled_height, pooled_width, sampling_ratio, + rois.contiguous().data<scalar_t>(), output.data<scalar_t>(), aligned); + }); + cudaDeviceSynchronize(); + AT_CUDA_CHECK(cudaGetLastError()); + return output; +} + +// TODO remove the dependency on input and use instead its sizes -> save memory +at::Tensor ROIAlignBackwardV2Laucher( + const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale, + const int pooled_height, const int pooled_width, const int batch_size, + const int channels, const int height, const int width, + const int sampling_ratio, bool aligned) { + AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor"); + AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor"); + + at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2}; + at::CheckedFrom c = "ROIAlign_backward_cuda"; + at::checkAllSameGPU(c, {grad_t, rois_t}); + at::checkAllSameType(c, {grad_t, rois_t}); + at::cuda::CUDAGuard device_guard(grad.device()); + + 
auto num_rois = rois.size(0); + auto grad_input = + at::zeros({batch_size, channels, height, width}, grad.options()); + + cudaStream_t stream = at::cuda::getCurrentCUDAStream(); + + dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L)); + dim3 block(512); + + // handle possibly empty gradients + if (grad.numel() == 0) { + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; + } + + AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { + RoIAlignBackwardFeatureV2<scalar_t><<<grid, block, 0, stream>>>( + grad.numel(), grad.contiguous().data<scalar_t>(), num_rois, + spatial_scale, channels, height, width, pooled_height, pooled_width, + sampling_ratio, grad_input.data<scalar_t>(), + rois.contiguous().data<scalar_t>(), aligned); + }); + AT_CUDA_CHECK(cudaGetLastError()); + return grad_input; +} diff --git a/setup.py b/setup.py index a11263f2..5a5ddbe0 100755 --- a/setup.py +++ b/setup.py @@ -237,7 +237,11 @@ if __name__ == '__main__': make_cuda_ext( name='roi_align_cuda', module='mmdet.ops.roi_align', - sources=['src/roi_align_cuda.cpp', 'src/roi_align_kernel.cu']), + sources=[ + 'src/roi_align_cuda.cpp', + 'src/roi_align_kernel.cu', + 'src/roi_align_kernel_v2.cu', + ]), make_cuda_ext( name='roi_pool_cuda', module='mmdet.ops.roi_pool', -- GitLab
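
For reference, a minimal numeric check of the aligned=True pixel model
described in the RoIAlign docstring above. This is an illustrative sketch,
not part of the patch; it uses only the Python standard library.

import math

def pixel_neighbors(c):
    # Neighboring pixel indices of a continuous coordinate c when pixel
    # centers sit at half-integer coordinates (the aligned=True model).
    return math.floor(c - 0.5), math.ceil(c - 0.5)

print(pixel_neighbors(1.3))  # (0, 1): between centers at 0.5 and 1.5

# The legacy model (aligned=False) omits the -0.5 shift, so the same
# coordinate would be interpolated between indices 1 and 2 instead.
print(math.floor(1.3), math.ceil(1.3))  # 1 2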
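
And a short usage sketch of the new Python interface. It assumes the
extension above has been compiled and a CUDA device is available; the import
path and constructor signature come from this patch, but the shapes and box
values are illustrative.

import torch
from mmdet.ops.roi_align import RoIAlign

# 7x7 output bins, features at 1/16 of the input resolution, 2 samples per
# bin, better-aligned sampling enabled.
roi_layer = RoIAlign((7, 7), spatial_scale=1.0 / 16, sample_num=2,
                     aligned=True)

feats = torch.randn(2, 256, 50, 50, device='cuda')  # NCHW feature map
# Each RoI is (batch_index, x1, y1, x2, y2) in input-image coordinates.
rois = torch.tensor([[0., 0., 0., 320., 320.],
                     [1., 16., 16., 144., 272.]], device='cuda')

out = roi_layer(feats, rois)
assert out.shape == (2, 256, 7, 7)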