Skip to content
Snippets Groups Projects
Unverified Commit dd88ed65 authored by Wenwei Zhang's avatar Wenwei Zhang Committed by GitHub
Browse files

Support Better-Aligned ROIAlign Layer (#2068)


* feature (roi_align_v2): use roi_align v2 to support aligned=True

* fix (setup.py): fix install bug

* recover configs to the original ones

* remove cpu version for now

* rename files

* reformat (roi_align): clean code and reformat

* support legacy roi_align

* support legacy roi_align

* reformat (roi_align): reformat c/cuda/python code

* fix reformat request

* add unsaved change

* pass yapf

* merge roi_align function

* update note

Co-authored-by: default avatarbeansi <zhangwenwei@sensetime.com>
parent 4171548a
No related branches found
No related tags found
No related merge requests found
import torch.nn as nn
from torch import nn
from torch.autograd import Function
from torch.autograd.function import once_differentiable
from torch.nn.modules.utils import _pair
......@@ -9,21 +9,35 @@ from . import roi_align_cuda
class RoIAlignFunction(Function):
@staticmethod
def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0):
def forward(ctx,
features,
rois,
out_size,
spatial_scale,
sample_num=0,
aligned=True):
out_h, out_w = _pair(out_size)
assert isinstance(out_h, int) and isinstance(out_w, int)
ctx.spatial_scale = spatial_scale
ctx.sample_num = sample_num
ctx.save_for_backward(rois)
ctx.feature_size = features.size()
ctx.aligned = aligned
batch_size, num_channels, data_height, data_width = features.size()
num_rois = rois.size(0)
output = features.new_zeros(num_rois, num_channels, out_h, out_w)
if features.is_cuda:
roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale,
sample_num, output)
if not aligned:
(batch_size, num_channels, data_height,
data_width) = features.size()
num_rois = rois.size(0)
output = features.new_zeros(num_rois, num_channels, out_h,
out_w)
roi_align_cuda.forward_v1(features, rois, out_h, out_w,
spatial_scale, sample_num, output)
else:
output = roi_align_cuda.forward_v2(features, rois,
spatial_scale, out_h, out_w,
sample_num, aligned)
else:
raise NotImplementedError
......@@ -36,6 +50,7 @@ class RoIAlignFunction(Function):
spatial_scale = ctx.spatial_scale
sample_num = ctx.sample_num
rois = ctx.saved_tensors[0]
aligned = ctx.aligned
assert (feature_size is not None and grad_output.is_cuda)
batch_size, num_channels, data_height, data_width = feature_size
......@@ -43,14 +58,19 @@ class RoIAlignFunction(Function):
out_h = grad_output.size(2)
grad_input = grad_rois = None
if ctx.needs_input_grad[0]:
grad_input = rois.new_zeros(batch_size, num_channels, data_height,
data_width)
roi_align_cuda.backward(grad_output.contiguous(), rois, out_h,
out_w, spatial_scale, sample_num,
grad_input)
if not aligned:
if ctx.needs_input_grad[0]:
grad_input = rois.new_zeros(batch_size, num_channels,
data_height, data_width)
roi_align_cuda.backward_v1(grad_output.contiguous(), rois,
out_h, out_w, spatial_scale,
sample_num, grad_input)
else:
grad_input = roi_align_cuda.backward_v2(
grad_output, rois, spatial_scale, out_h, out_w, batch_size,
num_channels, data_height, data_width, sample_num, aligned)
return grad_input, grad_rois, None, None, None
return grad_input, grad_rois, None, None, None, None
roi_align = RoIAlignFunction.apply
......@@ -62,26 +82,71 @@ class RoIAlign(nn.Module):
out_size,
spatial_scale,
sample_num=0,
use_torchvision=False):
use_torchvision=False,
aligned=False):
"""
Args:
out_size (tuple): h, w
spatial_scale (float): scale the input boxes by this number
sample_num (int): number of inputs samples to take for each
output sample. 2 to take samples densely for current models.
use_torchvision (bool): whether to use roi_align from torchvision
aligned (bool): if False, use the legacy implementation in
MMDetection. If True, align the results more perfectly.
Note:
The implementation of RoIAlign when aligned=True is modified from
https://github.com/facebookresearch/detectron2/
The meaning of aligned=True:
Given a continuous coordinate c, its two neighboring pixel
indices (in our pixel model) are computed by floor(c - 0.5) and
ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete
indices [0] and [1] (which are sampled from the underlying signal
at continuous coordinates 0.5 and 1.5). But the original roi_align
(aligned=False) does not subtract the 0.5 when computing
neighboring pixel indices and therefore it uses pixels with a
slightly incorrect alignment (relative to our pixel model) when
performing bilinear interpolation.
With `aligned=True`,
we first appropriately scale the ROI and then shift it by -0.5
prior to calling roi_align. This produces the correct neighbors;
The difference does not make a difference to the model's
performance if ROIAlign is used together with conv layers.
"""
super(RoIAlign, self).__init__()
self.out_size = _pair(out_size)
self.spatial_scale = float(spatial_scale)
self.aligned = aligned
self.sample_num = int(sample_num)
self.use_torchvision = use_torchvision
assert not (use_torchvision and
aligned), 'Torchvision does not support aligned RoIAlgin'
def forward(self, features, rois):
"""
Args:
features: NCHW images
rois: Bx5 boxes. First column is the index into N. The other 4
columns are xyxy.
"""
assert rois.dim() == 2 and rois.size(1) == 5
if self.use_torchvision:
from torchvision.ops import roi_align as tv_roi_align
return tv_roi_align(features, rois, self.out_size,
self.spatial_scale, self.sample_num)
else:
return roi_align(features, rois, self.out_size, self.spatial_scale,
self.sample_num)
self.sample_num, self.aligned)
def __repr__(self):
format_str = self.__class__.__name__
format_str += '(out_size={}, spatial_scale={}, sample_num={}'.format(
self.out_size, self.spatial_scale, self.sample_num)
format_str += ', use_torchvision={})'.format(self.use_torchvision)
format_str += ', use_torchvision={}, aligned={})'.format(
self.use_torchvision, self.aligned)
return format_str
......@@ -5,6 +5,7 @@
#include <cmath>
#include <vector>
#ifdef WITH_CUDA
int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const float spatial_scale, const int sample_num,
const int channels, const int height,
......@@ -19,6 +20,20 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
const int pooled_height, const int pooled_width,
at::Tensor bottom_grad);
at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio, bool aligned);
at::Tensor ROIAlignBackwardV2Laucher(
const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
const int pooled_height, const int pooled_width, const int batch_size,
const int channels, const int height, const int width,
const int sampling_ratio, bool aligned);
#endif
#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
......@@ -26,10 +41,9 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor output) {
int ROIAlign_forwardV1(at::Tensor features, at::Tensor rois, int pooled_height,
int pooled_width, float spatial_scale, int sample_num,
at::Tensor output) {
CHECK_INPUT(features);
CHECK_INPUT(rois);
CHECK_INPUT(output);
......@@ -55,10 +69,9 @@ int roi_align_forward_cuda(at::Tensor features, at::Tensor rois,
return 1;
}
int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
int pooled_height, int pooled_width,
float spatial_scale, int sample_num,
at::Tensor bottom_grad) {
int ROIAlign_backwardV1(at::Tensor top_grad, at::Tensor rois, int pooled_height,
int pooled_width, float spatial_scale, int sample_num,
at::Tensor bottom_grad) {
CHECK_INPUT(top_grad);
CHECK_INPUT(rois);
CHECK_INPUT(bottom_grad);
......@@ -83,7 +96,42 @@ int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois,
return 1;
}
// Interface for Python
inline at::Tensor ROIAlign_forwardV2(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio, bool aligned) {
if (input.type().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlignForwardV2Laucher(input, rois, spatial_scale, pooled_height,
pooled_width, sampling_ratio, aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
}
inline at::Tensor ROIAlign_backwardV2(
const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
const int pooled_height, const int pooled_width, const int batch_size,
const int channels, const int height, const int width,
const int sampling_ratio, bool aligned) {
if (grad.type().is_cuda()) {
#ifdef WITH_CUDA
return ROIAlignBackwardV2Laucher(grad, rois, spatial_scale, pooled_height,
pooled_width, batch_size, channels, height,
width, sampling_ratio, aligned);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)");
m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)");
m.def("forward_v1", &ROIAlign_forwardV1, "Roi_Align V1 forward (CUDA)");
m.def("backward_v1", &ROIAlign_backwardV1, "Roi_Align V1 backward (CUDA)");
m.def("forward_v2", &ROIAlign_forwardV2, "Roi_Align V2 forward (CUDA)");
m.def("backward_v2", &ROIAlign_backwardV2, "Roi_Align V2 backward (CUDA)");
}
......@@ -62,13 +62,11 @@ __device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data,
}
template <typename scalar_t>
__global__ void ROIAlignForward(const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois,
const scalar_t spatial_scale,
const int sample_num, const int channels,
const int height, const int width,
const int pooled_height, const int pooled_width,
scalar_t *top_data) {
__global__ void ROIAlignForwardV1(
const int nthreads, const scalar_t *bottom_data,
const scalar_t *bottom_rois, const scalar_t spatial_scale,
const int sample_num, const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, scalar_t *top_data) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the aligned output
int pw = index % pooled_width;
......@@ -131,8 +129,9 @@ int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois,
const scalar_t *rois_data = rois.data<scalar_t>();
scalar_t *top_data = output.data<scalar_t>();
ROIAlignForward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>(
ROIAlignForwardV1<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
output_size, bottom_data, rois_data, scalar_t(spatial_scale),
sample_num, channels, height, width, pooled_height,
pooled_width, top_data);
......@@ -186,7 +185,7 @@ __device__ void bilinear_interpolate_gradient(const int height, const int width,
}
template <typename scalar_t>
__global__ void ROIAlignBackward(
__global__ void ROIAlignBackwardV1(
const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_rois,
const scalar_t spatial_scale, const int sample_num, const int channels,
const int height, const int width, const int pooled_height,
......@@ -272,12 +271,13 @@ int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
exit(-1);
}
ROIAlignBackward<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, at::cuda::getCurrentCUDAStream()>>>(
ROIAlignBackwardV1<scalar_t>
<<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0,
at::cuda::getCurrentCUDAStream()>>>(
output_size, top_diff, rois_data, spatial_scale, sample_num,
channels, height, width, pooled_height, pooled_width,
bottom_diff);
}));
THCudaCheck(cudaGetLastError());
return 1;
}
}
\ No newline at end of file
// Modified from
// https://github.com/facebookresearch/detectron2/tree/master/detectron2/layers/csrc/ROIAlign
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAApplyUtils.cuh>
// TODO make it in a common file
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
i += blockDim.x * gridDim.x)
template <typename T>
__device__ T bilinear_interpolate(const T* bottom_data, const int height,
const int width, T y, T x,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
return 0;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
int y_low = (int)y;
int x_low = (int)x;
int y_high;
int x_high;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// do bilinear interpolation
T v1 = bottom_data[y_low * width + x_low];
T v2 = bottom_data[y_low * width + x_high];
T v3 = bottom_data[y_high * width + x_low];
T v4 = bottom_data[y_high * width + x_high];
T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename T>
__global__ void RoIAlignForwardV2(
const int nthreads, const T* bottom_data, const T spatial_scale,
const int channels, const int height, const int width,
const int pooled_height, const int pooled_width, const int sampling_ratio,
const T* bottom_rois, T* top_data, bool aligned) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
const T* offset_bottom_data =
bottom_data + (roi_batch_ind * channels + c) * height * width;
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
// When the grid is empty, output zeros.
const T count = max(roi_bin_grid_h * roi_bin_grid_w, 1); // e.g. = 4
T output_val = 0.;
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T val = bilinear_interpolate(offset_bottom_data, height, width, y, x,
index);
output_val += val;
}
}
output_val /= count;
top_data[index] = output_val;
}
}
template <typename T>
__device__ void bilinear_interpolate_gradient(
const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
int& x_low, int& x_high, int& y_low, int& y_high,
const int index /* index for debug only*/) {
// deal with cases that inverse elements are out of feature map boundary
if (y < -1.0 || y > height || x < -1.0 || x > width) {
// empty
w1 = w2 = w3 = w4 = 0.;
x_low = x_high = y_low = y_high = -1;
return;
}
if (y <= 0) y = 0;
if (x <= 0) x = 0;
y_low = (int)y;
x_low = (int)x;
if (y_low >= height - 1) {
y_high = y_low = height - 1;
y = (T)y_low;
} else {
y_high = y_low + 1;
}
if (x_low >= width - 1) {
x_high = x_low = width - 1;
x = (T)x_low;
} else {
x_high = x_low + 1;
}
T ly = y - y_low;
T lx = x - x_low;
T hy = 1. - ly, hx = 1. - lx;
// reference in forward
// T v1 = bottom_data[y_low * width + x_low];
// T v2 = bottom_data[y_low * width + x_high];
// T v3 = bottom_data[y_high * width + x_low];
// T v4 = bottom_data[y_high * width + x_high];
// T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
return;
}
template <typename T>
__global__ void RoIAlignBackwardFeatureV2(
const int nthreads, const T* top_diff, const int num_rois,
const T spatial_scale, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int sampling_ratio, T* bottom_diff, const T* bottom_rois,
bool aligned) {
CUDA_1D_KERNEL_LOOP(index, nthreads) {
// (n, c, ph, pw) is an element in the pooled output
int pw = index % pooled_width;
int ph = (index / pooled_width) % pooled_height;
int c = (index / pooled_width / pooled_height) % channels;
int n = index / pooled_width / pooled_height / channels;
const T* offset_bottom_rois = bottom_rois + n * 5;
int roi_batch_ind = offset_bottom_rois[0];
// Do not use rounding; this implementation detail is critical
T offset = aligned ? (T)0.5 : (T)0.0;
T roi_start_w = offset_bottom_rois[1] * spatial_scale - offset;
T roi_start_h = offset_bottom_rois[2] * spatial_scale - offset;
T roi_end_w = offset_bottom_rois[3] * spatial_scale - offset;
T roi_end_h = offset_bottom_rois[4] * spatial_scale - offset;
T roi_width = roi_end_w - roi_start_w;
T roi_height = roi_end_h - roi_start_h;
if (!aligned) { // for backward-compatibility only
roi_width = max(roi_width, (T)1.);
roi_height = max(roi_height, (T)1.);
}
T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
T* offset_bottom_diff =
bottom_diff + (roi_batch_ind * channels + c) * height * width;
int top_offset = (n * channels + c) * pooled_height * pooled_width;
const T* offset_top_diff = top_diff + top_offset;
const T top_diff_this_bin = offset_top_diff[ph * pooled_width + pw];
// We use roi_bin_grid to sample the grid and mimic integral
int roi_bin_grid_h = (sampling_ratio > 0)
? sampling_ratio
: ceil(roi_height / pooled_height); // e.g., = 2
int roi_bin_grid_w =
(sampling_ratio > 0) ? sampling_ratio : ceil(roi_width / pooled_width);
// We do average (integral) pooling inside a bin
const T count = roi_bin_grid_h * roi_bin_grid_w; // e.g. = 4
for (int iy = 0; iy < roi_bin_grid_h; iy++) // e.g., iy = 0, 1
{
const T y = roi_start_h + ph * bin_size_h +
static_cast<T>(iy + .5f) * bin_size_h /
static_cast<T>(roi_bin_grid_h); // e.g., 0.5, 1.5
for (int ix = 0; ix < roi_bin_grid_w; ix++) {
const T x = roi_start_w + pw * bin_size_w +
static_cast<T>(ix + .5f) * bin_size_w /
static_cast<T>(roi_bin_grid_w);
T w1, w2, w3, w4;
int x_low, x_high, y_low, y_high;
bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4,
x_low, x_high, y_low, y_high, index);
T g1 = top_diff_this_bin * w1 / count;
T g2 = top_diff_this_bin * w2 / count;
T g3 = top_diff_this_bin * w3 / count;
T g4 = top_diff_this_bin * w4 / count;
if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) {
atomicAdd(offset_bottom_diff + y_low * width + x_low,
static_cast<T>(g1));
atomicAdd(offset_bottom_diff + y_low * width + x_high,
static_cast<T>(g2));
atomicAdd(offset_bottom_diff + y_high * width + x_low,
static_cast<T>(g3));
atomicAdd(offset_bottom_diff + y_high * width + x_high,
static_cast<T>(g4));
} // if
} // ix
} // iy
} // CUDA_1D_KERNEL_LOOP
} // RoIAlignBackward
at::Tensor ROIAlignForwardV2Laucher(const at::Tensor& input,
const at::Tensor& rois,
const float spatial_scale,
const int pooled_height,
const int pooled_width,
const int sampling_ratio, bool aligned) {
AT_ASSERTM(input.device().is_cuda(), "input must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg input_t{input, "input", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_forward_cuda";
at::checkAllSameGPU(c, {input_t, rois_t});
at::checkAllSameType(c, {input_t, rois_t});
at::cuda::CUDAGuard device_guard(input.device());
auto num_rois = rois.size(0);
auto channels = input.size(1);
auto height = input.size(2);
auto width = input.size(3);
auto output = at::empty({num_rois, channels, pooled_height, pooled_width},
input.options());
auto output_size = num_rois * pooled_height * pooled_width * channels;
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(at::cuda::ATenCeilDiv(output_size, 512L), 4096L));
dim3 block(512);
if (output.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] {
RoIAlignForwardV2<scalar_t><<<grid, block, 0, stream>>>(
output_size, input.contiguous().data<scalar_t>(), spatial_scale,
channels, height, width, pooled_height, pooled_width, sampling_ratio,
rois.contiguous().data<scalar_t>(), output.data<scalar_t>(), aligned);
});
cudaDeviceSynchronize();
AT_CUDA_CHECK(cudaGetLastError());
return output;
}
// TODO remove the dependency on input and use instead its sizes -> save memory
at::Tensor ROIAlignBackwardV2Laucher(
const at::Tensor& grad, const at::Tensor& rois, const float spatial_scale,
const int pooled_height, const int pooled_width, const int batch_size,
const int channels, const int height, const int width,
const int sampling_ratio, bool aligned) {
AT_ASSERTM(grad.device().is_cuda(), "grad must be a CUDA tensor");
AT_ASSERTM(rois.device().is_cuda(), "rois must be a CUDA tensor");
at::TensorArg grad_t{grad, "grad", 1}, rois_t{rois, "rois", 2};
at::CheckedFrom c = "ROIAlign_backward_cuda";
at::checkAllSameGPU(c, {grad_t, rois_t});
at::checkAllSameType(c, {grad_t, rois_t});
at::cuda::CUDAGuard device_guard(grad.device());
auto num_rois = rois.size(0);
auto grad_input =
at::zeros({batch_size, channels, height, width}, grad.options());
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
dim3 grid(std::min(at::cuda::ATenCeilDiv(grad.numel(), 512L), 4096L));
dim3 block(512);
// handle possibly empty gradients
if (grad.numel() == 0) {
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] {
RoIAlignBackwardFeatureV2<scalar_t><<<grid, block, 0, stream>>>(
grad.numel(), grad.contiguous().data<scalar_t>(), num_rois,
spatial_scale, channels, height, width, pooled_height, pooled_width,
sampling_ratio, grad_input.data<scalar_t>(),
rois.contiguous().data<scalar_t>(), aligned);
});
AT_CUDA_CHECK(cudaGetLastError());
return grad_input;
}
......@@ -237,7 +237,11 @@ if __name__ == '__main__':
make_cuda_ext(
name='roi_align_cuda',
module='mmdet.ops.roi_align',
sources=['src/roi_align_cuda.cpp', 'src/roi_align_kernel.cu']),
sources=[
'src/roi_align_cuda.cpp',
'src/roi_align_kernel.cu',
'src/roi_align_kernel_v2.cu',
]),
make_cuda_ext(
name='roi_pool_cuda',
module='mmdet.ops.roi_pool',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment