Commit 7ed8d51e authored by Wenwei Zhang, committed by GitHub

Change V2.0 coords (#2380)

* Refactor (all): change coordinate system

* Fix (mask_head): fix cat -1 bug in mask_paste

* Fix (unittest): modify unittest and pass CI

* reformat to pass CI

* Fix round coordinates bugs

* clean file

* Fix (test): use cpu version of aligned roi_align in tests

* Refactor (mask): clean np.stack

* Refactor (head): reformat code and fix missing -1

* Reformat: reformat and add doc strings

* Refactor (mask_head): cleaner docstring
parent 5db9b2e3
Showing 289 additions and 172 deletions
@@ -8,10 +8,10 @@ class AnchorGenerator(object):
         >>> self = AnchorGenerator(9, [1.], [1.])
         >>> all_anchors = self.grid_anchors((2, 2), device='cpu')
         >>> print(all_anchors)
-        tensor([[ 0.,  0.,  8.,  8.],
-                [16.,  0., 24.,  8.],
-                [ 0., 16.,  8., 24.],
-                [16., 16., 24., 24.]])
+        tensor([[-4.5000, -4.5000,  4.5000,  4.5000],
+                [11.5000, -4.5000, 20.5000,  4.5000],
+                [-4.5000, 11.5000,  4.5000, 20.5000],
+                [11.5000, 11.5000, 20.5000, 20.5000]])
     """

     def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None):
@@ -30,8 +30,8 @@ class AnchorGenerator(object):
         w = self.base_size
         h = self.base_size
         if self.ctr is None:
-            x_ctr = 0.5 * (w - 1)
-            y_ctr = 0.5 * (h - 1)
+            x_ctr = 0.
+            y_ctr = 0.
         else:
             x_ctr, y_ctr = self.ctr
@@ -44,14 +44,13 @@ class AnchorGenerator(object):
         ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1)
         hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1)
-        # yapf: disable
-        base_anchors = torch.stack(
-            [
-                x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1),
-                x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1)
-            ],
-            dim=-1).round()
-        # yapf: enable
+        # use float anchor and the anchor's center is aligned with the
+        # pixel center
+        base_anchors = [
+            x_ctr - 0.5 * ws, y_ctr - 0.5 * hs, x_ctr + 0.5 * ws,
+            y_ctr + 0.5 * hs
+        ]
+        base_anchors = torch.stack(base_anchors, dim=-1)
         return base_anchors
......
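Under the new convention the base anchor stays a float box centered on the origin instead of being rounded and shifted by half a pixel. A minimal doctest-style check of the arithmetic above (assuming gen_base_anchors() is the method being patched; with base_size=9 the box is simply +/-4.5 around (0, 0)):

>>> self = AnchorGenerator(9, [1.], [1.])
>>> self.gen_base_anchors()
tensor([[-4.5000, -4.5000,  4.5000,  4.5000]])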
@@ -22,10 +22,10 @@ def calc_region(bbox, ratio, featmap_size=None):
     x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long()
     y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long()
     if featmap_size is not None:
-        x1 = x1.clamp(min=0, max=featmap_size[1] - 1)
-        y1 = y1.clamp(min=0, max=featmap_size[0] - 1)
-        x2 = x2.clamp(min=0, max=featmap_size[1] - 1)
-        y2 = y2.clamp(min=0, max=featmap_size[0] - 1)
+        x1 = x1.clamp(min=0, max=featmap_size[1])
+        y1 = y1.clamp(min=0, max=featmap_size[0])
+        x2 = x2.clamp(min=0, max=featmap_size[1])
+        y2 = y2.clamp(min=0, max=featmap_size[0])
     return (x1, y1, x2, y2)
@@ -76,8 +76,8 @@ def ga_loc_target(gt_bboxes_list,
             all_ignore_map.append(ignore_map)
         for img_id in range(img_per_gpu):
             gt_bboxes = gt_bboxes_list[img_id]
-            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) *
-                               (gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1))
+            scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
+                               (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
             min_anchor_size = scale.new_full(
                 (1, ), float(anchor_scale * anchor_strides[0]))
             # assign gt bboxes to different feature levels w.r.t. their scales
......
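The level-assignment scale now uses the continuous width and height. A quick check of the patched expression (sqrt(32 * 64) = sqrt(2048)):

>>> gt_bboxes = torch.Tensor([[0., 0., 32., 64.]])
>>> torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0]) *
...            (gt_bboxes[:, 3] - gt_bboxes[:, 1]))
tensor([45.2548])

The old +1 convention would have given sqrt(33 * 65), roughly 46.31, for the same box.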
@@ -30,8 +30,8 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> bbox_overlaps(bboxes1, bboxes2)
-       tensor([[0.5238, 0.0500, 0.0041],
-               [0.0323, 0.0452, 1.0000],
-               [0.0000, 0.0000, 0.0000]])
+       tensor([[0.5000, 0.0000, 0.0000],
+               [0.0000, 0.0000, 1.0000],
+               [0.0000, 0.0000, 0.0000]])

    Example:
@@ -58,14 +58,14 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
-       wh = (rb - lt + 1).clamp(min=0)  # [rows, 2]
+       wh = (rb - lt).clamp(min=0)  # [rows, 2]
        overlap = wh[:, 0] * wh[:, 1]
-       area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
-           bboxes1[:, 3] - bboxes1[:, 1] + 1)
+       area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (
+           bboxes1[:, 3] - bboxes1[:, 1])
        if mode == 'iou':
-           area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
-               bboxes2[:, 3] - bboxes2[:, 1] + 1)
+           area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (
+               bboxes2[:, 3] - bboxes2[:, 1])
            ious = overlap / (area1 + area2 - overlap)
        else:
            ious = overlap / area1
@@ -73,14 +73,14 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]
-       wh = (rb - lt + 1).clamp(min=0)  # [rows, cols, 2]
+       wh = (rb - lt).clamp(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]
-       area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
-           bboxes1[:, 3] - bboxes1[:, 1] + 1)
+       area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (
+           bboxes1[:, 3] - bboxes1[:, 1])
        if mode == 'iou':
-           area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
-               bboxes2[:, 3] - bboxes2[:, 1] + 1)
+           area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (
+               bboxes2[:, 3] - bboxes2[:, 1])
            ious = overlap / (area1[:, None] + area2 - overlap)
        else:
            ious = overlap / (area1[:, None])
......
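A worked check of the new IoU arithmetic: two 10x10 boxes offset by 5 pixels intersect in a 5x5 region, so the IoU is 25 / (100 + 100 - 25) = 1/7:

>>> bboxes1 = torch.FloatTensor([[0, 0, 10, 10]])
>>> bboxes2 = torch.FloatTensor([[5, 5, 15, 15]])
>>> bbox_overlaps(bboxes1, bboxes2)
tensor([[0.1429]])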
@@ -10,13 +10,13 @@ def bbox2delta(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]):
    gt = gt.float()
    px = (proposals[..., 0] + proposals[..., 2]) * 0.5
    py = (proposals[..., 1] + proposals[..., 3]) * 0.5
-   pw = proposals[..., 2] - proposals[..., 0] + 1.0
-   ph = proposals[..., 3] - proposals[..., 1] + 1.0
+   pw = proposals[..., 2] - proposals[..., 0]
+   ph = proposals[..., 3] - proposals[..., 1]
    gx = (gt[..., 0] + gt[..., 2]) * 0.5
    gy = (gt[..., 1] + gt[..., 3]) * 0.5
-   gw = gt[..., 2] - gt[..., 0] + 1.0
-   gh = gt[..., 3] - gt[..., 1] + 1.0
+   gw = gt[..., 2] - gt[..., 0]
+   gh = gt[..., 3] - gt[..., 1]
    dx = (gx - px) / pw
    dy = (gy - py) / ph
@@ -71,9 +71,9 @@ def delta2bbox(rois,
        >>>                        [ 0.7, -1.9, -0.5,  0.3]])
        >>> delta2bbox(rois, deltas, max_shape=(32, 32))
        tensor([[0.0000, 0.0000, 1.0000, 1.0000],
-               [0.2817, 0.2817, 4.7183, 4.7183],
-               [0.0000, 0.6321, 7.3891, 0.3679],
-               [5.8967, 2.9251, 5.5033, 3.2749]])
+               [0.1409, 0.1409, 2.8591, 2.8591],
+               [0.0000, 0.3161, 4.1945, 0.6839],
+               [5.0000, 5.0000, 5.0000, 5.0000]])
    """
    means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4)
    stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4)
@@ -89,8 +89,8 @@ def delta2bbox(rois,
    px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx)
    py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy)
    # Compute width/height of each roi
-   pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw)
-   ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh)
+   pw = (rois[:, 2] - rois[:, 0]).unsqueeze(1).expand_as(dw)
+   ph = (rois[:, 3] - rois[:, 1]).unsqueeze(1).expand_as(dh)
    # Use exp(network energy) to enlarge/shrink each roi
    gw = pw * dw.exp()
    gh = ph * dh.exp()
@@ -98,15 +98,15 @@ def delta2bbox(rois,
    gx = torch.addcmul(px, 1, pw, dx)  # gx = px + pw * dx
    gy = torch.addcmul(py, 1, ph, dy)  # gy = py + ph * dy
    # Convert center-xy/width/height to top-left, bottom-right
-   x1 = gx - gw * 0.5 + 0.5
-   y1 = gy - gh * 0.5 + 0.5
-   x2 = gx + gw * 0.5 - 0.5
-   y2 = gy + gh * 0.5 - 0.5
+   x1 = gx - gw * 0.5
+   y1 = gy - gh * 0.5
+   x2 = gx + gw * 0.5
+   y2 = gy + gh * 0.5
    if max_shape is not None:
-       x1 = x1.clamp(min=0, max=max_shape[1] - 1)
-       y1 = y1.clamp(min=0, max=max_shape[0] - 1)
-       x2 = x2.clamp(min=0, max=max_shape[1] - 1)
-       y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+       x1 = x1.clamp(min=0, max=max_shape[1])
+       y1 = y1.clamp(min=0, max=max_shape[0])
+       x2 = x2.clamp(min=0, max=max_shape[1])
+       y2 = y2.clamp(min=0, max=max_shape[0])
    bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas)
    return bboxes
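With the half-pixel offsets removed, an all-zero delta decodes a RoI back to itself directly under the width-based convention. A sketch with the default means/stds:

>>> rois = torch.Tensor([[0., 0., 10., 10.]])
>>> deltas = torch.Tensor([[0., 0., 0., 0.]])
>>> delta2bbox(rois, deltas)
tensor([[ 0.,  0., 10., 10.]])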
@@ -124,8 +124,8 @@ def bbox_flip(bboxes, img_shape):
    if isinstance(bboxes, torch.Tensor):
        assert bboxes.shape[-1] % 4 == 0
        flipped = bboxes.clone()
-       flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] - 1
-       flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] - 1
+       flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4]
+       flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4]
        return flipped
    elif isinstance(bboxes, np.ndarray):
        return mmcv.bbox_flip(bboxes, img_shape)
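Flipping is now an exact mirror about the image width, with no off-by-one. For a 100-pixel-wide image (img_shape is (h, w)):

>>> bboxes = torch.Tensor([[10., 10., 20., 20.]])
>>> bbox_flip(bboxes, (50, 100))
tensor([[80., 10., 90., 20.]])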
@@ -216,8 +216,8 @@ def distance2bbox(points, distance, max_shape=None):
    x2 = points[:, 0] + distance[:, 2]
    y2 = points[:, 1] + distance[:, 3]
    if max_shape is not None:
-       x1 = x1.clamp(min=0, max=max_shape[1] - 1)
-       y1 = y1.clamp(min=0, max=max_shape[0] - 1)
-       x2 = x2.clamp(min=0, max=max_shape[1] - 1)
-       y2 = y2.clamp(min=0, max=max_shape[0] - 1)
+       x1 = x1.clamp(min=0, max=max_shape[1])
+       y1 = y1.clamp(min=0, max=max_shape[0])
+       x2 = x2.clamp(min=0, max=max_shape[1])
+       y2 = y2.clamp(min=0, max=max_shape[0])
    return torch.stack([x1, y1, x2, y2], -1)
@@ -28,17 +28,15 @@ def bbox_overlaps(bboxes1, bboxes2, mode='iou'):
        bboxes1, bboxes2 = bboxes2, bboxes1
        ious = np.zeros((cols, rows), dtype=np.float32)
        exchange = True
-   area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
-       bboxes1[:, 3] - bboxes1[:, 1] + 1)
-   area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
-       bboxes2[:, 3] - bboxes2[:, 1] + 1)
+   area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
+   area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
    for i in range(bboxes1.shape[0]):
        x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0])
        y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1])
        x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2])
        y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3])
-       overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum(
-           y_end - y_start + 1, 0)
+       overlap = np.maximum(x_end - x_start, 0) * np.maximum(
+           y_end - y_start, 0)
        if mode == 'iou':
            union = area1[i] + area2 - overlap
        else:
......
@@ -98,14 +98,14 @@ def tpfp_imagenet(det_bboxes,
        if area_ranges == [(None, None)]:
            fp[...] = 1
        else:
-           det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
-               det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
+           det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
+               det_bboxes[:, 3] - det_bboxes[:, 1])
            for i, (min_area, max_area) in enumerate(area_ranges):
                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
        return tp, fp
    ious = bbox_overlaps(det_bboxes, gt_bboxes - 1)
-   gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1
-   gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1
+   gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
+   gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
    iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)),
                          default_iou_thr)
    # sort all detections by scores in descending order
@@ -144,7 +144,7 @@ def tpfp_imagenet(det_bboxes,
                    fp[k, i] = 1
            else:
                bbox = det_bboxes[i, :4]
-               area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
+               area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                if area >= min_area and area < max_area:
                    fp[k, i] = 1
    return tp, fp
@@ -194,8 +194,8 @@ def tpfp_default(det_bboxes,
        if area_ranges == [(None, None)]:
            fp[...] = 1
        else:
-           det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * (
-               det_bboxes[:, 3] - det_bboxes[:, 1] + 1)
+           det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0]) * (
+               det_bboxes[:, 3] - det_bboxes[:, 1])
            for i, (min_area, max_area) in enumerate(area_ranges):
                fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1
        return tp, fp
@@ -213,8 +213,8 @@ def tpfp_default(det_bboxes,
        if min_area is None:
            gt_area_ignore = np.zeros_like(gt_ignore_inds, dtype=bool)
        else:
-           gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * (
-               gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)
+           gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+               gt_bboxes[:, 3] - gt_bboxes[:, 1])
            gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area)
    for i in sort_inds:
        if ious_max[i] >= iou_thr:
@@ -231,7 +231,7 @@ def tpfp_default(det_bboxes,
                    fp[k, i] = 1
            else:
                bbox = det_bboxes[i, :4]
-               area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1)
+               area = (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
                if area >= min_area and area < max_area:
                    fp[k, i] = 1
    return tp, fp
@@ -332,8 +332,8 @@ def eval_map(det_results,
            if area_ranges is None:
                num_gts[0] += bbox.shape[0]
            else:
-               gt_areas = (bbox[:, 2] - bbox[:, 0] + 1) * (
-                   bbox[:, 3] - bbox[:, 1] + 1)
+               gt_areas = (bbox[:, 2] - bbox[:, 0]) * (
+                   bbox[:, 3] - bbox[:, 1])
                for k, (min_area, max_area) in enumerate(area_ranges):
                    num_gts[k] += np.sum((gt_areas >= min_area)
                                         & (gt_areas < max_area))
......
@@ -13,21 +13,21 @@ def mask_target(pos_proposals_list, pos_assigned_gt_inds_list, gt_masks_list,

 def mask_target_single(pos_proposals, pos_assigned_gt_inds, gt_masks, cfg):
+    device = pos_proposals.device
     mask_size = _pair(cfg.mask_size)
     num_pos = pos_proposals.size(0)
     if num_pos > 0:
         proposals_np = pos_proposals.cpu().numpy()
         maxh, maxw = gt_masks.height, gt_masks.width
-        proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw - 1)
-        proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh - 1)
-        proposals_np = proposals_np.astype(np.int32)
+        proposals_np[:, [0, 2]] = np.clip(proposals_np[:, [0, 2]], 0, maxw)
+        proposals_np[:, [1, 3]] = np.clip(proposals_np[:, [1, 3]], 0, maxh)
         pos_assigned_gt_inds = pos_assigned_gt_inds.cpu().numpy()
         mask_targets = gt_masks.crop_and_resize(
-            proposals_np, mask_size, inds=pos_assigned_gt_inds).to_ndarray()
-        mask_targets = torch.from_numpy(np.stack(mask_targets)).float().to(
-            pos_proposals.device)
+            proposals_np, mask_size, device=device,
+            inds=pos_assigned_gt_inds).to_ndarray()
+        mask_targets = torch.from_numpy(mask_targets).float().to(device)
     else:
         mask_targets = pos_proposals.new_zeros((0, ) + mask_size)
......
@@ -5,6 +5,8 @@ import numpy as np
 import pycocotools.mask as maskUtils
 import torch

+from mmdet.ops.roi_align import roi_align
+

 class BaseInstanceMasks(metaclass=ABCMeta):
@@ -185,11 +187,11 @@ class BitmapMasks(BaseInstanceMasks):
         # clip the boundary
         bbox = bbox.copy()
-        bbox[0::2] = np.clip(bbox[0::2], 0, self.width - 1)
-        bbox[1::2] = np.clip(bbox[1::2], 0, self.height - 1)
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
         x1, y1, x2, y2 = bbox
-        w = np.maximum(x2 - x1 + 1, 1)
-        h = np.maximum(y2 - y1 + 1, 1)
+        w = np.maximum(x2 - x1, 1)
+        h = np.maximum(y2 - y1, 1)
         if len(self.masks) == 0:
             cropped_masks = np.empty((0, h, w), dtype=np.uint8)
@@ -201,6 +203,7 @@ class BitmapMasks(BaseInstanceMasks):
                         bboxes,
                         out_shape,
                         inds,
+                        device='cpu',
                         interpolation='bilinear'):
         """Crop and resize masks by the given bboxes.
@@ -209,9 +212,10 @@ class BitmapMasks(BaseInstanceMasks):
         assigned bbox and resize to the size of (mask_h, mask_w)

         Args:
-            bboxes (ndarray): bboxes in format [x1, y1, x2, y2], shape (N, 4)
+            bboxes (Tensor): bboxes in format [x1, y1, x2, y2], shape (N, 4)
             out_shape (tuple[int]): target (h, w) of resized mask
             inds (ndarray): indexes to assign masks to each bbox
+            device (str): device of bboxes
             interpolation (str): see `mmcv.imresize`

         Return:
@@ -221,19 +225,26 @@ class BitmapMasks(BaseInstanceMasks):
             empty_masks = np.empty((0, *out_shape), dtype=np.uint8)
             return BitmapMasks(empty_masks, *out_shape)

-        resized_masks = []
-        for i in range(len(bboxes)):
-            mask = self.masks[inds[i]]
-            bbox = bboxes[i, :].astype(np.int32)
-            x1, y1, x2, y2 = bbox
-            w = np.maximum(x2 - x1 + 1, 1)
-            h = np.maximum(y2 - y1 + 1, 1)
-            resized_masks.append(
-                mmcv.imresize(
-                    mask[y1:y1 + h, x1:x1 + w],
-                    out_shape,
-                    interpolation=interpolation))
-        return BitmapMasks(np.stack(resized_masks), *out_shape)
+        # convert bboxes to tensor
+        if isinstance(bboxes, np.ndarray):
+            bboxes = torch.from_numpy(bboxes).to(device=device)
+        if isinstance(inds, np.ndarray):
+            inds = torch.from_numpy(inds).to(device=device)
+
+        num_bbox = bboxes.shape[0]
+        fake_inds = torch.arange(
+            num_bbox, device=device).to(dtype=bboxes.dtype)[:, None]
+        rois = torch.cat([fake_inds, bboxes], dim=1)  # Nx5
+        rois = rois.to(device=device)
+        if num_bbox > 0:
+            gt_masks_th = torch.from_numpy(self.masks).to(device).index_select(
+                0, inds).to(dtype=rois.dtype)
+            targets = roi_align(gt_masks_th[:, None, :, :], rois, out_shape,
+                                1.0, 0, True).squeeze(1)
+            resized_masks = (targets >= 0.5).cpu().numpy()
+        else:
+            resized_masks = []
+        return BitmapMasks(resized_masks, *out_shape)
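A hedged usage sketch of the tensor-based crop_and_resize above; the shapes and dtypes here are illustrative assumptions, not part of the commit:

import numpy as np
# two 32x32 binary masks; crop mask 0 to a box and resize it to 28x28
masks = BitmapMasks(np.zeros((2, 32, 32), dtype=np.uint8), 32, 32)
bboxes = np.array([[0., 0., 16., 16.]], dtype=np.float32)
inds = np.array([0], dtype=np.int64)
out = masks.crop_and_resize(bboxes, (28, 28), inds, device='cpu')
# out is a BitmapMasks whose masks array has shape (1, 28, 28); ndarray
# inputs are converted to tensors internally, sampled with the aligned
# roi_align, then binarized at 0.5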
     def expand(self, expanded_h, expanded_w, top, left):
         """see `transforms.Expand`."""
@@ -355,7 +366,7 @@ class PolygonMasks(BaseInstanceMasks):
             flipped_poly_per_obj = []
             for p in poly_per_obj:
                 p = p.copy()
-                p[idx::2] = dim - p[idx::2] - 1
+                p[idx::2] = dim - p[idx::2]
                 flipped_poly_per_obj.append(p)
             flipped_masks.append(flipped_poly_per_obj)
         flipped_masks = PolygonMasks(flipped_masks, self.height,
@@ -369,11 +380,11 @@ class PolygonMasks(BaseInstanceMasks):
         # clip the boundary
         bbox = bbox.copy()
-        bbox[0::2] = np.clip(bbox[0::2], 0, self.width - 1)
-        bbox[1::2] = np.clip(bbox[1::2], 0, self.height - 1)
+        bbox[0::2] = np.clip(bbox[0::2], 0, self.width)
+        bbox[1::2] = np.clip(bbox[1::2], 0, self.height)
         x1, y1, x2, y2 = bbox
-        w = np.maximum(x2 - x1 + 1, 1)
-        h = np.maximum(y2 - y1 + 1, 1)
+        w = np.maximum(x2 - x1, 1)
+        h = np.maximum(y2 - y1, 1)
         if len(self.masks) == 0:
             cropped_masks = PolygonMasks([], h, w)
@@ -402,6 +413,7 @@ class PolygonMasks(BaseInstanceMasks):
                         bboxes,
                         out_shape,
                         inds,
+                        device='cpu',
                         interpolation='bilinear'):
         """see BitmapMasks.crop_and_resize"""
         out_h, out_w = out_shape
@@ -413,8 +425,8 @@ class PolygonMasks(BaseInstanceMasks):
             mask = self.masks[inds[i]]
             bbox = bboxes[i, :].astype(np.int32)
             x1, y1, x2, y2 = bbox
-            w = np.maximum(x2 - x1 + 1, 1)
-            h = np.maximum(y2 - y1 + 1, 1)
+            w = np.maximum(x2 - x1, 1)
+            h = np.maximum(y2 - y1, 1)
             h_scale = out_h / h
             w_scale = out_w / w
......
@@ -60,7 +60,7 @@ class CityscapesDataset(CocoDataset):
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
-           bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
+           bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
......
@@ -86,7 +86,7 @@ class CocoDataset(CustomDataset):
            x1, y1, w, h = ann['bbox']
            if ann['area'] <= 0 or w < 1 or h < 1:
                continue
-           bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
+           bbox = [x1, y1, x1 + w, y1 + h]
            if ann.get('iscrowd', False):
                gt_bboxes_ignore.append(bbox)
            else:
@@ -122,8 +122,8 @@ class CocoDataset(CustomDataset):
        return [
            _bbox[0],
            _bbox[1],
-           _bbox[2] - _bbox[0] + 1,
-           _bbox[3] - _bbox[1] + 1,
+           _bbox[2] - _bbox[0],
+           _bbox[3] - _bbox[1],
        ]

    def _proposal2json(self, results):
@@ -249,7 +249,7 @@ class CocoDataset(CustomDataset):
            if ann.get('ignore', False) or ann['iscrowd']:
                continue
            x1, y1, w, h = ann['bbox']
-           bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1])
+           bboxes.append([x1, y1, x1 + w, y1 + h])
        bboxes = np.array(bboxes, dtype=np.float32)
        if bboxes.shape[0] == 0:
            bboxes = np.zeros((0, 4))
......
@@ -44,7 +44,8 @@ class InstaBoost(object):
            bbox = bboxes[i]
            mask = masks[i]
            x1, y1, x2, y2 = bbox
-           bbox = [x1, y1, x2 - x1 + 1, y2 - y1 + 1]
+           # assert (x2 - x1) >= 1 and (y2 - y1) >= 1
+           bbox = [x1, y1, x2 - x1, y2 - y1]
            anns.append({
                'category_id': label,
                'segmentation': mask,
@@ -59,7 +60,10 @@ class InstaBoost(object):
        gt_masks_ann = []
        for ann in anns:
            x1, y1, w, h = ann['bbox']
-           bbox = [x1, y1, x1 + w - 1, y1 + h - 1]
+           # TODO: a more essential bug needs to be fixed in instaboost
+           if w <= 0 or h <= 0:
+               continue
+           bbox = [x1, y1, x1 + w, y1 + h]
            gt_bboxes.append(bbox)
            gt_labels.append(ann['category_id'])
            gt_masks_ann.append(ann['segmentation'])
@@ -73,6 +77,7 @@ class InstaBoost(object):
    def __call__(self, results):
        img = results['img']
+       orig_type = img.dtype
        anns = self._load_anns(results)
        if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]):
            try:
@@ -81,8 +86,9 @@ class InstaBoost(object):
                raise ImportError('Please run "pip install instaboostfast" '
                                  'to install instaboostfast first.')
            anns, img = instaboost.get_new_data(
-               anns, img, self.cfg, background=None)
-       results = self._parse_anns(results, anns, img)
+               anns, img.astype(np.uint8), self.cfg, background=None)
+       results = self._parse_anns(results, anns, img.astype(orig_type))
        return results

    def __repr__(self):
......
@@ -143,8 +143,8 @@ class Resize(object):
        img_shape = results['img_shape']
        for key in results.get('bbox_fields', []):
            bboxes = results[key] * results['scale_factor']
-           bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1] - 1)
-           bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0] - 1)
+           bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+           bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            results[key] = bboxes

    def _resize_masks(self, results):
@@ -215,12 +215,12 @@ class RandomFlip(object):
        flipped = bboxes.copy()
        if direction == 'horizontal':
            w = img_shape[1]
-           flipped[..., 0::4] = w - bboxes[..., 2::4] - 1
-           flipped[..., 2::4] = w - bboxes[..., 0::4] - 1
+           flipped[..., 0::4] = w - bboxes[..., 2::4]
+           flipped[..., 2::4] = w - bboxes[..., 0::4]
        elif direction == 'vertical':
            h = img_shape[0]
-           flipped[..., 1::4] = h - bboxes[..., 3::4] - 1
-           flipped[..., 3::4] = h - bboxes[..., 1::4] - 1
+           flipped[..., 1::4] = h - bboxes[..., 3::4]
+           flipped[..., 3::4] = h - bboxes[..., 1::4]
        else:
            raise ValueError(
                'Invalid flipping direction "{}"'.format(direction))
@@ -372,8 +372,8 @@ class RandomCrop(object):
            bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
                                   dtype=np.float32)
            bboxes = results[key] - bbox_offset
-           bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1] - 1)
-           bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0] - 1)
+           bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
+           bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            results[key] = bboxes

        # crop semantic seg
......
@@ -47,6 +47,7 @@ class XMLDataset(CustomDataset):
                label = self.cat2label[name]
                difficult = int(obj.find('difficult').text)
                bnd_box = obj.find('bndbox')
+               # TODO: check whether it is necessary to use int
                # Coordinates may be float type
                bbox = [
                    int(float(bnd_box.find('xmin').text)),
......
@@ -369,8 +369,8 @@ class FCOSHead(nn.Module):
            return gt_labels.new_zeros(num_points), \
                   gt_bboxes.new_zeros((num_points, 4))
-       areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * (
-           gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1)
+       areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * (
+           gt_bboxes[:, 3] - gt_bboxes[:, 1])
        # TODO: figure out why these two are different
        # areas = areas[None].expand(num_points, num_gts)
        areas = areas[None].repeat(num_points, 1)
......
@@ -103,8 +103,8 @@ class GARPNHead(GuidedAnchorHead):
                                  self.target_stds, img_shape)
        # filter out too small bboxes
        if cfg.min_bbox_size > 0:
-           w = proposals[:, 2] - proposals[:, 0] + 1
-           h = proposals[:, 3] - proposals[:, 1] + 1
+           w = proposals[:, 2] - proposals[:, 0]
+           h = proposals[:, 3] - proposals[:, 1]
            valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
                                       (h >= cfg.min_bbox_size)).squeeze()
            proposals = proposals[valid_inds, :]
......
@@ -82,8 +82,8 @@ class RPNHead(AnchorHead):
            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
                                   self.target_stds, img_shape)
            if cfg.min_bbox_size > 0:
-               w = proposals[:, 2] - proposals[:, 0] + 1
-               h = proposals[:, 3] - proposals[:, 1] + 1
+               w = proposals[:, 2] - proposals[:, 0]
+               h = proposals[:, 3] - proposals[:, 1]
                valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
                                           (h >= cfg.min_bbox_size)).squeeze()
                proposals = proposals[valid_inds, :]
......
@@ -75,7 +75,7 @@ class SSDHead(AnchorHead):
        for k in range(len(anchor_strides)):
            base_size = min_sizes[k]
            stride = anchor_strides[k]
-           ctr = ((stride - 1) / 2., (stride - 1) / 2.)
+           ctr = ((stride) / 2., (stride) / 2.)
            scales = [1., np.sqrt(max_sizes[k] / min_sizes[k])]
            ratios = [1.]
            for r in anchor_ratios[k]:
......
@@ -154,8 +154,8 @@ class BBoxHead(nn.Module):
        else:
            bboxes = rois[:, 1:].clone()
            if img_shape is not None:
-               bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1] - 1)
-               bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0] - 1)
+               bboxes[:, [0, 2]].clamp_(min=0, max=img_shape[1])
+               bboxes[:, [1, 3]].clamp_(min=0, max=img_shape[0])

        if rescale:
            if isinstance(scale_factor, float):
......
@@ -40,13 +40,13 @@ def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3):
    """
    pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
    pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
-   pred_w = pred[:, 2] - pred[:, 0] + 1
-   pred_h = pred[:, 3] - pred[:, 1] + 1
+   pred_w = pred[:, 2] - pred[:, 0]
+   pred_h = pred[:, 3] - pred[:, 1]
    with torch.no_grad():
        target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
        target_ctry = (target[:, 1] + target[:, 3]) * 0.5
-       target_w = target[:, 2] - target[:, 0] + 1
-       target_h = target[:, 3] - target[:, 1] + 1
+       target_w = target[:, 2] - target[:, 0]
+       target_h = target[:, 3] - target[:, 1]

    dx = target_ctrx - pred_ctrx
    dy = target_ctry - pred_ctry
@@ -91,12 +91,12 @@ def giou_loss(pred, target, eps=1e-7):
    # overlap
    lt = torch.max(pred[:, :2], target[:, :2])
    rb = torch.min(pred[:, 2:], target[:, 2:])
-   wh = (rb - lt + 1).clamp(min=0)
+   wh = (rb - lt).clamp(min=0)
    overlap = wh[:, 0] * wh[:, 1]

    # union
-   ap = (pred[:, 2] - pred[:, 0] + 1) * (pred[:, 3] - pred[:, 1] + 1)
-   ag = (target[:, 2] - target[:, 0] + 1) * (target[:, 3] - target[:, 1] + 1)
+   ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+   ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
    union = ap + ag - overlap + eps

    # IoU
@@ -105,7 +105,7 @@ def giou_loss(pred, target, eps=1e-7):
    # enclose area
    enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
    enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
-   enclose_wh = (enclose_x2y2 - enclose_x1y1 + 1).clamp(min=0)
+   enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
    enclose_area = enclose_wh[:, 0] * enclose_wh[:, 1] + eps

    # GIoU
......
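A worked check of the continuous GIoU arithmetic for a single pair:

>>> pred = torch.FloatTensor([[0., 0., 10., 10.]])
>>> target = torch.FloatTensor([[5., 5., 15., 15.]])
>>> # overlap = 5 * 5 = 25, union = 100 + 100 - 25 = 175
>>> # enclosing box [0, 0, 15, 15] has area 225
>>> # GIoU = 25/175 - (225 - 175)/225 = -0.0794, so the loss is about 1.0794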
-import mmcv
 import numpy as np
 import pycocotools.mask as mask_util
 import torch
@@ -8,9 +7,15 @@ from torch.nn.modules.utils import _pair
 from mmdet.core import auto_fp16, force_fp32, mask_target
 from mmdet.ops import ConvModule, build_upsample_layer
 from mmdet.ops.carafe import CARAFEPack
+from mmdet.ops.grid_sampler import grid_sample
 from ..builder import build_loss
 from ..registry import HEADS

+BYTES_PER_FLOAT = 4
+# TODO: This memory limit may be too much or too little. It would be better to
+# determine it based on available resources.
+GPU_MEM_LIMIT = 1024**3  # 1 GB memory limit
+

 @HEADS.register_module
 class FCNMaskHead(nn.Module):
@@ -144,7 +149,7 @@ class FCNMaskHead(nn.Module):
        """Get segmentation masks from mask_pred and bboxes.

        Args:
-           mask_pred (Tensor or ndarray): shape (n, #class+1, h, w).
+           mask_pred (Tensor or ndarray): shape (n, #class, h, w).
                For single-scale testing, mask_pred is the direct output of
                model, whose type is Tensor, while for multi-scale testing,
                it will be converted to numpy array outside of this method.
@@ -158,15 +163,15 @@ class FCNMaskHead(nn.Module):
            list[list]: encoded masks
        """
        if isinstance(mask_pred, torch.Tensor):
-           mask_pred = mask_pred.sigmoid().cpu().numpy()
-       assert isinstance(mask_pred, np.ndarray)
-       # when enabling mixed precision training, mask_pred may be float16
-       # numpy array
-       mask_pred = mask_pred.astype(np.float32)
+           mask_pred = mask_pred.sigmoid()
+       else:
+           mask_pred = det_bboxes.new_tensor(mask_pred)

-       cls_segms = [[] for _ in range(self.num_classes - 1)]
-       bboxes = det_bboxes.cpu().numpy()[:, :4]
-       labels = det_labels.cpu().numpy() + 1
+       device = mask_pred.device
+       cls_segms = [[] for _ in range(self.num_classes)
+                    ]  # BG is not included in num_classes
+       bboxes = det_bboxes[:, :4]
+       labels = det_labels + 1  # TODO: remove + 1 in cat -1

        if rescale:
            img_h, img_w = ori_shape[:2]
@@ -175,34 +180,130 @@ class FCNMaskHead(nn.Module):
            img_w = np.round(ori_shape[1] * scale_factor).astype(np.int32)
            scale_factor = 1.0

-       for i in range(bboxes.shape[0]):
-           if not isinstance(scale_factor, (float, np.ndarray)):
-               scale_factor = scale_factor.cpu().numpy()
-           bbox = (bboxes[i, :] / scale_factor).astype(np.int32)
-           label = labels[i]
-           w = max(bbox[2] - bbox[0] + 1, 1)
-           h = max(bbox[3] - bbox[1] + 1, 1)
-           if not self.class_agnostic:
-               mask_pred_ = mask_pred[i, label, :, :]
-           else:
-               mask_pred_ = mask_pred[i, 0, :, :]
+       if not isinstance(scale_factor, (float, torch.Tensor)):
+           scale_factor = bboxes.new_tensor(scale_factor)
+       bboxes = bboxes / scale_factor

-           bbox_mask = mmcv.imresize(mask_pred_, (w, h))
-           bbox_mask = (bbox_mask > rcnn_test_cfg.mask_thr_binary).astype(
-               np.uint8)
+       N = len(mask_pred)
+       # The actual implementation splits the input into chunks,
+       # and pastes them chunk by chunk.
+       if device.type == 'cpu':
+           # CPU is most efficient when they are pasted one by one with
+           # skip_empty=True, so that it performs minimal number of
+           # operations.
+           num_chunks = N
+       else:
+           # GPU benefits from parallelism for larger chunks,
+           # but may run into memory issues
+           num_chunks = int(
+               np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
+           assert (num_chunks <=
+                   N), 'Default GPU_MEM_LIMIT is too small; try increasing it'
+       chunks = torch.chunk(torch.arange(N, device=device), num_chunks)

-           if rcnn_test_cfg.get('crop_mask', False):
-               im_mask = bbox_mask
-           else:
-               im_mask = np.zeros((img_h, img_w), dtype=np.uint8)
-               im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = bbox_mask
+       threshold = rcnn_test_cfg.mask_thr_binary
+       im_mask = torch.zeros(
+           N,
+           img_h,
+           img_w,
+           device=device,
+           dtype=torch.bool if threshold >= 0 else torch.uint8)

-           if rcnn_test_cfg.get('rle_mask_encode', True):
-               rle = mask_util.encode(
-                   np.array(im_mask[:, :, np.newaxis], order='F'))[0]
-               cls_segms[label - 1].append(rle)
-           else:
-               cls_segms[label - 1].append(im_mask)
+       if not self.class_agnostic:
+           mask_pred = mask_pred[range(N), labels][:, None]
+
+       for inds in chunks:
+           masks_chunk, spatial_inds = _do_paste_mask(
+               mask_pred[inds],
+               bboxes[inds],
+               img_h,
+               img_w,
+               skip_empty=device.type == 'cpu')
+
+           if threshold >= 0:
+               masks_chunk = (masks_chunk >= threshold).to(dtype=torch.bool)
+           else:
+               # for visualization and debugging
+               masks_chunk = (masks_chunk * 255).to(dtype=torch.uint8)
+
+           im_mask[(inds, ) + spatial_inds] = masks_chunk
+
+       for i in range(N):
+           rle = mask_util.encode(
+               np.array(
+                   im_mask[i][:, :, None].cpu().numpy(),
+                   order='F',
+                   dtype='uint8'))[0]
+           cls_segms[labels[i] - 1].append(rle)  # TODO: remove -1 in cat -1

        return cls_segms
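As a sanity check on the chunking arithmetic in the GPU branch: pasting N = 100 masks into an 800 x 1333 image at float32 needs roughly 100 * 800 * 1333 * 4 bytes, about 0.4 GB, which fits under GPU_MEM_LIMIT, so everything goes in a single chunk:

>>> N, img_h, img_w = 100, 800, 1333
>>> int(np.ceil(N * img_h * img_w * BYTES_PER_FLOAT / GPU_MEM_LIMIT))
1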
+
+def _do_paste_mask(masks, boxes, img_h, img_w, skip_empty=True):
+    """Paste instance masks according to boxes.
+
+    This implementation is modified from
+    https://github.com/facebookresearch/detectron2/
+
+    Args:
+        masks (Tensor): N, 1, H, W
+        boxes (Tensor): N, 4
+        img_h (int): Height of the image to be pasted.
+        img_w (int): Width of the image to be pasted.
+        skip_empty (bool): Only paste masks within the region that
+            tightly bounds all boxes, and return the results for this
+            region only. An important optimization for CPU.
+
+    Returns:
+        tuple: (Tensor, tuple). The first item is the mask tensor, the
+            second one is the slice object.
+        If skip_empty == False, the whole image will be pasted. It will
+            return a mask of shape (N, img_h, img_w) and an empty tuple.
+        If skip_empty == True, only the area around the mask will be pasted.
+            A mask of shape (N, h', w') and its start and end coordinates
+            in the original image will be returned.
+    """
+    # On GPU, paste all masks together (up to chunk size)
+    # by using the entire image to sample the masks.
+    # Compared to pasting them one by one,
+    # this has more operations but is faster on COCO-scale datasets.
+    device = masks.device
+    if skip_empty:
+        x0_int, y0_int = torch.clamp(
+            boxes.min(dim=0).values.floor()[:2] - 1,
+            min=0).to(dtype=torch.int32)
+        x1_int = torch.clamp(
+            boxes[:, 2].max().ceil() + 1, max=img_w).to(dtype=torch.int32)
+        y1_int = torch.clamp(
+            boxes[:, 3].max().ceil() + 1, max=img_h).to(dtype=torch.int32)
+    else:
+        x0_int, y0_int = 0, 0
+        x1_int, y1_int = img_w, img_h
+    x0, y0, x1, y1 = torch.split(boxes, 1, dim=1)  # each is Nx1
+
+    N = masks.shape[0]
+
+    img_y = torch.arange(
+        y0_int, y1_int, device=device, dtype=torch.float32) + 0.5
+    img_x = torch.arange(
+        x0_int, x1_int, device=device, dtype=torch.float32) + 0.5
+    img_y = (img_y - y0) / (y1 - y0) * 2 - 1
+    img_x = (img_x - x0) / (x1 - x0) * 2 - 1
+    # img_x, img_y have shapes (N, w), (N, h)
+    if torch.isinf(img_x).any():
+        inds = torch.where(torch.isinf(img_x))
+        img_x[inds] = 0
+    if torch.isinf(img_y).any():
+        inds = torch.where(torch.isinf(img_y))
+        img_y[inds] = 0
+
+    gx = img_x[:, None, :].expand(N, img_y.size(1), img_x.size(1))
+    gy = img_y[:, :, None].expand(N, img_y.size(1), img_x.size(1))
+    grid = torch.stack([gx, gy], dim=3)
+
+    img_masks = grid_sample(
+        masks.to(dtype=torch.float32), grid, align_corners=False)
+
+    if skip_empty:
+        return img_masks[:, 0], (slice(y0_int, y1_int), slice(x0_int, x1_int))
+    else:
+        return img_masks[:, 0], ()
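A minimal sketch of calling the helper directly (shapes are illustrative; the real caller is get_seg_masks above):

masks = torch.rand(2, 1, 28, 28)  # N, 1, H, W mask probabilities
boxes = torch.Tensor([[0., 0., 50., 50.],
                      [10., 10., 80., 60.]])  # N, 4 in image coordinates
pasted, _ = _do_paste_mask(masks, boxes, img_h=100, img_w=120,
                           skip_empty=False)
# pasted has shape (2, 100, 120): each 28x28 mask is resampled into its
# box with grid_sample(align_corners=False); callers threshold it afterwards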