diff --git a/.gitignore b/.gitignore index 894a44cc066a027465cd26d634948d56d13af9af..ffbae97a51e885187c5fc0c0485e58bf6067e310 100644 --- a/.gitignore +++ b/.gitignore @@ -102,3 +102,6 @@ venv.bak/ # mypy .mypy_cache/ + +# cython generated cpp +mmdet/ops/nms/*.cpp \ No newline at end of file diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000000000000000000000000000000000000..8bf418054a26fc2ab5741298f3f3863273cd1c0a --- /dev/null +++ b/compile.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +PYTHON=${PYTHON:-"python"} + +echo "Building roi align op..." +cd mmdet/ops/roi_align +if [ -d "build" ]; then + rm -r build +fi +$PYTHON setup.py build_ext --inplace + +echo "Building roi pool op..." +cd ../roi_pool +if [ -d "build" ]; then + rm -r build +fi +$PYTHON setup.py build_ext --inplace + +echo "Building nms op..." +cd ../nms +make clean +make PYTHON=${PYTHON} diff --git a/mmdet/__init__.py b/mmdet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..58f3ace6c03d093337c9fa417ccbe8bc267b6c69 --- /dev/null +++ b/mmdet/__init__.py @@ -0,0 +1 @@ +from .version import __version__ diff --git a/mmdet/core/__init__.py b/mmdet/core/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7992d8deb3ba0f6586c1bef0705f33a41a78d917 --- /dev/null +++ b/mmdet/core/__init__.py @@ -0,0 +1,6 @@ +from .anchor_generator import * +from .bbox_ops import * +from .mask_ops import * +from .eval import * +from .nn import * +from .targets import * diff --git a/mmdet/core/anchor_generator.py b/mmdet/core/anchor_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..e7a1fa256fb6d4df69be77a341728ed194b54b7e --- /dev/null +++ b/mmdet/core/anchor_generator.py @@ -0,0 +1,80 @@ +import torch + + +class AnchorGenerator(object): + + def __init__(self, base_size, scales, ratios, scale_major=True): + self.base_size = base_size + self.scales = torch.Tensor(scales) + self.ratios = torch.Tensor(ratios) + self.scale_major = scale_major + self.base_anchors = self.gen_base_anchors() + + @property + def num_base_anchors(self): + return self.base_anchors.size(0) + + def gen_base_anchors(self): + base_anchor = torch.Tensor( + [0, 0, self.base_size - 1, self.base_size - 1]) + + w = base_anchor[2] - base_anchor[0] + 1 + h = base_anchor[3] - base_anchor[1] + 1 + x_ctr = base_anchor[0] + 0.5 * (w - 1) + y_ctr = base_anchor[1] + 0.5 * (h - 1) + + h_ratios = torch.sqrt(self.ratios) + w_ratios = 1 / h_ratios + if self.scale_major: + ws = (w * w_ratios[:, None] * self.scales[None, :]).view(-1) + hs = (h * h_ratios[:, None] * self.scales[None, :]).view(-1) + else: + ws = (w * self.scales[:, None] * w_ratios[None, :]).view(-1) + hs = (h * self.scales[:, None] * h_ratios[None, :]).view(-1) + + base_anchors = torch.stack( + [ + x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), + x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) + ], + dim=-1).round() + + return base_anchors + + def _meshgrid(self, x, y, row_major=True): + xx = x.repeat(len(y)) + yy = y.view(-1, 1).repeat(1, len(x)).view(-1) + if row_major: + return xx, yy + else: + return yy, xx + + def grid_anchors(self, featmap_size, stride=16, device='cuda'): + feat_h, feat_w = featmap_size + shift_x = torch.arange(0, feat_w, device=device) * stride + shift_y = torch.arange(0, feat_h, device=device) * stride + shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) + shifts = torch.stack([shift_xx, shift_yy, shift_xx, shift_yy], dim=-1) + # first feat_w elements correspond to the first row of shifts + # add A 
anchors (1, A, 4) to K shifts (K, 1, 4) to get
+        # shifted anchors (K, A, 4), reshape to (K*A, 4)
+        base_anchors = self.base_anchors.to(device)
+        all_anchors = base_anchors[None, :, :] + shifts[:, None, :]
+        all_anchors = all_anchors.view(-1, 4)
+        # first A rows correspond to A anchors of (0, 0) in feature map,
+        # then (0, 1), (0, 2), ...
+        return all_anchors
+
+    def valid_flags(self, featmap_size, valid_size, device='cuda'):
+        feat_h, feat_w = featmap_size
+        valid_h, valid_w = valid_size
+        assert valid_h <= feat_h and valid_w <= feat_w
+        valid_x = torch.zeros(feat_w, dtype=torch.uint8, device=device)
+        valid_y = torch.zeros(feat_h, dtype=torch.uint8, device=device)
+        valid_x[:valid_w] = 1
+        valid_y[:valid_h] = 1
+        valid_xx, valid_yy = self._meshgrid(valid_x, valid_y)
+        valid = valid_xx & valid_yy
+        valid = valid[:, None].expand(
+            valid.size(0), self.num_base_anchors).contiguous().view(-1)
+        return valid
diff --git a/mmdet/core/bbox_ops/__init__.py b/mmdet/core/bbox_ops/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9aeb74a5db787f687bacf0147ae1e2b1054bf
--- /dev/null
+++ b/mmdet/core/bbox_ops/__init__.py
@@ -0,0 +1,12 @@
+from .geometry import bbox_overlaps
+from .sampling import (random_choice, bbox_assign, bbox_assign_via_overlaps,
+                       bbox_sampling, sample_positives, sample_negatives)
+from .transforms import (bbox_transform, bbox_transform_inv, bbox_flip,
+                         bbox_mapping, bbox_mapping_back, bbox2roi, roi2bbox)
+
+__all__ = [
+    'bbox_overlaps', 'random_choice', 'bbox_assign',
+    'bbox_assign_via_overlaps', 'bbox_sampling', 'sample_positives',
+    'sample_negatives', 'bbox_transform', 'bbox_transform_inv', 'bbox_flip',
+    'bbox_mapping', 'bbox_mapping_back', 'bbox2roi', 'roi2bbox'
+]
diff --git a/mmdet/core/bbox_ops/geometry.py b/mmdet/core/bbox_ops/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..a852a06fb0c216569cf5f32385c356114c534904
--- /dev/null
+++ b/mmdet/core/bbox_ops/geometry.py
@@ -0,0 +1,63 @@
+import torch
+
+
+def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False):
+    """Calculate overlaps between two sets of bboxes.
+
+    If ``is_aligned`` is ``False``, then calculate the ious between each bbox
+    of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+    bboxes1 and bboxes2.
+
+    Args:
+        bboxes1 (Tensor): shape (m, 4)
+        bboxes2 (Tensor): shape (n, 4); if is_aligned is ``True``, then m and
+            n must be equal.
+        mode (str): "iou" (intersection over union) or "iof" (intersection
+            over foreground).
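+
+    Example (illustrative; note the inclusive "+1" pixel convention):
+        >>> bboxes1 = torch.Tensor([[0., 0., 10., 10.]])
+        >>> bboxes2 = torch.Tensor([[0., 0., 20., 20.]])
+        >>> bbox_overlaps(bboxes1, bboxes2)  # 121 / 441, about 0.274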
+
+    Returns:
+        ious(Tensor): shape (m, n) if is_aligned == False else shape (m, 1)
+    """
+
+    assert mode in ['iou', 'iof']
+
+    rows = bboxes1.size(0)
+    cols = bboxes2.size(0)
+    if is_aligned:
+        assert rows == cols
+
+    if rows * cols == 0:
+        return bboxes1.new(rows, 1) if is_aligned else bboxes1.new(rows, cols)
+
+    if is_aligned:
+        lt = torch.max(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
+        rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]
+
+        wh = (rb - lt + 1).clamp(min=0)  # [rows, 2]
+        overlap = wh[:, 0] * wh[:, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + 1)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + 1)
+            ious = overlap / (area1 + area2 - overlap)
+        else:
+            ious = overlap / area1
+    else:
+        lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2])  # [rows, cols, 2]
+        rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:])  # [rows, cols, 2]
+
+        wh = (rb - lt + 1).clamp(min=0)  # [rows, cols, 2]
+        overlap = wh[:, :, 0] * wh[:, :, 1]
+        area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * (
+            bboxes1[:, 3] - bboxes1[:, 1] + 1)
+
+        if mode == 'iou':
+            area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * (
+                bboxes2[:, 3] - bboxes2[:, 1] + 1)
+            ious = overlap / (area1[:, None] + area2 - overlap)
+        else:
+            ious = overlap / (area1[:, None])
+
+    return ious
diff --git a/mmdet/core/bbox_ops/sampling.py b/mmdet/core/bbox_ops/sampling.py
new file mode 100644
index 0000000000000000000000000000000000000000..9825e3bd15ec87dc6bc9c31be4b2f11422fcda13
--- /dev/null
+++ b/mmdet/core/bbox_ops/sampling.py
@@ -0,0 +1,255 @@
+import numpy as np
+import torch
+
+from .geometry import bbox_overlaps
+
+
+def random_choice(gallery, num):
+    assert len(gallery) >= num
+    if isinstance(gallery, list):
+        gallery = np.array(gallery)
+    cands = np.arange(len(gallery))
+    np.random.shuffle(cands)
+    rand_inds = cands[:num]
+    if not isinstance(gallery, np.ndarray):
+        rand_inds = torch.from_numpy(rand_inds).long()
+        if gallery.is_cuda:
+            rand_inds = rand_inds.cuda(gallery.get_device())
+    return gallery[rand_inds]
+
+
+def bbox_assign(proposals,
+                gt_bboxes,
+                gt_crowd_bboxes=None,
+                gt_labels=None,
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=.0,
+                crowd_thr=-1):
+    """Assign a corresponding gt bbox or background to each proposal/anchor.
+    This function assigns a gt bbox to every proposal; each proposal will be
+    assigned -1, 0, or a positive number. -1 means don't care, 0 means
+    negative sample, and a positive number is the 1-based index of the assigned gt.
+ If gt_crowd_bboxes is not None, proposals which have iof(intersection over foreground) + with crowd bboxes over crowd_thr will be ignored + Args: + proposals(Tensor): proposals or RPN anchors, shape (n, 4) + gt_bboxes(Tensor): shape (k, 4) + gt_crowd_bboxes(Tensor): shape(m, 4) + gt_labels(Tensor, optional): shape (k, ) + pos_iou_thr(float): iou threshold for positive bboxes + neg_iou_thr(float or tuple): iou threshold for negative bboxes + min_pos_iou(float): minimum iou for a bbox to be considered as a positive bbox, + for RPN, it is usually set as 0, for Fast R-CNN, + it is usually set as pos_iou_thr + crowd_thr: ignore proposals which have iof(intersection over foreground) with + crowd bboxes over crowd_thr + Returns: + tuple: (assigned_gt_inds, argmax_overlaps, max_overlaps), shape (n, ) + """ + + # calculate overlaps between the proposals and the gt boxes + overlaps = bbox_overlaps(proposals, gt_bboxes) + if overlaps.numel() == 0: + raise ValueError('No gt bbox or proposals') + + # ignore proposals according to crowd bboxes + if (crowd_thr > 0) and (gt_crowd_bboxes is + not None) and (gt_crowd_bboxes.numel() > 0): + crowd_overlaps = bbox_overlaps(proposals, gt_crowd_bboxes, mode='iof') + crowd_max_overlaps, _ = crowd_overlaps.max(dim=1) + crowd_bboxes_inds = torch.nonzero( + crowd_max_overlaps > crowd_thr).long() + if crowd_bboxes_inds.numel() > 0: + overlaps[crowd_bboxes_inds, :] = -1 + + return bbox_assign_via_overlaps(overlaps, gt_labels, pos_iou_thr, + neg_iou_thr, min_pos_iou) + + +def bbox_assign_via_overlaps(overlaps, + gt_labels=None, + pos_iou_thr=0.5, + neg_iou_thr=0.5, + min_pos_iou=.0): + """Assign a corresponding gt bbox or background to each proposal/anchor + This function assign a gt bbox to every proposal, each proposals will be + assigned with -1, 0, or a positive number. -1 means don't care, 0 means + negative sample, positive number is the index (1-based) of assigned gt. + The assignment is done in following steps, the order matters: + 1. assign every anchor to -1 + 2. assign proposals whose iou with all gts < neg_iou_thr to 0 + 3. for each anchor, if the iou with its nearest gt >= pos_iou_thr, + assign it to that bbox + 4. for each gt bbox, assign its nearest proposals(may be more than one) + to itself + Args: + overlaps(Tensor): overlaps between n proposals and k gt_bboxes, shape(n, k) + gt_labels(Tensor, optional): shape (k, ) + pos_iou_thr(float): iou threshold for positive bboxes + neg_iou_thr(float or tuple): iou threshold for negative bboxes + min_pos_iou(float): minimum iou for a bbox to be considered as a positive bbox, + for RPN, it is usually set as 0, for Fast R-CNN, + it is usually set as pos_iou_thr + Returns: + tuple: (assigned_gt_inds, argmax_overlaps, max_overlaps), shape (n, ) + """ + num_bboxes, num_gts = overlaps.size(0), overlaps.size(1) + # 1. assign -1 by default + assigned_gt_inds = overlaps.new(num_bboxes).long().fill_(-1) + + if overlaps.numel() == 0: + raise ValueError('No gt bbox or proposals') + + assert overlaps.size() == (num_bboxes, num_gts) + # for each anchor, which gt best overlaps with it + # for each anchor, the max iou of all gts + max_overlaps, argmax_overlaps = overlaps.max(dim=1) + # for each gt, which anchor best overlaps with it + # for each gt, the max iou of all proposals + gt_max_overlaps, gt_argmax_overlaps = overlaps.max(dim=0) + + # 2. 
assign negative: below + if isinstance(neg_iou_thr, float): + assigned_gt_inds[(max_overlaps >= 0) + & (max_overlaps < neg_iou_thr)] = 0 + elif isinstance(neg_iou_thr, tuple): + assert len(neg_iou_thr) == 2 + assigned_gt_inds[(max_overlaps >= neg_iou_thr[0]) + & (max_overlaps < neg_iou_thr[1])] = 0 + + # 3. assign positive: above positive IoU threshold + pos_inds = max_overlaps >= pos_iou_thr + assigned_gt_inds[pos_inds] = argmax_overlaps[pos_inds] + 1 + + # 4. assign fg: for each gt, proposals with highest IoU + for i in range(num_gts): + if gt_max_overlaps[i] >= min_pos_iou: + assigned_gt_inds[overlaps[:, i] == gt_max_overlaps[i]] = i + 1 + + if gt_labels is None: + return assigned_gt_inds, argmax_overlaps, max_overlaps + else: + assigned_labels = assigned_gt_inds.new(num_bboxes).fill_(0) + pos_inds = torch.nonzero(assigned_gt_inds > 0).squeeze() + if pos_inds.numel() > 0: + assigned_labels[pos_inds] = gt_labels[assigned_gt_inds[pos_inds] - + 1] + return assigned_gt_inds, assigned_labels, argmax_overlaps, max_overlaps + + +def sample_positives(assigned_gt_inds, num_expected, balance_sampling=True): + """Balance sampling for positive bboxes/anchors + 1. calculate average positive num for each gt: num_per_gt + 2. sample at most num_per_gt positives for each gt + 3. random sampling from rest anchors if not enough fg + """ + pos_inds = torch.nonzero(assigned_gt_inds > 0) + if pos_inds.numel() != 0: + pos_inds = pos_inds.squeeze(1) + if pos_inds.numel() <= num_expected: + return pos_inds + elif not balance_sampling: + return random_choice(pos_inds, num_expected) + else: + unique_gt_inds = torch.unique(assigned_gt_inds[pos_inds].cpu()) + num_gts = len(unique_gt_inds) + num_per_gt = int(round(num_expected / float(num_gts)) + 1) + sampled_inds = [] + for i in unique_gt_inds: + inds = torch.nonzero(assigned_gt_inds == i.item()) + if inds.numel() != 0: + inds = inds.squeeze(1) + else: + continue + if len(inds) > num_per_gt: + inds = random_choice(inds, num_per_gt) + sampled_inds.append(inds) + sampled_inds = torch.cat(sampled_inds) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array( + list(set(pos_inds.cpu()) - set(sampled_inds.cpu()))) + if len(extra_inds) > num_extra: + extra_inds = random_choice(extra_inds, num_extra) + extra_inds = torch.from_numpy(extra_inds).to( + assigned_gt_inds.device).long() + sampled_inds = torch.cat([sampled_inds, extra_inds]) + elif len(sampled_inds) > num_expected: + sampled_inds = random_choice(sampled_inds, num_expected) + return sampled_inds + + +def sample_negatives(assigned_gt_inds, + num_expected, + max_overlaps=None, + balance_thr=0, + hard_fraction=0.5): + """Balance sampling for negative bboxes/anchors + negative samples are split into 2 set: hard(balance_thr <= iou < neg_iou_thr) + and easy(iou < balance_thr), around equal number of bg are sampled + from each set. 
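+
+    Example (illustrative; the IoU values are made up):
+        >>> assigned = torch.LongTensor([0, 0, 0, 1, 0])
+        >>> overlaps = torch.Tensor([0.10, 0.20, 0.40, 0.70, 0.05])
+        >>> # one hard negative (0.40 >= balance_thr) plus one easy negative
+        >>> sample_negatives(assigned, 2, overlaps, balance_thr=0.3).numel()
+        2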
+ """ + neg_inds = torch.nonzero(assigned_gt_inds == 0) + if neg_inds.numel() != 0: + neg_inds = neg_inds.squeeze(1) + if len(neg_inds) <= num_expected: + return neg_inds + elif balance_thr <= 0: + # uniform sampling among all negative samples + return random_choice(neg_inds, num_expected) + else: + assert max_overlaps is not None + max_overlaps = max_overlaps.cpu().numpy() + # balance sampling for negative samples + neg_set = set(neg_inds.cpu().numpy()) + easy_set = set( + np.where( + np.logical_and(max_overlaps >= 0, + max_overlaps < balance_thr))[0]) + hard_set = set(np.where(max_overlaps >= balance_thr)[0]) + easy_neg_inds = list(easy_set & neg_set) + hard_neg_inds = list(hard_set & neg_set) + + num_expected_hard = int(num_expected * hard_fraction) + if len(hard_neg_inds) > num_expected_hard: + sampled_hard_inds = random_choice(hard_neg_inds, num_expected_hard) + else: + sampled_hard_inds = np.array(hard_neg_inds, dtype=np.int) + num_expected_easy = num_expected - len(sampled_hard_inds) + if len(easy_neg_inds) > num_expected_easy: + sampled_easy_inds = random_choice(easy_neg_inds, num_expected_easy) + else: + sampled_easy_inds = np.array(easy_neg_inds, dtype=np.int) + sampled_inds = np.concatenate((sampled_easy_inds, sampled_hard_inds)) + if len(sampled_inds) < num_expected: + num_extra = num_expected - len(sampled_inds) + extra_inds = np.array(list(neg_set - set(sampled_inds))) + if len(extra_inds) > num_extra: + extra_inds = random_choice(extra_inds, num_extra) + sampled_inds = np.concatenate((sampled_inds, extra_inds)) + sampled_inds = torch.from_numpy(sampled_inds).long().to( + assigned_gt_inds.device) + return sampled_inds + + +def bbox_sampling(assigned_gt_inds, + num_expected, + pos_fraction, + neg_pos_ub, + pos_balance_sampling=True, + max_overlaps=None, + neg_balance_thr=0, + neg_hard_fraction=0.5): + num_expected_pos = int(num_expected * pos_fraction) + pos_inds = sample_positives(assigned_gt_inds, num_expected_pos, + pos_balance_sampling) + num_sampled_pos = pos_inds.numel() + num_neg_max = int( + neg_pos_ub * + num_sampled_pos) if num_sampled_pos > 0 else int(neg_pos_ub) + num_expected_neg = min(num_neg_max, num_expected - num_sampled_pos) + neg_inds = sample_negatives(assigned_gt_inds, num_expected_neg, + max_overlaps, neg_balance_thr, + neg_hard_fraction) + return pos_inds, neg_inds diff --git a/mmdet/core/bbox_ops/transforms.py b/mmdet/core/bbox_ops/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..6f83a1dc56efdc214fe96c60b9a587a1cb81602b --- /dev/null +++ b/mmdet/core/bbox_ops/transforms.py @@ -0,0 +1,128 @@ +import mmcv +import numpy as np +import torch + + +def bbox_transform(proposals, gt, means=[0, 0, 0, 0], stds=[1, 1, 1, 1]): + assert proposals.size() == gt.size() + + proposals = proposals.float() + gt = gt.float() + px = (proposals[..., 0] + proposals[..., 2]) * 0.5 + py = (proposals[..., 1] + proposals[..., 3]) * 0.5 + pw = proposals[..., 2] - proposals[..., 0] + 1.0 + ph = proposals[..., 3] - proposals[..., 1] + 1.0 + + gx = (gt[..., 0] + gt[..., 2]) * 0.5 + gy = (gt[..., 1] + gt[..., 3]) * 0.5 + gw = gt[..., 2] - gt[..., 0] + 1.0 + gh = gt[..., 3] - gt[..., 1] + 1.0 + + dx = (gx - px) / pw + dy = (gy - py) / ph + dw = torch.log(gw / pw) + dh = torch.log(gh / ph) + deltas = torch.stack([dx, dy, dw, dh], dim=-1) + + means = deltas.new_tensor(means).unsqueeze(0) + stds = deltas.new_tensor(stds).unsqueeze(0) + deltas = deltas.sub_(means).div_(stds) + + return deltas + + +def bbox_transform_inv(rois, + deltas, + means=[0, 0, 0, 0], 
+ stds=[1, 1, 1, 1], + max_shape=None, + wh_ratio_clip=16 / 1000): + means = deltas.new_tensor(means).repeat(1, deltas.size(1) // 4) + stds = deltas.new_tensor(stds).repeat(1, deltas.size(1) // 4) + denorm_deltas = deltas * stds + means + dx = denorm_deltas[:, 0::4] + dy = denorm_deltas[:, 1::4] + dw = denorm_deltas[:, 2::4] + dh = denorm_deltas[:, 3::4] + max_ratio = np.abs(np.log(wh_ratio_clip)) + dw = dw.clamp(min=-max_ratio, max=max_ratio) + dh = dh.clamp(min=-max_ratio, max=max_ratio) + px = ((rois[:, 0] + rois[:, 2]) * 0.5).unsqueeze(1).expand_as(dx) + py = ((rois[:, 1] + rois[:, 3]) * 0.5).unsqueeze(1).expand_as(dy) + pw = (rois[:, 2] - rois[:, 0] + 1.0).unsqueeze(1).expand_as(dw) + ph = (rois[:, 3] - rois[:, 1] + 1.0).unsqueeze(1).expand_as(dh) + gw = pw * dw.exp() + gh = ph * dh.exp() + gx = torch.addcmul(px, 1, pw, dx) # gx = px + pw * dx + gy = torch.addcmul(py, 1, ph, dy) # gy = py + ph * dy + x1 = gx - gw * 0.5 + 0.5 + y1 = gy - gh * 0.5 + 0.5 + x2 = gx + gw * 0.5 - 0.5 + y2 = gy + gh * 0.5 - 0.5 + if max_shape is not None: + x1 = x1.clamp(min=0, max=max_shape[1] - 1) + y1 = y1.clamp(min=0, max=max_shape[0] - 1) + x2 = x2.clamp(min=0, max=max_shape[1] - 1) + y2 = y2.clamp(min=0, max=max_shape[0] - 1) + bboxes = torch.stack([x1, y1, x2, y2], dim=-1).view_as(deltas) + return bboxes + + +def bbox_flip(bboxes, img_shape): + """Flip bboxes horizontally + Args: + bboxes(Tensor): shape (..., 4*k) + img_shape(Tensor): image shape + """ + if isinstance(bboxes, torch.Tensor): + assert bboxes.shape[-1] % 4 == 0 + flipped = bboxes.clone() + flipped[:, 0::4] = img_shape[1] - bboxes[:, 2::4] - 1 + flipped[:, 2::4] = img_shape[1] - bboxes[:, 0::4] - 1 + return flipped + elif isinstance(bboxes, np.ndarray): + return mmcv.bbox_flip(bboxes, img_shape) + + +def bbox_mapping(bboxes, img_shape, flip): + """Map bboxes from the original image scale to testing scale""" + new_bboxes = bboxes * img_shape[-1] + if flip: + new_bboxes = bbox_flip(new_bboxes, img_shape) + return new_bboxes + + +def bbox_mapping_back(bboxes, img_shape, flip): + """Map bboxes from testing scale to original image scale""" + new_bboxes = bbox_flip(bboxes, img_shape) if flip else bboxes + new_bboxes = new_bboxes / img_shape[-1] + return new_bboxes + + +def bbox2roi(bbox_list): + """Convert a list of bboxes to roi format. 
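+
+    Example (illustrative):
+        >>> bbox_list = [torch.Tensor([[10., 10., 20., 20.]]),
+        ...              torch.Tensor([[30., 30., 40., 40.]])]
+        >>> bbox2roi(bbox_list)  # [[0, 10, 10, 20, 20], [1, 30, 30, 40, 40]]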
+ Args: + bbox_list (Tensor): a list of bboxes corresponding to a list of images + Returns: + Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2] + """ + rois_list = [] + for img_id, bboxes in enumerate(bbox_list): + if bboxes.size(0) > 0: + img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) + rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1) + else: + rois = bboxes.new_zeros((0, 5)) + rois_list.append(rois) + rois = torch.cat(rois_list, 0) + return rois + + +def roi2bbox(rois): + bbox_list = [] + img_ids = torch.unique(rois[:, 0].cpu(), sorted=True) + for img_id in img_ids: + inds = (rois[:, 0] == img_id.item()) + bbox = rois[inds, 1:] + bbox_list.append(bbox) + return bbox_list diff --git a/mmdet/core/eval/__init__.py b/mmdet/core/eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4893a0af68ffff2633fcd702f7cf73cce93e76 --- /dev/null +++ b/mmdet/core/eval/__init__.py @@ -0,0 +1,13 @@ +from .class_names import (voc_classes, imagenet_det_classes, + imagenet_vid_classes, coco_classes, dataset_aliases, + get_classes) +from .mean_ap import average_precision, eval_map, print_map_summary +from .recall import (eval_recalls, print_recall_summary, plot_num_recall, + plot_iou_recall) + +__all__ = [ + 'voc_classes', 'imagenet_det_classes', 'imagenet_vid_classes', + 'coco_classes', 'dataset_aliases', 'get_classes', 'average_precision', + 'eval_map', 'print_map_summary', 'eval_recalls', 'print_recall_summary', + 'plot_num_recall', 'plot_iou_recall' +] diff --git a/mmdet/core/eval/bbox_overlaps.py b/mmdet/core/eval/bbox_overlaps.py new file mode 100644 index 0000000000000000000000000000000000000000..ad4c70523fdaa5d89a2b80ada559e1822d0ecd22 --- /dev/null +++ b/mmdet/core/eval/bbox_overlaps.py @@ -0,0 +1,49 @@ +import numpy as np + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou'): + """Calculate the ious between each bbox of bboxes1 and bboxes2. 
+ + Args: + bboxes1(ndarray): shape (n, 4) + bboxes2(ndarray): shape (k, 4) + mode(str): iou (intersection over union) or iof (intersection + over foreground) + + Returns: + ious(ndarray): shape (n, k) + """ + + assert mode in ['iou', 'iof'] + + bboxes1 = bboxes1.astype(np.float32) + bboxes2 = bboxes2.astype(np.float32) + rows = bboxes1.shape[0] + cols = bboxes2.shape[0] + ious = np.zeros((rows, cols), dtype=np.float32) + if rows * cols == 0: + return ious + exchange = False + if bboxes1.shape[0] > bboxes2.shape[0]: + bboxes1, bboxes2 = bboxes2, bboxes1 + ious = np.zeros((cols, rows), dtype=np.float32) + exchange = True + area1 = (bboxes1[:, 2] - bboxes1[:, 0] + 1) * ( + bboxes1[:, 3] - bboxes1[:, 1] + 1) + area2 = (bboxes2[:, 2] - bboxes2[:, 0] + 1) * ( + bboxes2[:, 3] - bboxes2[:, 1] + 1) + for i in range(bboxes1.shape[0]): + x_start = np.maximum(bboxes1[i, 0], bboxes2[:, 0]) + y_start = np.maximum(bboxes1[i, 1], bboxes2[:, 1]) + x_end = np.minimum(bboxes1[i, 2], bboxes2[:, 2]) + y_end = np.minimum(bboxes1[i, 3], bboxes2[:, 3]) + overlap = np.maximum(x_end - x_start + 1, 0) * np.maximum( + y_end - y_start + 1, 0) + if mode == 'iou': + union = area1[i] + area2 - overlap + else: + union = area1[i] if not exchange else area2 + ious[i, :] = overlap / union + if exchange: + ious = ious.T + return ious diff --git a/mmdet/core/eval/class_names.py b/mmdet/core/eval/class_names.py new file mode 100644 index 0000000000000000000000000000000000000000..b68e9135dca366e93217e0c06959bea990ffda5e --- /dev/null +++ b/mmdet/core/eval/class_names.py @@ -0,0 +1,103 @@ +import mmcv + + +def voc_classes(): + return [ + 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', + 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', + 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor' + ] + + +def imagenet_det_classes(): + return [ + 'accordion', 'airplane', 'ant', 'antelope', 'apple', 'armadillo', + 'artichoke', 'axe', 'baby_bed', 'backpack', 'bagel', 'balance_beam', + 'banana', 'band_aid', 'banjo', 'baseball', 'basketball', 'bathing_cap', + 'beaker', 'bear', 'bee', 'bell_pepper', 'bench', 'bicycle', 'binder', + 'bird', 'bookshelf', 'bow_tie', 'bow', 'bowl', 'brassiere', 'burrito', + 'bus', 'butterfly', 'camel', 'can_opener', 'car', 'cart', 'cattle', + 'cello', 'centipede', 'chain_saw', 'chair', 'chime', 'cocktail_shaker', + 'coffee_maker', 'computer_keyboard', 'computer_mouse', 'corkscrew', + 'cream', 'croquet_ball', 'crutch', 'cucumber', 'cup_or_mug', 'diaper', + 'digital_clock', 'dishwasher', 'dog', 'domestic_cat', 'dragonfly', + 'drum', 'dumbbell', 'electric_fan', 'elephant', 'face_powder', 'fig', + 'filing_cabinet', 'flower_pot', 'flute', 'fox', 'french_horn', 'frog', + 'frying_pan', 'giant_panda', 'goldfish', 'golf_ball', 'golfcart', + 'guacamole', 'guitar', 'hair_dryer', 'hair_spray', 'hamburger', + 'hammer', 'hamster', 'harmonica', 'harp', 'hat_with_a_wide_brim', + 'head_cabbage', 'helmet', 'hippopotamus', 'horizontal_bar', 'horse', + 'hotdog', 'iPod', 'isopod', 'jellyfish', 'koala_bear', 'ladle', + 'ladybug', 'lamp', 'laptop', 'lemon', 'lion', 'lipstick', 'lizard', + 'lobster', 'maillot', 'maraca', 'microphone', 'microwave', 'milk_can', + 'miniskirt', 'monkey', 'motorcycle', 'mushroom', 'nail', 'neck_brace', + 'oboe', 'orange', 'otter', 'pencil_box', 'pencil_sharpener', 'perfume', + 'person', 'piano', 'pineapple', 'ping-pong_ball', 'pitcher', 'pizza', + 'plastic_bag', 'plate_rack', 'pomegranate', 'popsicle', 'porcupine', + 'power_drill', 'pretzel', 'printer', 'puck', 
'punching_bag', 'purse',
+        'rabbit', 'racket', 'ray', 'red_panda', 'refrigerator',
+        'remote_control', 'rubber_eraser', 'rugby_ball', 'ruler',
+        'salt_or_pepper_shaker', 'saxophone', 'scorpion', 'screwdriver',
+        'seal', 'sheep', 'ski', 'skunk', 'snail', 'snake', 'snowmobile',
+        'snowplow', 'soap_dispenser', 'soccer_ball', 'sofa', 'spatula',
+        'squirrel', 'starfish', 'stethoscope', 'stove', 'strainer',
+        'strawberry', 'stretcher', 'sunglasses', 'swimming_trunks', 'swine',
+        'syringe', 'table', 'tape_player', 'tennis_ball', 'tick', 'tie',
+        'tiger', 'toaster', 'traffic_light', 'train', 'trombone', 'trumpet',
+        'turtle', 'tv_or_monitor', 'unicycle', 'vacuum', 'violin',
+        'volleyball', 'waffle_iron', 'washer', 'water_bottle', 'watercraft',
+        'whale', 'wine_bottle', 'zebra'
+    ]
+
+
+def imagenet_vid_classes():
+    return [
+        'airplane', 'antelope', 'bear', 'bicycle', 'bird', 'bus', 'car',
+        'cattle', 'dog', 'domestic_cat', 'elephant', 'fox', 'giant_panda',
+        'hamster', 'horse', 'lion', 'lizard', 'monkey', 'motorcycle', 'rabbit',
+        'red_panda', 'sheep', 'snake', 'squirrel', 'tiger', 'train', 'turtle',
+        'watercraft', 'whale', 'zebra'
+    ]
+
+
+def coco_classes():
+    return [
+        'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train',
+        'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign',
+        'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep',
+        'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella',
+        'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
+        'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
+        'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork',
+        'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange',
+        'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair',
+        'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv',
+        'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
+        'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase',
+        'scissors', 'teddy bear', 'hair drier', 'toothbrush'
+    ]
+
+
+dataset_aliases = {
+    'voc': ['voc', 'pascal_voc', 'voc07', 'voc12'],
+    'imagenet_det': ['det', 'imagenet_det', 'ilsvrc_det'],
+    'imagenet_vid': ['vid', 'imagenet_vid', 'ilsvrc_vid'],
+    'coco': ['coco', 'mscoco', 'ms_coco']
+}
+
+
+def get_classes(dataset):
+    """Get class names of a dataset."""
+    alias2name = {}
+    for name, aliases in dataset_aliases.items():
+        for alias in aliases:
+            alias2name[alias] = name
+
+    if mmcv.is_str(dataset):
+        if dataset in alias2name:
+            labels = eval(alias2name[dataset] + '_classes()')
+        else:
+            raise ValueError('Unrecognized dataset: {}'.format(dataset))
+    else:
+        raise TypeError('dataset must be a str, but got {}'.format(type(dataset)))
+    return labels
diff --git a/mmdet/core/eval/mean_ap.py b/mmdet/core/eval/mean_ap.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a33f7640409993db3e11cedd587f1cd14c38aa5
--- /dev/null
+++ b/mmdet/core/eval/mean_ap.py
@@ -0,0 +1,372 @@
+import numpy as np
+from terminaltables import AsciiTable
+
+from .bbox_overlaps import bbox_overlaps
+from .class_names import get_classes
+
+
+def average_precision(recalls, precisions, mode='area'):
+    """Calculate average precision (for single or multiple scales).
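+
+    Example (illustrative numbers, 'area' mode):
+        >>> recalls = np.array([0.2, 0.5, 1.0])
+        >>> precisions = np.array([1.0, 0.8, 0.5])
+        >>> # area under the interpolated PR curve:
+        >>> # 0.2 * 1.0 + 0.3 * 0.8 + 0.5 * 0.5 = 0.69
+        >>> average_precision(recalls, precisions)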
+ + Args: + recalls(ndarray): shape (num_scales, num_dets) or (num_dets, ) + precisions(ndarray): shape (num_scales, num_dets) or (num_dets, ) + mode(str): 'area' or '11points', 'area' means calculating the area + under precision-recall curve, '11points' means calculating + the average precision of recalls at [0, 0.1, ..., 1] + + Returns: + float or ndarray: calculated average precision + """ + no_scale = False + if recalls.ndim == 1: + no_scale = True + recalls = recalls[np.newaxis, :] + precisions = precisions[np.newaxis, :] + assert recalls.shape == precisions.shape and recalls.ndim == 2 + num_scales = recalls.shape[0] + ap = np.zeros(num_scales, dtype=np.float32) + if mode == 'area': + zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) + ones = np.ones((num_scales, 1), dtype=recalls.dtype) + mrec = np.hstack((zeros, recalls, ones)) + mpre = np.hstack((zeros, precisions, zeros)) + for i in range(mpre.shape[1] - 1, 0, -1): + mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) + for i in range(num_scales): + ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] + ap[i] = np.sum( + (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) + elif mode == '11points': + for i in range(num_scales): + for thr in np.arange(0, 1 + 1e-3, 0.1): + precs = precisions[i, recalls[i, :] >= thr] + prec = precs.max() if precs.size > 0 else 0 + ap[i] += prec + ap /= 11 + else: + raise ValueError( + 'Unrecognized mode, only "area" and "11points" are supported') + if no_scale: + ap = ap[0] + return ap + + +def tpfp_imagenet(det_bboxes, + gt_bboxes, + gt_ignore, + default_iou_thr, + area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox(ndarray): the detected bbox + gt_bboxes(ndarray): ground truth bboxes of this image + gt_ignore(ndarray): indicate if gts are ignored for evaluation or not + default_iou_thr(float): the iou thresholds for medium and large bboxes + area_ranges(list or None): gt bbox area ranges + + Returns: + tuple: two arrays (tp, fp) whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp + # of a certain scale. + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + 1) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes - 1) + gt_w = gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1 + gt_h = gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1 + iou_thrs = np.minimum((gt_w * gt_h) / ((gt_w + 10.0) * (gt_h + 10.0)), + default_iou_thr) + # sort all detections by scores in descending order + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = gt_w * gt_h + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + max_iou = -1 + matched_gt = -1 + # find best overlapped available gt + for j in range(num_gts): + # different from PASCAL VOC: allow finding other gts if the + # best overlaped ones are already matched by other det bboxes + if gt_covered[j]: + continue + elif ious[i, j] >= iou_thrs[j] and ious[i, j] > max_iou: + max_iou = ious[i, j] + matched_gt = j + # there are 4 cases for a det bbox: + # 1. this det bbox matches a gt, tp = 1, fp = 0 + # 2. this det bbox matches an ignored gt, tp = 0, fp = 0 + # 3. this det bbox matches no gt and within area range, tp = 0, fp = 1 + # 4. this det bbox matches no gt but is beyond area range, tp = 0, fp = 0 + if matched_gt >= 0: + gt_covered[matched_gt] = 1 + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + tp[k, i] = 1 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def tpfp_default(det_bboxes, gt_bboxes, gt_ignore, iou_thr, area_ranges=None): + """Check if detected bboxes are true positive or false positive. + + Args: + det_bbox(ndarray): the detected bbox + gt_bboxes(ndarray): ground truth bboxes of this image + gt_ignore(ndarray): indicate if gts are ignored for evaluation or not + iou_thr(float): the iou thresholds + + Returns: + tuple: (tp, fp), two arrays whose elements are 0 and 1 + """ + num_dets = det_bboxes.shape[0] + num_gts = gt_bboxes.shape[0] + if area_ranges is None: + area_ranges = [(None, None)] + num_scales = len(area_ranges) + # tp and fp are of shape (num_scales, num_gts), each row is tp or fp of + # a certain scale + tp = np.zeros((num_scales, num_dets), dtype=np.float32) + fp = np.zeros((num_scales, num_dets), dtype=np.float32) + # if there is no gt bboxes in this image, then all det bboxes + # within area range are false positives + if gt_bboxes.shape[0] == 0: + if area_ranges == [(None, None)]: + fp[...] 
= 1 + else: + det_areas = (det_bboxes[:, 2] - det_bboxes[:, 0] + 1) * ( + det_bboxes[:, 3] - det_bboxes[:, 1] + 1) + for i, (min_area, max_area) in enumerate(area_ranges): + fp[i, (det_areas >= min_area) & (det_areas < max_area)] = 1 + return tp, fp + ious = bbox_overlaps(det_bboxes, gt_bboxes) + ious_max = ious.max(axis=1) + ious_argmax = ious.argmax(axis=1) + sort_inds = np.argsort(-det_bboxes[:, -1]) + for k, (min_area, max_area) in enumerate(area_ranges): + gt_covered = np.zeros(num_gts, dtype=bool) + # if no area range is specified, gt_area_ignore is all False + if min_area is None: + gt_area_ignore = np.zeros_like(gt_ignore, dtype=bool) + else: + gt_areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) * ( + gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1) + gt_area_ignore = (gt_areas < min_area) | (gt_areas >= max_area) + for i in sort_inds: + if ious_max[i] >= iou_thr: + matched_gt = ious_argmax[i] + if not (gt_ignore[matched_gt] or gt_area_ignore[matched_gt]): + if not gt_covered[matched_gt]: + gt_covered[matched_gt] = True + tp[k, i] = 1 + else: + fp[k, i] = 1 + # otherwise ignore this detected bbox, tp = 0, fp = 0 + elif min_area is None: + fp[k, i] = 1 + else: + bbox = det_bboxes[i, :4] + area = (bbox[2] - bbox[0] + 1) * (bbox[3] - bbox[1] + 1) + if area >= min_area and area < max_area: + fp[k, i] = 1 + return tp, fp + + +def get_cls_results(det_results, gt_bboxes, gt_labels, gt_ignore, class_id): + """Get det results and gt information of a certain class.""" + cls_dets = [det[class_id] + for det in det_results] # det bboxes of this class + cls_gts = [] # gt bboxes of this class + cls_gt_ignore = [] + for j in range(len(gt_bboxes)): + gt_bbox = gt_bboxes[j] + cls_inds = (gt_labels[j] == class_id + 1) + cls_gt = gt_bbox[cls_inds, :] if gt_bbox.shape[0] > 0 else gt_bbox + cls_gts.append(cls_gt) + if gt_ignore is None: + cls_gt_ignore.append(np.zeros(cls_gt.shape[0], dtype=np.int32)) + else: + cls_gt_ignore.append(gt_ignore[j][cls_inds]) + return cls_dets, cls_gts, cls_gt_ignore + + +def eval_map(det_results, + gt_bboxes, + gt_labels, + gt_ignore=None, + scale_ranges=None, + iou_thr=0.5, + dataset=None, + print_summary=True): + """Evaluate mAP of a dataset. + + Args: + det_results(list): a list of list, [[cls1_det, cls2_det, ...], ...] + gt_bboxes(list): ground truth bboxes of each image, a list of K*4 array + gt_labels(list): ground truth labels of each image, a list of K array + gt_ignore(list): gt ignore indicators of each image, a list of K array + scale_ranges(list, optional): [(min1, max1), (min2, max2), ...] + iou_thr(float): IoU threshold + dataset(None or str): dataset name, there are minor differences in + metrics for different datsets, e.g. "voc07", "imagenet_det", etc. 
+ print_summary(bool): whether to print the mAP summary + + Returns: + tuple: (mAP, [dict, dict, ...]) + """ + assert len(det_results) == len(gt_bboxes) == len(gt_labels) + if gt_ignore is not None: + assert len(gt_ignore) == len(gt_labels) + for i in range(len(gt_ignore)): + assert len(gt_labels[i]) == len(gt_ignore[i]) + area_ranges = ([(rg[0]**2, rg[1]**2) for rg in scale_ranges] + if scale_ranges is not None else None) + num_scales = len(scale_ranges) if scale_ranges is not None else 1 + eval_results = [] + num_classes = len(det_results[0]) # positive class num + gt_labels = [ + label if label.ndim == 1 else label[:, 0] for label in gt_labels + ] + for i in range(num_classes): + # get gt and det bboxes of this class + cls_dets, cls_gts, cls_gt_ignore = get_cls_results( + det_results, gt_bboxes, gt_labels, gt_ignore, i) + # calculate tp and fp for each image + tpfp_func = (tpfp_imagenet + if dataset in ['det', 'vid'] else tpfp_default) + tpfp = [ + tpfp_func(cls_dets[j], cls_gts[j], cls_gt_ignore[j], iou_thr, + area_ranges) for j in range(len(cls_dets)) + ] + tp, fp = tuple(zip(*tpfp)) + # calculate gt number of each scale, gts ignored or beyond scale are not counted + num_gts = np.zeros(num_scales, dtype=int) + for j, bbox in enumerate(cls_gts): + if area_ranges is None: + num_gts[0] += np.sum(np.logical_not(cls_gt_ignore[j])) + else: + gt_areas = (bbox[:, 2] - bbox[:, 0] + 1) * ( + bbox[:, 3] - bbox[:, 1] + 1) + for k, (min_area, max_area) in enumerate(area_ranges): + num_gts[k] += np.sum( + np.logical_not(cls_gt_ignore[j]) & + (gt_areas >= min_area) & (gt_areas < max_area)) + # sort all det bboxes by score, also sort tp and fp + cls_dets = np.vstack(cls_dets) + num_dets = cls_dets.shape[0] + sort_inds = np.argsort(-cls_dets[:, -1]) + tp = np.hstack(tp)[:, sort_inds] + fp = np.hstack(fp)[:, sort_inds] + # calculate recall and precision with tp and fp + tp = np.cumsum(tp, axis=1) + fp = np.cumsum(fp, axis=1) + eps = np.finfo(np.float32).eps + recalls = tp / np.maximum(num_gts[:, np.newaxis], eps) + precisions = tp / np.maximum((tp + fp), eps) + # calculate AP + if scale_ranges is None: + recalls = recalls[0, :] + precisions = precisions[0, :] + num_gts = num_gts.item() + mode = 'area' if dataset != 'voc07' else '11points' + ap = average_precision(recalls, precisions, mode) + eval_results.append({ + 'num_gts': num_gts, + 'num_dets': num_dets, + 'recall': recalls, + 'precision': precisions, + 'ap': ap + }) + if scale_ranges is not None: + # shape (num_classes, num_scales) + all_ap = np.vstack([cls_result['ap'] for cls_result in eval_results]) + all_num_gts = np.vstack( + [cls_result['num_gts'] for cls_result in eval_results]) + mean_ap = [ + all_ap[all_num_gts[:, i] > 0, i].mean() + if np.any(all_num_gts[:, i] > 0) else 0.0 + for i in range(num_scales) + ] + else: + aps = [] + for cls_result in eval_results: + if cls_result['num_gts'] > 0: + aps.append(cls_result['ap']) + mean_ap = np.array(aps).mean().item() if aps else 0.0 + if print_summary: + print_map_summary(mean_ap, eval_results, dataset) + + return mean_ap, eval_results + + +def print_map_summary(mean_ap, results, dataset=None): + """Print mAP and results of each class. + + Args: + mean_ap(float): calculated from `eval_map` + results(list): calculated from `eval_map` + dataset(None or str or list): dataset name. 
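+
+    Example (hypothetical usage; det_results, gt_bboxes and gt_labels are
+    placeholders for real data):
+        >>> mean_ap, cls_results = eval_map(
+        ...     det_results, gt_bboxes, gt_labels, print_summary=False)
+        >>> print_map_summary(mean_ap, cls_results, dataset='voc')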
+ """ + num_scales = len(results[0]['ap']) if isinstance(results[0]['ap'], + np.ndarray) else 1 + num_classes = len(results) + + recalls = np.zeros((num_scales, num_classes), dtype=np.float32) + precisions = np.zeros((num_scales, num_classes), dtype=np.float32) + aps = np.zeros((num_scales, num_classes), dtype=np.float32) + num_gts = np.zeros((num_scales, num_classes), dtype=int) + for i, cls_result in enumerate(results): + if cls_result['recall'].size > 0: + recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] + precisions[:, i] = np.array( + cls_result['precision'], ndmin=2)[:, -1] + aps[:, i] = cls_result['ap'] + num_gts[:, i] = cls_result['num_gts'] + + if dataset is None: + label_names = [str(i) for i in range(1, num_classes + 1)] + else: + label_names = get_classes(dataset) + + if not isinstance(mean_ap, list): + mean_ap = [mean_ap] + header = ['class', 'gts', 'dets', 'recall', 'precision', 'ap'] + for i in range(num_scales): + table_data = [header] + for j in range(num_classes): + row_data = [ + label_names[j], num_gts[i, j], results[j]['num_dets'], + '{:.3f}'.format(recalls[i, j]), '{:.3f}'.format( + precisions[i, j]), '{:.3f}'.format(aps[i, j]) + ] + table_data.append(row_data) + table_data.append(['mAP', '', '', '', '', '{:.3f}'.format(mean_ap[i])]) + table = AsciiTable(table_data) + table.inner_footing_row_border = True + print(table.table) diff --git a/mmdet/core/eval/recall.py b/mmdet/core/eval/recall.py new file mode 100644 index 0000000000000000000000000000000000000000..2a56f42fdef33341d4b9ec7a654832282b44a7c2 --- /dev/null +++ b/mmdet/core/eval/recall.py @@ -0,0 +1,185 @@ +import numpy as np +from terminaltables import AsciiTable + +from .bbox_overlaps import bbox_overlaps + + +def _recalls(all_ious, proposal_nums, thrs): + + img_num = all_ious.shape[0] + total_gt_num = sum([ious.shape[0] for ious in all_ious]) + + _ious = np.zeros((proposal_nums.size, total_gt_num), dtype=np.float32) + for k, proposal_num in enumerate(proposal_nums): + tmp_ious = np.zeros(0) + for i in range(img_num): + ious = all_ious[i][:, :proposal_num].copy() + gt_ious = np.zeros((ious.shape[0])) + if ious.size == 0: + tmp_ious = np.hstack((tmp_ious, gt_ious)) + continue + for j in range(ious.shape[0]): + gt_max_overlaps = ious.argmax(axis=1) + max_ious = ious[np.arange(0, ious.shape[0]), gt_max_overlaps] + gt_idx = max_ious.argmax() + gt_ious[j] = max_ious[gt_idx] + box_idx = gt_max_overlaps[gt_idx] + ious[gt_idx, :] = -1 + ious[:, box_idx] = -1 + tmp_ious = np.hstack((tmp_ious, gt_ious)) + _ious[k, :] = tmp_ious + + _ious = np.fliplr(np.sort(_ious, axis=1)) + recalls = np.zeros((proposal_nums.size, thrs.size)) + for i, thr in enumerate(thrs): + recalls[:, i] = (_ious >= thr).sum(axis=1) / float(total_gt_num) + + return recalls + + +def set_recall_param(proposal_nums, iou_thrs): + """Check proposal_nums and iou_thrs and set correct format. + """ + if isinstance(proposal_nums, list): + _proposal_nums = np.array(proposal_nums) + elif isinstance(proposal_nums, int): + _proposal_nums = np.array([proposal_nums]) + else: + _proposal_nums = proposal_nums + + if iou_thrs is None: + _iou_thrs = np.array([0.5]) + elif isinstance(iou_thrs, list): + _iou_thrs = np.array(iou_thrs) + elif isinstance(iou_thrs, float): + _iou_thrs = np.array([iou_thrs]) + else: + _iou_thrs = iou_thrs + + return _proposal_nums, _iou_thrs + + +def eval_recalls(gts, + proposals, + proposal_nums=None, + iou_thrs=None, + print_summary=True): + """Calculate recalls. 
+ + Args: + gts(list or ndarray): a list of arrays of shape (n, 4) + proposals(list or ndarray): a list of arrays of shape (k, 4) or (k, 5) + proposal_nums(int or list of int or ndarray): top N proposals + thrs(float or list or ndarray): iou thresholds + + Returns: + ndarray: recalls of different ious and proposal nums + """ + + img_num = len(gts) + assert img_num == len(proposals) + + proposal_nums, iou_thrs = set_recall_param(proposal_nums, iou_thrs) + + all_ious = [] + for i in range(img_num): + if proposals[i].ndim == 2 and proposals[i].shape[1] == 5: + scores = proposals[i][:, 4] + sort_idx = np.argsort(scores)[::-1] + img_proposal = proposals[i][sort_idx, :] + else: + img_proposal = proposals[i] + prop_num = min(img_proposal.shape[0], proposal_nums[-1]) + if gts[i] is None or gts[i].shape[0] == 0: + ious = np.zeros((0, img_proposal.shape[0]), dtype=np.float32) + else: + ious = bbox_overlaps(gts[i], img_proposal[:prop_num, :4]) + all_ious.append(ious) + all_ious = np.array(all_ious) + recalls = _recalls(all_ious, proposal_nums, iou_thrs) + if print_summary: + print_recall_summary(recalls, proposal_nums, iou_thrs) + return recalls + + +def print_recall_summary(recalls, + proposal_nums, + iou_thrs, + row_idxs=None, + col_idxs=None): + """Print recalls in a table. + + Args: + recalls(ndarray): calculated from `bbox_recalls` + proposal_nums(ndarray or list): top N proposals + iou_thrs(ndarray or list): iou thresholds + row_idxs(ndarray): which rows(proposal nums) to print + col_idxs(ndarray): which cols(iou thresholds) to print + """ + proposal_nums = np.array(proposal_nums, dtype=np.int32) + iou_thrs = np.array(iou_thrs) + if row_idxs is None: + row_idxs = np.arange(proposal_nums.size) + if col_idxs is None: + col_idxs = np.arange(iou_thrs.size) + row_header = [''] + iou_thrs[col_idxs].tolist() + table_data = [row_header] + for i, num in enumerate(proposal_nums[row_idxs]): + row = [ + '{:.3f}'.format(val) + for val in recalls[row_idxs[i], col_idxs].tolist() + ] + row.insert(0, num) + table_data.append(row) + table = AsciiTable(table_data) + print(table.table) + + +def plot_num_recall(recalls, proposal_nums): + """Plot Proposal_num-Recalls curve. + + Args: + recalls(ndarray or list): shape (k,) + proposal_nums(ndarray or list): same shape as `recalls` + """ + if isinstance(proposal_nums, np.ndarray): + _proposal_nums = proposal_nums.tolist() + else: + _proposal_nums = proposal_nums + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot([0] + _proposal_nums, [0] + _recalls) + plt.xlabel('Proposal num') + plt.ylabel('Recall') + plt.axis([0, proposal_nums.max(), 0, 1]) + f.show() + + +def plot_iou_recall(recalls, iou_thrs): + """Plot IoU-Recalls curve. 
+ + Args: + recalls(ndarray or list): shape (k,) + iou_thrs(ndarray or list): same shape as `recalls` + """ + if isinstance(iou_thrs, np.ndarray): + _iou_thrs = iou_thrs.tolist() + else: + _iou_thrs = iou_thrs + if isinstance(recalls, np.ndarray): + _recalls = recalls.tolist() + else: + _recalls = recalls + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(_iou_thrs + [1.0], _recalls + [0.]) + plt.xlabel('IoU') + plt.ylabel('Recall') + plt.axis([iou_thrs.min(), 1, 0, 1]) + f.show() diff --git a/mmdet/core/hooks.py b/mmdet/core/hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..3347639d51ac19d5072bcb0a2e76c7747d686c77 --- /dev/null +++ b/mmdet/core/hooks.py @@ -0,0 +1,246 @@ +import os +import os.path as osp +import shutil +import time + +import mmcv +import numpy as np +import torch +from mmcv.torchpack import Hook +from mmdet import collate, scatter +from pycocotools.cocoeval import COCOeval + +from .eval import eval_recalls + + +class EmptyCacheHook(Hook): + + def before_epoch(self, runner): + torch.cuda.empty_cache() + + def after_epoch(self, runner): + torch.cuda.empty_cache() + + +class DistEvalHook(Hook): + + def __init__(self, dataset, interval=1): + self.dataset = dataset + self.interval = interval + self.lock_dir = None + + def _barrier(self, rank, world_size): + """Due to some issues with `torch.distributed.barrier()`, we have to + implement this ugly barrier function. + """ + if rank == 0: + for i in range(1, world_size): + tmp = osp.join(self.lock_dir, '{}.pkl'.format(i)) + while not (osp.exists(tmp)): + time.sleep(1) + for i in range(1, world_size): + tmp = osp.join(self.lock_dir, '{}.pkl'.format(i)) + os.remove(tmp) + else: + tmp = osp.join(self.lock_dir, '{}.pkl'.format(rank)) + mmcv.dump([], tmp) + while osp.exists(tmp): + time.sleep(1) + + def before_run(self, runner): + self.lock_dir = osp.join(runner.work_dir, '.lock_map_hook') + if runner.rank == 0: + if osp.exists(self.lock_dir): + shutil.rmtree(self.lock_dir) + mmcv.mkdir_or_exist(self.lock_dir) + + def after_train_epoch(self, runner): + if not self.every_n_epochs(runner, self.interval): + return + runner.model.eval() + results = [None for _ in range(len(self.dataset))] + prog_bar = mmcv.ProgressBar(len(self.dataset)) + for idx in range(runner.rank, len(self.dataset), runner.world_size): + data = self.dataset[idx] + device_id = torch.cuda.current_device() + imgs_data = tuple( + scatter(collate([data], samples_per_gpu=1), [device_id])[0]) + + # compute output + with torch.no_grad(): + result = runner.model( + *imgs_data, + return_loss=False, + return_bboxes=True, + rescale=True) + results[idx] = result + + batch_size = runner.world_size + for _ in range(batch_size): + prog_bar.update() + + if runner.rank == 0: + print('\n') + self._barrier(runner.rank, runner.world_size) + for i in range(1, runner.world_size): + tmp_file = osp.join(runner.work_dir, 'temp_{}.pkl'.format(i)) + tmp_results = mmcv.load(tmp_file) + for idx in range(i, len(results), runner.world_size): + results[idx] = tmp_results[idx] + os.remove(tmp_file) + self.evaluate(runner, results) + else: + tmp_file = osp.join(runner.work_dir, + 'temp_{}.pkl'.format(runner.rank)) + mmcv.dump(results, tmp_file) + self._barrier(runner.rank, runner.world_size) + self._barrier(runner.rank, runner.world_size) + + def evaluate(self): + raise NotImplementedError + + +class CocoEvalMixin(object): + + def _xyxy2xywh(self, bbox): + _bbox = bbox.tolist() + return [ + _bbox[0], + _bbox[1], + _bbox[2] - _bbox[0] + 1, + _bbox[3] - _bbox[1] 
+ 1, + ] + + def det2json(self, dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + result = results[idx] + for label in range(len(result)): + bboxes = result[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self._xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + json_results.append(data) + return json_results + + def segm2json(self, dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + det, seg = results[idx] + for label in range(len(det)): + bboxes = det[label] + segms = seg[label] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self._xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = dataset.cat_ids[label] + segms[i]['counts'] = segms[i]['counts'].decode() + data['segmentation'] = segms[i] + json_results.append(data) + return json_results + + def proposal2json(self, dataset, results): + json_results = [] + for idx in range(len(dataset)): + img_id = dataset.img_ids[idx] + bboxes = results[idx] + for i in range(bboxes.shape[0]): + data = dict() + data['image_id'] = img_id + data['bbox'] = self._xyxy2xywh(bboxes[i]) + data['score'] = float(bboxes[i][4]) + data['category_id'] = 1 + json_results.append(data) + return json_results + + def results2json(self, dataset, results, out_file): + if isinstance(results[0], list): + json_results = self.det2json(dataset, results) + elif isinstance(results[0], tuple): + json_results = self.segm2json(dataset, results) + elif isinstance(results[0], np.ndarray): + json_results = self.proposal2json(dataset, results) + else: + raise TypeError('invalid type of results') + mmcv.dump(json_results, out_file, file_format='json') + + +class DistEvalRecallHook(DistEvalHook): + + def __init__(self, + dataset, + proposal_nums=(100, 300, 1000), + iou_thrs=np.arange(0.5, 0.96, 0.05)): + super(DistEvalRecallHook, self).__init__(dataset) + self.proposal_nums = np.array(proposal_nums, dtype=np.int32) + self.iou_thrs = np.array(iou_thrs, dtype=np.float32) + + def evaluate(self, runner, results): + # official coco evaluation is too slow, here we use our own + # implementation, which may get slightly different results + gt_bboxes = [] + for i in range(len(self.dataset)): + img_id = self.dataset.img_ids[i] + ann_ids = self.dataset.coco.getAnnIds(imgIds=img_id) + ann_info = self.dataset.coco.loadAnns(ann_ids) + if len(ann_info) == 0: + gt_bboxes.append(np.zeros((0, 4))) + continue + bboxes = [] + for ann in ann_info: + if ann.get('ignore', False) or ann['iscrowd']: + continue + x1, y1, w, h = ann['bbox'] + bboxes.append([x1, y1, x1 + w - 1, y1 + h - 1]) + bboxes = np.array(bboxes, dtype=np.float32) + if bboxes.shape[0] == 0: + bboxes = np.zeros((0, 4)) + gt_bboxes.append(bboxes) + + recalls = eval_recalls( + gt_bboxes, + results, + self.proposal_nums, + self.iou_thrs, + print_summary=False) + ar = recalls.mean(axis=1) + for i, num in enumerate(self.proposal_nums): + runner.log_buffer.output['AR@{}'.format(num)] = ar[i] + runner.log_buffer.ready = True + + +class CocoDistEvalmAPHook(DistEvalHook, CocoEvalMixin): + + def evaluate(self, runner, results): + tmp_file = osp.join(runner.work_dir, 'temp_0.json') + self.results2json(self.dataset, results, tmp_file) + + res_types = ['bbox', 'segm'] if runner.model.with_mask else ['bbox'] + cocoGt = self.dataset.coco + cocoDt = 
cocoGt.loadRes(tmp_file) + imgIds = cocoGt.getImgIds() + for res_type in res_types: + iou_type = res_type + cocoEval = COCOeval(cocoGt, cocoDt, iou_type) + cocoEval.params.imgIds = imgIds + cocoEval.evaluate() + cocoEval.accumulate() + cocoEval.summarize() + field = '{}_mAP'.format(res_type) + runner.log_buffer.output[field] = cocoEval.stats[0] + runner.log_buffer.ready = True + os.remove(tmp_file) + + +class CocoDistCascadeEvalmAPHook(CocoDistEvalmAPHook): + + def evaluate(self, runner, results): + results = [res[-1] for res in results] + super(CocoDistCascadeEvalmAPHook, self).evaluate(runner, results) diff --git a/mmdet/core/mask_ops/__init__.py b/mmdet/core/mask_ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..25850cdc62ae69271f3788288d960b86ef179452 --- /dev/null +++ b/mmdet/core/mask_ops/__init__.py @@ -0,0 +1,10 @@ +from .segms import (flip_segms, polys_to_mask, mask_to_bbox, + polys_to_mask_wrt_box, polys_to_boxes, rle_mask_voting, + rle_mask_nms, rle_masks_to_boxes) +from .utils import split_combined_gt_polys + +__all__ = [ + 'flip_segms', 'polys_to_mask', 'mask_to_bbox', 'polys_to_mask_wrt_box', + 'polys_to_boxes', 'rle_mask_voting', 'rle_mask_nms', 'rle_masks_to_boxes', + 'split_combined_gt_polys' +] diff --git a/mmdet/core/mask_ops/segms.py b/mmdet/core/mask_ops/segms.py new file mode 100644 index 0000000000000000000000000000000000000000..b2ae6b69a1ff206b085799fa82527e1d17be0a4f --- /dev/null +++ b/mmdet/core/mask_ops/segms.py @@ -0,0 +1,271 @@ +# This file is copied from Detectron. + +# Copyright (c) 2017-present, Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +############################################################################## +"""Functions for interacting with segmentation masks in the COCO format. +The following terms are used in this module + mask: a binary mask encoded as a 2D numpy array + segm: a segmentation mask in one of the two COCO formats (polygon or RLE) + polygon: COCO's polygon format + RLE: COCO's run length encoding format +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import numpy as np +import pycocotools.mask as mask_util + + +def flip_segms(segms, height, width): + """Left/right flip each mask in a list of masks.""" + + def _flip_poly(poly, width): + flipped_poly = np.array(poly) + flipped_poly[0::2] = width - np.array(poly[0::2]) - 1 + return flipped_poly.tolist() + + def _flip_rle(rle, height, width): + if 'counts' in rle and type(rle['counts']) == list: + # Magic RLE format handling painfully discovered by looking at the + # COCO API showAnns function. 
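+            # frPyObjects converts the uncompressed (list-style) counts
+            # into a compressed RLE that mask_util.decode can consume.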
+ rle = mask_util.frPyObjects([rle], height, width) + mask = mask_util.decode(rle) + mask = mask[:, ::-1, :] + rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) + return rle + + flipped_segms = [] + for segm in segms: + if type(segm) == list: + # Polygon format + flipped_segms.append([_flip_poly(poly, width) for poly in segm]) + else: + # RLE format + assert type(segm) == dict + flipped_segms.append(_flip_rle(segm, height, width)) + return flipped_segms + + +def polys_to_mask(polygons, height, width): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed inside a height x width image. The resulting + mask is therefore of shape (height, width). + """ + rle = mask_util.frPyObjects(polygons, height, width) + mask = np.array(mask_util.decode(rle), dtype=np.float32) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=2) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def mask_to_bbox(mask): + """Compute the tight bounding box of a binary mask.""" + xs = np.where(np.sum(mask, axis=0) > 0)[0] + ys = np.where(np.sum(mask, axis=1) > 0)[0] + + if len(xs) == 0 or len(ys) == 0: + return None + + x0 = xs[0] + x1 = xs[-1] + y0 = ys[0] + y1 = ys[-1] + return np.array((x0, y0, x1, y1), dtype=np.float32) + + +def polys_to_mask_wrt_box(polygons, box, M): + """Convert from the COCO polygon segmentation format to a binary mask + encoded as a 2D array of data type numpy.float32. The polygon segmentation + is understood to be enclosed in the given box and rasterized to an M x M + mask. The resulting mask is therefore of shape (M, M). + """ + w = box[2] - box[0] + h = box[3] - box[1] + + w = np.maximum(w, 1) + h = np.maximum(h, 1) + + polygons_norm = [] + for poly in polygons: + p = np.array(poly, dtype=np.float32) + p[0::2] = (p[0::2] - box[0]) * M / w + p[1::2] = (p[1::2] - box[1]) * M / h + polygons_norm.append(p) + + rle = mask_util.frPyObjects(polygons_norm, M, M) + mask = np.array(mask_util.decode(rle), dtype=np.float32) + # Flatten in case polygons was a list + mask = np.sum(mask, axis=2) + mask = np.array(mask > 0, dtype=np.float32) + return mask + + +def polys_to_boxes(polys): + """Convert a list of polygons into an array of tight bounding boxes.""" + boxes_from_polys = np.zeros((len(polys), 4), dtype=np.float32) + for i in range(len(polys)): + poly = polys[i] + x0 = min(min(p[::2]) for p in poly) + x1 = max(max(p[::2]) for p in poly) + y0 = min(min(p[1::2]) for p in poly) + y1 = max(max(p[1::2]) for p in poly) + boxes_from_polys[i, :] = [x0, y0, x1, y1] + + return boxes_from_polys + + +def rle_mask_voting(top_masks, + all_masks, + all_dets, + iou_thresh, + binarize_thresh, + method='AVG'): + """Returns new masks (in correspondence with `top_masks`) by combining + multiple overlapping masks coming from the pool of `all_masks`. Two methods + for combining masks are supported: 'AVG' uses a weighted average of + overlapping mask pixels; 'UNION' takes the union of all mask pixels. 
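As a toy illustration of the two voting rules (2x2 masks and scores invented, not part of the source):

```python
import numpy as np

masks = np.stack([np.array([[1, 1], [0, 0]], dtype=np.float32),
                  np.array([[1, 0], [1, 0]], dtype=np.float32)])
weights = np.array([0.9, 0.6])  # detection scores acting as per-mask weights

# 'AVG': weighted soft mask, then binarize at a threshold
soft = np.average(masks, axis=0, weights=weights)
avg_mask = (soft > 0.5).astype(np.uint8)                  # [[1, 1], [0, 0]]

# 'UNION': any pixel that is on in some mask joins the result
union_mask = (masks.sum(axis=0) > 1e-5).astype(np.uint8)  # [[1, 1], [1, 0]]
```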
+ """ + if len(top_masks) == 0: + return + + all_not_crowd = [False] * len(all_masks) + top_to_all_overlaps = mask_util.iou(top_masks, all_masks, all_not_crowd) + decoded_all_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in all_masks + ] + decoded_top_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in top_masks + ] + all_boxes = all_dets[:, :4].astype(np.int32) + all_scores = all_dets[:, 4] + + # Fill box support with weights + mask_shape = decoded_all_masks[0].shape + mask_weights = np.zeros((len(all_masks), mask_shape[0], mask_shape[1])) + for k in range(len(all_masks)): + ref_box = all_boxes[k] + x_0 = max(ref_box[0], 0) + x_1 = min(ref_box[2] + 1, mask_shape[1]) + y_0 = max(ref_box[1], 0) + y_1 = min(ref_box[3] + 1, mask_shape[0]) + mask_weights[k, y_0:y_1, x_0:x_1] = all_scores[k] + mask_weights = np.maximum(mask_weights, 1e-5) + + top_segms_out = [] + for k in range(len(top_masks)): + # Corner case of empty mask + if decoded_top_masks[k].sum() == 0: + top_segms_out.append(top_masks[k]) + continue + + inds_to_vote = np.where(top_to_all_overlaps[k] >= iou_thresh)[0] + # Only matches itself + if len(inds_to_vote) == 1: + top_segms_out.append(top_masks[k]) + continue + + masks_to_vote = [decoded_all_masks[i] for i in inds_to_vote] + if method == 'AVG': + ws = mask_weights[inds_to_vote] + soft_mask = np.average(masks_to_vote, axis=0, weights=ws) + mask = np.array(soft_mask > binarize_thresh, dtype=np.uint8) + elif method == 'UNION': + # Any pixel that's on joins the mask + soft_mask = np.sum(masks_to_vote, axis=0) + mask = np.array(soft_mask > 1e-5, dtype=np.uint8) + else: + raise NotImplementedError('Method {} is unknown'.format(method)) + rle = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] + top_segms_out.append(rle) + + return top_segms_out + + +def rle_mask_nms(masks, dets, thresh, mode='IOU'): + """Performs greedy non-maximum suppression based on an overlap measurement + between masks. The type of measurement is determined by `mode` and can be + either 'IOU' (standard intersection over union) or 'IOMA' (intersection over + mininum area). + """ + if len(masks) == 0: + return [] + if len(masks) == 1: + return [0] + + if mode == 'IOU': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(union(m1, m2)) + all_not_crowds = [False] * len(masks) + ious = mask_util.iou(masks, masks, all_not_crowds) + elif mode == 'IOMA': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / min(area(m1), area(m2)) + all_crowds = [True] * len(masks) + # ious[m1, m2] = area(intersect(m1, m2)) / area(m2) + ious = mask_util.iou(masks, masks, all_crowds) + # ... 
= max(area(intersect(m1, m2)) / area(m2), + # area(intersect(m2, m1)) / area(m1)) + ious = np.maximum(ious, ious.transpose()) + elif mode == 'CONTAINMENT': + # Computes ious[m1, m2] = area(intersect(m1, m2)) / area(m2) + # Which measures how much m2 is contained inside m1 + all_crowds = [True] * len(masks) + ious = mask_util.iou(masks, masks, all_crowds) + else: + raise NotImplementedError('Mode {} is unknown'.format(mode)) + + scores = dets[:, 4] + order = np.argsort(-scores) + + keep = [] + while order.size > 0: + i = order[0] + keep.append(i) + ovr = ious[i, order[1:]] + inds_to_keep = np.where(ovr <= thresh)[0] + order = order[inds_to_keep + 1] + + return keep + + +def rle_masks_to_boxes(masks): + """Computes the bounding box of each mask in a list of RLE encoded masks.""" + if len(masks) == 0: + return [] + + decoded_masks = [ + np.array(mask_util.decode(rle), dtype=np.float32) for rle in masks + ] + + def get_bounds(flat_mask): + inds = np.where(flat_mask > 0)[0] + return inds.min(), inds.max() + + boxes = np.zeros((len(decoded_masks), 4)) + keep = [True] * len(decoded_masks) + for i, mask in enumerate(decoded_masks): + if mask.sum() == 0: + keep[i] = False + continue + flat_mask = mask.sum(axis=0) + x0, x1 = get_bounds(flat_mask) + flat_mask = mask.sum(axis=1) + y0, y1 = get_bounds(flat_mask) + boxes[i, :] = (x0, y0, x1, y1) + + return boxes, np.where(keep)[0] diff --git a/mmdet/core/mask_ops/utils.py b/mmdet/core/mask_ops/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2802430007e7b239bcb18ba20a26c0609c62245c --- /dev/null +++ b/mmdet/core/mask_ops/utils.py @@ -0,0 +1,35 @@ +import cvbase as cvb +import numpy as np +import pycocotools.mask as mask_utils + +import mmcv + + +def split_combined_gt_polys(gt_polys, gt_poly_lens, num_polys_per_mask): + """Split the combined 1-D polys into masks. + + A mask is represented as a list of polys, and a poly is represented as + a 1-D array. In dataset, all masks are concatenated into a single 1-D + tensor. Here we need to split the tensor into original representations. 
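With hypothetical numbers, the two-level split described here looks like this (the document itself performs both splits with `mmcv.slice_list` below):

```python
import mmcv

flat_polys = list(range(18))   # all coordinates of one image, concatenated
poly_lens = [6, 6, 6]          # length of each poly (>= 6, i.e. >= 3 points)
polys_per_mask = [2, 1]        # first mask owns 2 polys, second owns 1

polys = mmcv.slice_list(flat_polys, poly_lens)
# -> [[0..5], [6..11], [12..17]]
masks = mmcv.slice_list(polys, polys_per_mask)
# -> [[[0..5], [6..11]], [[12..17]]]
```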
+ + Args: + gt_polys (list): a list (length = image num) of 1-D tensors + gt_poly_lens (list): a list (length = image num) of poly length + num_polys_per_mask (list): a list (length = image num) of poly number + of each mask + + Returns: + list: a list (length = image num) of list (length = mask num) of + list (length = poly num) of numpy array + """ + mask_polys_list = [] + for img_id in range(len(gt_polys)): + gt_polys_single = gt_polys[img_id].cpu().numpy() + gt_polys_lens_single = gt_poly_lens[img_id].cpu().numpy().tolist() + num_polys_per_mask_single = num_polys_per_mask[ + img_id].cpu().numpy().tolist() + + split_gt_polys = mmcv.slice_list(gt_polys_single, gt_polys_lens_single) + mask_polys = mmcv.slice_list(split_gt_polys, num_polys_per_mask_single) + mask_polys_list.append(mask_polys) + return mask_polys_list diff --git a/mmdet/core/post_processing/__init__.py b/mmdet/core/post_processing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1b24a3fc68525de1c73d687404990bd521bdf5b0 --- /dev/null +++ b/mmdet/core/post_processing/__init__.py @@ -0,0 +1,8 @@ +from .bbox_nms import multiclass_nms +from .merge_augs import (merge_aug_proposals, merge_aug_bboxes, + merge_aug_scores, merge_aug_masks) + +__all__ = [ + 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', + 'merge_aug_scores', 'merge_aug_masks' +] diff --git a/mmdet/core/post_processing/bbox_nms.py b/mmdet/core/post_processing/bbox_nms.py new file mode 100644 index 0000000000000000000000000000000000000000..f619d2682a035344c6fda6974cd03c5cbfeb0f26 --- /dev/null +++ b/mmdet/core/post_processing/bbox_nms.py @@ -0,0 +1,54 @@ +import torch + +from mmdet.ops import nms + + +def multiclass_nms(multi_bboxes, multi_scores, score_thr, nms_thr, max_num=-1): + """NMS for multi-class bboxes. + + Args: + multi_bboxes (Tensor): shape (n, #class*4) or (n, 4) + multi_scores (Tensor): shape (n, #class) + score_thr (float): bbox threshold, bboxes with scores lower than it + will not be considered. + nms_thr (float): NMS IoU threshold + max_num (int): if there are more than max_num bboxes after NMS, + only top max_num will be kept. + + Returns: + tuple: (bboxes, labels), tensors of shape (k, 5) and (k, 1). Labels + are 0-based. 
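A hedged usage sketch (tensor values invented; the import path assumes the `post_processing` package above, and the compiled `mmdet.ops` nms must be available):

```python
import torch
from mmdet.core.post_processing import multiclass_nms

# two class-agnostic boxes scored over 3 classes (class 0 is background)
boxes = torch.tensor([[0., 0., 10., 10.],
                      [20., 20., 30., 30.]])
scores = torch.tensor([[0.1, 0.8, 0.1],
                       [0.2, 0.1, 0.7]])
dets, labels = multiclass_nms(boxes, scores, score_thr=0.3, nms_thr=0.5)
# dets: (k, 5) rows of [x1, y1, x2, y2, score]
# labels: 0-based foreground classes, so 0 and 1 here
```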
+ """ + num_classes = multi_scores.shape[1] + bboxes, labels = [], [] + for i in range(1, num_classes): + cls_inds = multi_scores[:, i] > score_thr + if not cls_inds.any(): + continue + # get bboxes and scores of this class + if multi_bboxes.shape[1] == 4: + _bboxes = multi_bboxes[cls_inds, :] + else: + _bboxes = multi_bboxes[cls_inds, i * 4:(i + 1) * 4] + _scores = multi_scores[cls_inds, i] + cls_dets = torch.cat([_bboxes, _scores[:, None]], dim=1) + # perform nms + nms_keep = nms(cls_dets, nms_thr) + cls_dets = cls_dets[nms_keep, :] + cls_labels = multi_bboxes.new_full( + (len(nms_keep), ), i - 1, dtype=torch.long) + bboxes.append(cls_dets) + labels.append(cls_labels) + if bboxes: + bboxes = torch.cat(bboxes) + labels = torch.cat(labels) + if bboxes.shape[0] > max_num: + _, inds = bboxes[:, -1].sort(descending=True) + inds = inds[:max_num] + bboxes = bboxes[inds] + labels = labels[inds] + else: + bboxes = multi_bboxes.new_zeros((0, 5)) + labels = multi_bboxes.new_zeros((0, ), dtype=torch.long) + + return bboxes, labels diff --git a/mmdet/core/post_processing/merge_augs.py b/mmdet/core/post_processing/merge_augs.py new file mode 100644 index 0000000000000000000000000000000000000000..5d56e481e5aee2ce113cea7adcb11ebe0aaede5b --- /dev/null +++ b/mmdet/core/post_processing/merge_augs.py @@ -0,0 +1,96 @@ +import torch + +from mmcv.ops import nms +import numpy as np + +from ..bbox_ops import bbox_mapping_back + + +def merge_aug_proposals(aug_proposals, img_metas, rpn_test_cfg): + """Merge augmented proposals (multiscale, flip, etc.) + + Args: + aug_proposals (list[Tensor]): proposals from different testing + schemes, shape (n, 5). Note that they are not rescaled to the + original image size. + img_metas (list[dict]): image info including "shape_scale" and "flip". + rpn_test_cfg (dict): rpn test config. + + Returns: + Tensor: shape (n, 4), proposals corresponding to original image scale. + """ + recovered_proposals = [] + for proposals, img_info in zip(aug_proposals, img_metas): + shape_scale = img_info['shape_scale'][0] + flip = img_info['flip'][0] + _proposals = proposals.clone() + _proposals[:, :4] = bbox_mapping_back(_proposals[:, :4], shape_scale, + flip) + recovered_proposals.append(_proposals) + aug_proposals = torch.cat(recovered_proposals, dim=0) + nms_keep = nms(aug_proposals, rpn_test_cfg.nms_thr, + aug_proposals.get_device()) + merged_proposals = aug_proposals[nms_keep, :] + scores = merged_proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(rpn_test_cfg.max_num, merged_proposals.shape[0]) + order = order[:num] + merged_proposals = merged_proposals[order, :] + return merged_proposals + + +def merge_aug_bboxes(aug_bboxes, aug_scores, img_metas, rcnn_test_cfg): + """Merge augmented detection bboxes and scores. + + Args: + aug_bboxes (list[Tensor]): shape (n, 4*#class) + aug_scores (list[Tensor] or None): shape (n, #class) + img_shapes (list[Tensor]): shape (3, ). + rcnn_test_cfg (dict): rcnn test config. 
+
+ Returns:
+ tuple: (bboxes, scores), or bboxes alone if aug_scores is None
+ """
+ recovered_bboxes = []
+ for bboxes, img_info in zip(aug_bboxes, img_metas):
+ shape_scale = img_info['shape_scale'][0]
+ flip = img_info['flip'][0]
+ bboxes = bbox_mapping_back(bboxes, shape_scale, flip)
+ recovered_bboxes.append(bboxes)
+ bboxes = torch.stack(recovered_bboxes).mean(dim=0)
+ if aug_scores is None:
+ return bboxes
+ else:
+ scores = torch.stack(aug_scores).mean(dim=0)
+ return bboxes, scores
+
+
+def merge_aug_scores(aug_scores):
+ """Merge augmented bbox scores."""
+ if isinstance(aug_scores[0], torch.Tensor):
+ return torch.mean(torch.stack(aug_scores), dim=0)
+ else:
+ return np.mean(aug_scores, axis=0)
+
+
+def merge_aug_masks(aug_masks, bboxes, img_metas, rcnn_test_cfg, weights=None):
+ """Merge augmented mask prediction.
+
+ Args:
+ aug_masks (list[ndarray]): shape (n, #class, h, w)
+ img_metas (list[dict]): image info including "flip".
+ rcnn_test_cfg (dict): rcnn test config.
+
+ Returns:
+ ndarray: masks merged over augmentations, shape (n, #class, h, w)
+ """
+ recovered_masks = [
+ mask if not img_info['flip'][0] else mask[..., ::-1]
+ for mask, img_info in zip(aug_masks, img_metas)
+ ]
+ if weights is None:
+ merged_masks = np.mean(recovered_masks, axis=0)
+ else:
+ merged_masks = np.average(
+ np.array(recovered_masks), axis=0, weights=np.array(weights))
+ return merged_masks
diff --git a/mmdet/core/targets/__init__.py b/mmdet/core/targets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3b2567efff687ba503b8a37d9f096597a0c8780
--- /dev/null
+++ b/mmdet/core/targets/__init__.py
@@ -0,0 +1,5 @@
+from .anchor_target import anchor_target
+from .bbox_target import bbox_target
+from .mask_target import mask_target
+
+__all__ = ['anchor_target', 'bbox_target', 'mask_target']
diff --git a/mmdet/core/targets/anchor_target.py b/mmdet/core/targets/anchor_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec2389f90885da0c92f0598dc6d45f59c0ab6dac
--- /dev/null
+++ b/mmdet/core/targets/anchor_target.py
@@ -0,0 +1,2 @@
+def anchor_target():
+ pass
diff --git a/mmdet/core/targets/bbox_target.py b/mmdet/core/targets/bbox_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..49642c2298735b163b98ad832a3a6a9ee9941c45
--- /dev/null
+++ b/mmdet/core/targets/bbox_target.py
@@ -0,0 +1,2 @@
+def bbox_target():
+ pass
diff --git a/mmdet/core/targets/mask_target.py b/mmdet/core/targets/mask_target.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c330e13b81e8cb27e35a8705e2e89b00792ddaa
--- /dev/null
+++ b/mmdet/core/targets/mask_target.py
@@ -0,0 +1,2 @@
+def mask_target():
+ pass
diff --git a/mmdet/datasets/__init__.py b/mmdet/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6045c2b0923993243a999f0008b79443126d0e26
--- /dev/null
+++ b/mmdet/datasets/__init__.py
@@ -0,0 +1,4 @@
+from .coco import CocoDataset
+from .collate import *
+from .sampler import *
+from .transforms import *
diff --git a/mmdet/datasets/coco.py b/mmdet/datasets/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0705e79b6168c2ccf45610af3609013082ddb48
--- /dev/null
+++ b/mmdet/datasets/coco.py
@@ -0,0 +1,288 @@
+import os.path as osp
+
+import mmcv
+import numpy as np
+from pycocotools.coco import COCO
+from torch.utils.data import Dataset
+
+from .transforms import (ImageTransform, BboxTransform, PolyMaskTransform,
+ Numpy2Tensor)
+from .utils import show_ann, random_scale
+from .utils import DataContainer as DC
+
+
+def parse_ann_info(ann_info,
cat2label, with_mask=True): + """Parse bbox and mask annotation. + + Args: + ann_info (list[dict]): Annotation info of an image. + cat2label (dict): The mapping from category ids to labels. + with_mask (bool): Whether to parse mask annotations. + + Returns: + tuple: gt_bboxes, gt_labels and gt_mask_info + """ + gt_bboxes = [] + gt_labels = [] + gt_bboxes_ignore = [] + # each mask consists of one or several polys, each poly is a list of float. + if with_mask: + gt_mask_polys = [] + gt_poly_lens = [] + for i, ann in enumerate(ann_info): + if ann.get('ignore', False): + continue + x1, y1, w, h = ann['bbox'] + if ann['area'] <= 0 or w < 1 or h < 1: + continue + bbox = [x1, y1, x1 + w - 1, y1 + h - 1] + if ann['iscrowd']: + gt_bboxes_ignore.append(bbox) + else: + gt_bboxes.append(bbox) + gt_labels.append(cat2label[ann['category_id']]) + if with_mask: + # Note polys are not resized + mask_polys = [ + p for p in ann['segmentation'] if len(p) >= 6 + ] # valid polygons have >= 3 points (6 coordinates) + poly_lens = [len(p) for p in mask_polys] + gt_mask_polys.append(mask_polys) + gt_poly_lens.extend(poly_lens) + if gt_bboxes: + gt_bboxes = np.array(gt_bboxes, dtype=np.float32) + gt_labels = np.array(gt_labels, dtype=np.int64) + else: + gt_bboxes = np.zeros((0, 4), dtype=np.float32) + gt_labels = np.array([], dtype=np.int64) + + if gt_bboxes_ignore: + gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) + else: + gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) + + ann = dict( + bboxes=gt_bboxes, labels=gt_labels, bboxes_ignore=gt_bboxes_ignore) + + if with_mask: + ann['mask_polys'] = gt_mask_polys + ann['poly_lens'] = gt_poly_lens + return ann + + +class CocoDataset(Dataset): + + def __init__(self, + ann_file, + img_prefix, + img_scale, + img_norm_cfg, + size_divisor=None, + proposal_file=None, + num_max_proposals=1000, + flip_ratio=0, + with_mask=True, + with_crowd=True, + with_label=True, + test_mode=False, + debug=False): + # path of the data file + self.coco = COCO(ann_file) + # filter images with no annotation during training + if not test_mode: + self.img_ids, self.img_infos = self._filter_imgs() + else: + self.img_ids = self.coco.getImgIds() + self.img_infos = [ + self.coco.loadImgs(idx)[0] for idx in self.img_ids + ] + assert len(self.img_ids) == len(self.img_infos) + # get the mapping from original category ids to labels + self.cat_ids = self.coco.getCatIds() + self.cat2label = { + cat_id: i + 1 + for i, cat_id in enumerate(self.cat_ids) + } + # prefix of images path + self.img_prefix = img_prefix + # (long_edge, short_edge) or [(long1, short1), (long2, short2), ...] 
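A hypothetical instantiation showing the expected argument formats (paths and normalization values are placeholders, not from the source):

```python
train_dataset = CocoDataset(
    ann_file='data/coco/annotations/instances_train2017.json',
    img_prefix='data/coco/train2017/',
    img_scale=(1333, 800),      # one (long_edge, short_edge) tuple, or a list
    img_norm_cfg=dict(
        mean=(123.675, 116.28, 103.53),
        std=(58.395, 57.12, 57.375),
        to_rgb=True),
    size_divisor=32,            # pad so FPN strides divide the image size
    flip_ratio=0.5)
```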
+ self.img_scales = img_scale if isinstance(img_scale, + list) else [img_scale] + assert mmcv.is_list_of(self.img_scales, tuple) + # color channel order and normalize configs + self.img_norm_cfg = img_norm_cfg + # proposals + self.proposals = mmcv.load( + proposal_file) if proposal_file is not None else None + self.num_max_proposals = num_max_proposals + # flip ratio + self.flip_ratio = flip_ratio + assert flip_ratio >= 0 and flip_ratio <= 1 + # padding border to ensure the image size can be divided by + # size_divisor (used for FPN) + self.size_divisor = size_divisor + # with crowd or not, False when using RetinaNet + self.with_crowd = with_crowd + # with mask or not + self.with_mask = with_mask + # with label is False for RPN + self.with_label = with_label + # in test mode or not + self.test_mode = test_mode + # debug mode or not + self.debug = debug + + # set group flag for the sampler + self._set_group_flag() + # transforms + self.img_transform = ImageTransform( + size_divisor=self.size_divisor, **self.img_norm_cfg) + self.bbox_transform = BboxTransform() + self.mask_transform = PolyMaskTransform() + self.numpy2tensor = Numpy2Tensor() + + def __len__(self): + return len(self.img_ids) + + def _filter_imgs(self, min_size=32): + """Filter images too small or without ground truths.""" + img_ids = list(set([_['image_id'] for _ in self.coco.anns.values()])) + valid_ids = [] + img_infos = [] + for i in img_ids: + info = self.coco.loadImgs(i)[0] + if min(info['width'], info['height']) >= min_size: + valid_ids.append(i) + img_infos.append(info) + return valid_ids, img_infos + + def _load_ann_info(self, idx): + img_id = self.img_ids[idx] + ann_ids = self.coco.getAnnIds(imgIds=img_id) + ann_info = self.coco.loadAnns(ann_ids) + return ann_info + + def _set_group_flag(self): + """Set flag according to image aspect ratio. + + Images with aspect ratio greater than 1 will be set as group 1, + otherwise group 0. + """ + self.flag = np.zeros(len(self.img_ids), dtype=np.uint8) + for i in range(len(self.img_ids)): + img_info = self.img_infos[i] + if img_info['width'] / img_info['height'] > 1: + self.flag[i] = 1 + + def _rand_another(self, idx): + pool = np.where(self.flag == self.flag[idx])[0] + return np.random.choice(pool) + + def __getitem__(self, idx): + if self.test_mode: + return self.prepare_test_img(idx) + while True: + img_info = self.img_infos[idx] + ann_info = self._load_ann_info(idx) + + # load image + img = mmcv.imread(osp.join(self.img_prefix, img_info['file_name'])) + if self.debug: + show_ann(self.coco, img, ann_info) + + # load proposals if necessary + if self.proposals is not None: + proposals = self.proposals[idx][:self.num_max_proposals, :4] + # TODO: Handle empty proposals properly. Currently images with + # no proposals are just ignored, but they can be used for + # training in concept. 
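The aspect-ratio grouping set up in `_set_group_flag` above can be pictured with made-up image sizes; the samplers later batch images only within the same group:

```python
import numpy as np

widths = np.array([640, 480, 800])
heights = np.array([480, 640, 800])
flag = (widths / heights > 1).astype(np.uint8)  # array([1, 0, 0], dtype=uint8)
```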
+ if len(proposals) == 0: + idx = self._rand_another(idx) + continue + + ann = parse_ann_info(ann_info, self.cat2label, self.with_mask) + gt_bboxes = ann['bboxes'] + gt_labels = ann['labels'] + gt_bboxes_ignore = ann['bboxes_ignore'] + # skip the image if there is no valid gt bbox + if len(gt_bboxes) == 0: + idx = self._rand_another(idx) + continue + + # apply transforms + flip = True if np.random.rand() < self.flip_ratio else False + img_scale = random_scale(self.img_scales) # sample a scale + img, img_shape, scale_factor = self.img_transform( + img, img_scale, flip) + if self.proposals is not None: + proposals = self.bbox_transform(proposals, img_shape, + scale_factor, flip) + gt_bboxes = self.bbox_transform(gt_bboxes, img_shape, scale_factor, + flip) + gt_bboxes_ignore = self.bbox_transform(gt_bboxes_ignore, img_shape, + scale_factor, flip) + + if self.with_mask: + gt_mask_polys, gt_poly_lens, num_polys_per_mask = \ + self.mask_transform( + ann['mask_polys'], ann['poly_lens'], + img_info['height'], img_info['width'], flip) + + ori_shape = (img_info['height'], img_info['width']) + img_meta = dict( + ori_shape=DC(ori_shape), + img_shape=DC(img_shape), + scale_factor=DC(scale_factor), + flip=DC(flip)) + + data = dict( + img=DC(img, stack=True), + img_meta=img_meta, + gt_bboxes=DC(gt_bboxes)) + if self.proposals is not None: + data['proposals'] = DC(proposals) + if self.with_label: + data['gt_labels'] = DC(gt_labels) + if self.with_crowd: + data['gt_bboxes_ignore'] = DC(gt_bboxes_ignore) + if self.with_mask: + data['gt_mask_polys'] = DC(gt_mask_polys) + data['gt_poly_lens'] = DC(gt_poly_lens) + data['num_polys_per_mask'] = DC(num_polys_per_mask) + return data + + def prepare_test_img(self, idx): + """Prepare an image for testing (multi-scale and flipping)""" + img_info = self._load_info(idx, with_ann=False) + img_file = osp.join(self.prefix, img_info['file_name']) + proposal = (self.proposals[idx][:, :4] + if self.proposals is not None else None) + + def prepare_single(img_file, scale, flip, proposal=None): + img_np, shape_scale_np = self.img_transform(img_file, scale, flip) + img, shape_scale = self.numpy2tensor(img_np, shape_scale_np) + img_meta = dict(shape_scale=shape_scale, flip=flip) + if proposal is not None: + proposal = self.bbox_transform(proposal, shape_scale_np, flip) + proposal = self.numpy2tensor(proposal) + return img, img_meta, proposal + + imgs = [] + img_metas = [] + proposals = [] + for scale in self.img_scale: + img, img_meta, proposal = prepare_single(img_file, scale, False, + proposal) + imgs.append(img) + img_metas.append(img_meta) + proposals.append(proposal) + if self.flip_ratio > 0: + img, img_meta, prop = prepare_single(img_file, scale, True, + proposal) + imgs.append(img) + img_metas.append(img_meta) + proposals.append(prop) + if self.proposals is None: + return imgs, img_metas + else: + return imgs, img_metas, proposals diff --git a/mmdet/datasets/collate.py b/mmdet/datasets/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..44117d6f2d01d3aaa4c06996c2d8bf657e4a1ce5 --- /dev/null +++ b/mmdet/datasets/collate.py @@ -0,0 +1,57 @@ +import collections + +import torch +import torch.nn.functional as F +from torch.utils.data.dataloader import default_collate + +from .utils import DataContainer + +# https://github.com/pytorch/pytorch/issues/973 +import resource +rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) +resource.setrlimit(resource.RLIMIT_NOFILE, (4096, rlimit[1])) + +__all__ = ['collate'] + + +def collate(batch, samples_per_gpu=1): 
+
+ if not isinstance(batch, collections.Sequence):
+ raise TypeError("{} is not supported.".format(type(batch)))
+
+ if isinstance(batch[0], DataContainer):
+ assert len(batch) % samples_per_gpu == 0
+ stacked = []
+ if batch[0].stack:
+ for i in range(0, len(batch), samples_per_gpu):
+ assert isinstance(batch[i].data, torch.Tensor)
+ # TODO: handle tensors other than 3d
+ assert batch[i].dim() == 3
+ # pad each GPU group to its own max height/width
+ c, h, w = batch[i].size()
+ for sample in batch[i:i + samples_per_gpu]:
+ assert c == sample.size(0)
+ h = max(h, sample.size(1))
+ w = max(w, sample.size(2))
+ padded_samples = [
+ F.pad(
+ sample.data,
+ (0, w - sample.size(2), 0, h - sample.size(1)),
+ value=sample.padding_value)
+ for sample in batch[i:i + samples_per_gpu]
+ ]
+ stacked.append(default_collate(padded_samples))
+ else:
+ for i in range(0, len(batch), samples_per_gpu):
+ stacked.append(
+ [sample.data for sample in batch[i:i + samples_per_gpu]])
+ return DataContainer(stacked, batch[0].stack, batch[0].padding_value)
+ elif isinstance(batch[0], collections.Sequence):
+ transposed = zip(*batch)
+ return [collate(samples, samples_per_gpu) for samples in transposed]
+ elif isinstance(batch[0], collections.Mapping):
+ return {
+ key: collate([d[key] for d in batch], samples_per_gpu)
+ for key in batch[0]
+ }
+ else:
+ return default_collate(batch)
diff --git a/mmdet/datasets/sampler.py b/mmdet/datasets/sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..74089821bf17a7bdc6f1f728c0340e382adb3046
--- /dev/null
+++ b/mmdet/datasets/sampler.py
@@ -0,0 +1,134 @@
+from __future__ import division
+
+import math
+import torch
+import numpy as np
+
+from torch.distributed import get_world_size, get_rank
+from torch.utils.data.sampler import Sampler
+
+__all__ = ['GroupSampler', 'DistributedGroupSampler']
+
+
+class GroupSampler(Sampler):
+
+ def __init__(self, dataset, samples_per_gpu=1):
+ assert hasattr(dataset, 'flag')
+ self.dataset = dataset
+ self.samples_per_gpu = samples_per_gpu
+ self.flag = dataset.flag.astype(np.int64)
+ self.group_sizes = np.bincount(self.flag)
+ self.num_samples = 0
+ for i, size in enumerate(self.group_sizes):
+ self.num_samples += int(np.ceil(
+ size / self.samples_per_gpu)) * self.samples_per_gpu
+
+ def __iter__(self):
+ indices = []
+ for i, size in enumerate(self.group_sizes):
+ if size == 0:
+ continue
+ indice = np.where(self.flag == i)[0]
+ assert len(indice) == size
+ np.random.shuffle(indice)
+ num_extra = int(np.ceil(size / self.samples_per_gpu)
+ ) * self.samples_per_gpu - len(indice)
+ indice = np.concatenate([indice, indice[:num_extra]])
+ indices.append(indice)
+ indices = np.concatenate(indices)
+ indices = [
+ indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
+ for i in np.random.permutation(
+ range(len(indices) // self.samples_per_gpu))
+ ]
+ indices = np.concatenate(indices)
+ indices = torch.from_numpy(indices).long()
+ assert len(indices) == self.num_samples
+ return iter(indices)
+
+ def __len__(self):
+ return self.num_samples
+
+
+class DistributedGroupSampler(Sampler):
+ """Sampler that restricts data loading to a subset of the dataset.
+ It is especially useful in conjunction with
+ :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
+ process can pass a DistributedSampler instance as a DataLoader sampler,
+ and load a subset of the original dataset that is exclusive to it.
+ .. note::
+ Dataset is assumed to be of constant size.
+ Arguments:
+ dataset: Dataset used for sampling.
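The per-group padding arithmetic used by both samplers, with hypothetical group sizes:

```python
import numpy as np

group_sizes = np.array([5, 3])   # images with flag 0 and flag 1
samples_per_gpu = 4
padded = (np.ceil(group_sizes / samples_per_gpu) * samples_per_gpu).astype(int)
# padded -> array([8, 4]); each group is topped up by repeating its own
# indices so that no GPU batch ever mixes aspect-ratio groups
```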
+ num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, + dataset, + samples_per_gpu=1, + num_replicas=None, + rank=None): + if num_replicas is None: + num_replicas = get_world_size() + if rank is None: + rank = get_rank() + self.dataset = dataset + self.samples_per_gpu = samples_per_gpu + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + + assert hasattr(self.dataset, 'flag') + self.flag = self.dataset.flag + self.group_sizes = np.bincount(self.flag) + + self.num_samples = 0 + for i, j in enumerate(self.group_sizes): + self.num_samples += int( + math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / + self.num_replicas)) * self.samples_per_gpu + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + for i, size in enumerate(self.group_sizes): + if size > 0: + indice = np.where(self.flag == i)[0] + assert len(indice) == size + indice = indice[list(torch.randperm(int(size), + generator=g))].tolist() + extra = int( + math.ceil( + size * 1.0 / self.samples_per_gpu / self.num_replicas) + ) * self.samples_per_gpu * self.num_replicas - len(indice) + indice += indice[:extra] + indices += indice + + assert len(indices) == self.total_size + + indices = [ + indices[j] for i in list( + torch.randperm( + len(indices) // self.samples_per_gpu, generator=g)) + for j in range(i * self.samples_per_gpu, (i + 1) * + self.samples_per_gpu) + ] + + # subsample + offset = self.num_samples * self.rank + indices = indices[offset:offset + self.num_samples] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/mmdet/datasets/transforms.py b/mmdet/datasets/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..81f3a627d0d20a5890ea9c5f597e814ea373b9e5 --- /dev/null +++ b/mmdet/datasets/transforms.py @@ -0,0 +1,208 @@ +import mmcv +# import cvbase as cvb +import numpy as np +import torch + +from mmdet.core import segms + +__all__ = [ + 'ImageTransform', 'BboxTransform', 'PolyMaskTransform', 'Numpy2Tensor' +] + + +class ImageTransform(object): + """Preprocess an image + 1. rescale the image to expected size + 2. normalize the image + 3. flip the image (if needed) + 4. pad the image (if needed) + 5. 
transpose to (c, h, w) + """ + + def __init__(self, + mean=(0, 0, 0), + std=(1, 1, 1), + to_rgb=True, + size_divisor=None): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + self.size_divisor = size_divisor + + def __call__(self, img, scale, flip=False): + img, scale_factor = mmcv.imrescale(img, scale, True) + img_shape = img.shape + img = mmcv.imnorm(img, self.mean, self.std, self.to_rgb) + if flip: + img = mmcv.imflip(img) + if self.size_divisor is not None: + img = mmcv.impad_to_multiple(img, self.size_divisor) + img = img.transpose(2, 0, 1) + return img, img_shape, scale_factor + + # img, scale = cvb.resize_keep_ar(img_or_path, max_long_edge, + # max_short_edge, True) + # shape_scale = np.array(img.shape + (scale, ), dtype=np.float32) + # if flip: + # img = img[:, ::-1, :].copy() + # if self.color_order == 'RGB': + # img = cvb.bgr2rgb(img) + # img = img.astype(np.float32) + # img -= self.color_mean + # img /= self.color_std + # if self.size_divisor is None: + # padded_img = img + # else: + # pad_h = int(np.ceil( + # img.shape[0] / self.size_divisor)) * self.size_divisor + # pad_w = int(np.ceil( + # img.shape[1] / self.size_divisor)) * self.size_divisor + # padded_img = cvb.pad_img(img, (pad_h, pad_w), pad_val=0) + # padded_img = padded_img.transpose(2, 0, 1) + # return padded_img, shape_scale + + +class ImageCrop(object): + """crop image patches and resize patches into fixed size + 1. (read and) flip image (if needed) + 2. crop image patches according to given bboxes + 3. resize patches into fixed size (default 224x224) + 4. normalize the image (if needed) + 5. transpose to (c, h, w) (if needed) + """ + + def __init__(self, + normalize=True, + transpose=True, + color_order='RGB', + color_mean=(0, 0, 0), + color_std=(1, 1, 1)): + self.normalize = normalize + self.transpose = transpose + + assert color_order in ['RGB', 'BGR'] + self.color_order = color_order + self.color_mean = np.array(color_mean, dtype=np.float32) + self.color_std = np.array(color_std, dtype=np.float32) + + def __call__(self, + img_or_path, + bboxes, + crop_size, + scale_ratio=1.0, + flip=False): + img = cvb.read_img(img_or_path) + if flip: + img = img[:, ::-1, :].copy() + crop_imgs = cvb.crop_img( + img, + bboxes[:, :4], + scale_ratio=scale_ratio, + pad_fill=self.color_mean) + processed_crop_imgs_list = [] + for i in range(len(crop_imgs)): + crop_img = crop_imgs[i] + crop_img = cvb.resize(crop_img, crop_size) + crop_img = crop_img.astype(np.float32) + crop_img -= self.color_mean + crop_img /= self.color_std + processed_crop_imgs_list.append(crop_img) + processed_crop_imgs = np.stack(processed_crop_imgs_list, axis=0) + processed_crop_imgs = processed_crop_imgs.transpose(0, 3, 1, 2) + return processed_crop_imgs + + +class BboxTransform(object): + """Preprocess gt bboxes + 1. rescale bboxes according to image size + 2. flip bboxes (if needed) + 3. 
pad the first dimension to `max_num_gts` + """ + + def __init__(self, max_num_gts=None): + self.max_num_gts = max_num_gts + + def __call__(self, bboxes, img_shape, scale_factor, flip=False): + gt_bboxes = bboxes * scale_factor + if flip: + gt_bboxes = mmcv.bbox_flip(gt_bboxes, img_shape) + if self.max_num_gts is None: + return gt_bboxes + else: + num_gts = gt_bboxes.shape[0] + padded_bboxes = np.zeros((self.max_num_gts, 4), dtype=np.float32) + padded_bboxes[:num_gts, :] = gt_bboxes + return padded_bboxes + + +class PolyMaskTransform(object): + + def __init__(self): + pass + + def __call__(self, gt_mask_polys, gt_poly_lens, img_h, img_w, flip=False): + """ + Args: + gt_mask_polys(list): a list of masks, each mask is a list of polys, + each poly is a list of numbers + gt_poly_lens(list): a list of int, indicating the size of each poly + """ + if flip: + gt_mask_polys = segms.flip_segms(gt_mask_polys, img_h, img_w) + num_polys_per_mask = np.array( + [len(mask_polys) for mask_polys in gt_mask_polys], dtype=np.int64) + gt_poly_lens = np.array(gt_poly_lens, dtype=np.int64) + gt_mask_polys = [ + np.concatenate(mask_polys).astype(np.float32) + for mask_polys in gt_mask_polys + ] + gt_mask_polys = np.concatenate(gt_mask_polys) + return gt_mask_polys, gt_poly_lens, num_polys_per_mask + + +class MaskTransform(object): + """Preprocess masks + 1. resize masks to expected size and stack to a single array + 2. flip the masks (if needed) + 3. pad the masks (if needed) + """ + + def __init__(self, max_num_gts, pad_size=None): + self.max_num_gts = max_num_gts + self.pad_size = pad_size + + def __call__(self, masks, img_size, flip=False): + max_long_edge = max(img_size) + max_short_edge = min(img_size) + masks = [ + cvb.resize_keep_ar( + mask, + max_long_edge, + max_short_edge, + interpolation=cvb.INTER_NEAREST) for mask in masks + ] + masks = np.stack(masks, axis=0) + if flip: + masks = masks[:, ::-1, :] + if self.pad_size is None: + pad_h = masks.shape[1] + pad_w = masks.shape[2] + else: + pad_size = self.pad_size if self.pad_size > 0 else max_long_edge + pad_h = pad_w = pad_size + padded_masks = np.zeros( + (self.max_num_gts, pad_h, pad_w), dtype=masks.dtype) + padded_masks[:masks.shape[0], :masks.shape[1], :masks.shape[2]] = masks + return padded_masks + + +class Numpy2Tensor(object): + + def __init__(self): + pass + + def __call__(self, *args): + if len(args) == 1: + return torch.from_numpy(args[0]) + else: + return tuple([torch.from_numpy(array) for array in args]) diff --git a/mmdet/datasets/utils/__init__.py b/mmdet/datasets/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..de3ea43bdf4e4cc526119054954fdd1acf811c38 --- /dev/null +++ b/mmdet/datasets/utils/__init__.py @@ -0,0 +1,2 @@ +from .data_container import DataContainer +from .misc import * diff --git a/mmdet/datasets/utils/data_container.py b/mmdet/datasets/utils/data_container.py new file mode 100644 index 0000000000000000000000000000000000000000..c27beab37bbd28aeb37c1231b8ff94a335702216 --- /dev/null +++ b/mmdet/datasets/utils/data_container.py @@ -0,0 +1,80 @@ +import functools +from collections import Sequence + +import mmcv +import numpy as np +import torch + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. 
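The conversion rules implemented below can be summarized by a few checks (a sketch; `to_tensor` is the function that follows):

```python
import numpy as np
import torch

assert to_tensor(np.zeros((2, 3))).shape == (2, 3)   # ndarray -> Tensor
assert to_tensor([1, 2, 3]).tolist() == [1, 2, 3]    # sequence -> Tensor
assert to_tensor(1).dtype == torch.long              # int -> LongTensor([1])
assert to_tensor(0.5).dtype == torch.float32         # float -> FloatTensor([0.5])
```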
+ """ + if isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, torch.Tensor): + return data + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError('type {} cannot be converted to tensor.'.format( + type(data))) + + +def assert_tensor_type(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not isinstance(args[0].data, torch.Tensor): + raise AttributeError('{} has no attribute {} for type {}'.format( + args[0].__class__.__name__, func.__name__, args[0].datatype)) + return func(*args, **kwargs) + + return wrapper + + +class DataContainer(object): + + def __init__(self, data, stack=False, padding_value=0): + if isinstance(data, list): + self._data = data + else: + self._data = to_tensor(data) + self._stack = stack + self._padding_value = padding_value + + def __repr__(self): + return '{}({})'.format(self.__class__.__name__, repr(self.data)) + + @property + def data(self): + return self._data + + @property + def datatype(self): + if isinstance(self.data, torch.Tensor): + return self.data.type() + else: + return type(self.data) + + @property + def stack(self): + return self._stack + + @property + def padding_value(self): + return self._padding_value + + @assert_tensor_type + def size(self, *args, **kwargs): + return self.data.size(*args, **kwargs) + + @assert_tensor_type + def dim(self): + return self.data.dim() diff --git a/mmdet/datasets/utils/misc.py b/mmdet/datasets/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..419c11ad08462268b9dfe6b43182a9ec4725b00c --- /dev/null +++ b/mmdet/datasets/utils/misc.py @@ -0,0 +1,62 @@ +import mmcv + +import matplotlib.pyplot as plt +import numpy as np +import pycocotools.mask as maskUtils + + +def random_scale(img_scales, mode='range'): + """Randomly select a scale from a list of scales or scale ranges. + + Args: + img_scales (list[tuple]): Image scale or scale range. + mode (str): "range" or "value". + + Returns: + tuple: Sampled image scale. 
+ """ + num_scales = len(img_scales) + if num_scales == 1: # fixed scale is specified + img_scale = img_scales[0] + elif num_scales == 2: # randomly sample a scale + if mode == 'range': + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + elif mode == 'value': + img_scale = img_scales[np.random.randint(num_scales)] + else: + if mode != 'value': + raise ValueError( + 'Only "value" mode supports more than 2 image scales') + img_scale = img_scales[np.random.randint(num_scales)] + return img_scale + + +def show_ann(coco, img, ann_info): + plt.imshow(mmcv.bgr2rgb(img)) + plt.axis('off') + coco.showAnns(ann_info) + plt.show() + + +def draw_bbox_and_segm(img, results, dataset, score_thr=0.5): + bbox_results, segm_results = results + hi_bboxes = [] + for cls_bboxes, cls_segms in zip(bbox_results, segm_results): + if len(cls_bboxes) == 0: + hi_bboxes.append(cls_bboxes) + continue + inds = np.where(cls_bboxes[:, -1] > score_thr)[0] + hi_bboxes.append(cls_bboxes[inds, :]) + color_mask = np.random.random((1, 3)) + for i in inds: + mask = maskUtils.decode(cls_segms[i]).astype(np.bool) + img[mask] = img[mask] * 0.5 + color_mask * 0.5 + mmcv.draw_bboxes_with_label(np.ascontiguousarray(img), hi_bboxes, dataset) diff --git a/mmdet/models/__init__.py b/mmdet/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mmdet/models/backbones/__init__.py b/mmdet/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f9e21e83d1469167d35de22c6511f6c09c260727 --- /dev/null +++ b/mmdet/models/backbones/__init__.py @@ -0,0 +1 @@ +from .resnet import resnet diff --git a/mmdet/models/backbones/resnet.py b/mmdet/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f8203accd4b335886b7ebffd59517bdc8568769e --- /dev/null +++ b/mmdet/models/backbones/resnet.py @@ -0,0 +1,325 @@ +import math +import torch.nn as nn +import torch.utils.checkpoint as cp +from torchpack import load_checkpoint + + +def conv3x3(in_planes, out_planes, stride=1, dilation=1): + "3x3 convolution with padding" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='fb'): + super(BasicBlock, self).__init__() + self.conv1 = conv3x3(inplanes, planes, stride, dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='fb', + with_cp=False): + """Bottleneck block + if style is "fb", the stride-two layer is the 3x3 conv 
layer, + if style is "msra", the stride-two layer is the first 1x1 conv layer + """ + super(Bottleneck, self).__init__() + assert style in ['fb', 'msra'] + if style == 'fb': + conv1_stride = 1 + conv2_stride = stride + else: + conv1_stride = stride + conv2_stride = 1 + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block, + inplanes, + planes, + blocks, + stride=1, + dilation=1, + style='fb', + with_cp=False): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + with_cp=with_cp)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) + + return nn.Sequential(*layers) + + +class ResHead(nn.Module): + + def __init__(self, block, num_blocks, stride=2, dilation=1, style='fb'): + self.layer4 = make_res_layer( + block, + 1024, + 512, + num_blocks, + stride=stride, + dilation=dilation, + style=style) + + def forward(self, x): + return self.layer4(x) + + +class ResNet(nn.Module): + + def __init__(self, + block, + layers, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + style='fb', + sync_bn=False, + with_cp=False): + super(ResNet, self).__init__() + if not len(layers) == len(strides) == len(dilations): + raise ValueError( + 'The number of layers, strides and dilations must be equal, ' + 'but found have {} layers, {} strides and {} dilations'.format( + len(layers), len(strides), len(dilations))) + assert max(out_indices) < len(layers) + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.style = style + self.sync_bn = sync_bn + self.inplanes = 64 + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.res_layers = [] + for i, num_blocks in enumerate(layers): + + stride = strides[i] + dilation = dilations[i] + + layer_name = 'layer{}'.format(i + 1) + planes = 64 * 2**i + res_layer = make_res_layer( + block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + 
style=self.style, + with_cp=with_cp) + self.inplanes = planes * block.expansion + setattr(self, layer_name, res_layer) + self.res_layers.append(layer_name) + self.feat_dim = block.expansion * 64 * 2**(len(layers) - 1) + self.with_cp = with_cp + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + load_checkpoint(self, pretrained, strict=False) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + nn.init.normal_(m.weight, 0, math.sqrt(2. / n)) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode=True): + super(ResNet, self).train(mode) + if not self.sync_bn: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if mode and self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for param in self.bn1.parameters(): + param.requires_grad = False + self.bn1.eval() + self.bn1.weight.requires_grad = False + self.bn1.bias.requires_grad = False + for i in range(1, self.frozen_stages + 1): + mod = getattr(self, 'layer{}'.format(i)) + mod.eval() + for param in mod.parameters(): + param.requires_grad = False + + +resnet_cfg = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) +} + + +def resnet(depth, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(2, ), + frozen_stages=-1, + style='fb', + sync_bn=False, + with_cp=False): + """Constructs a ResNet model. 
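A hedged construction example for the factory below (input size invented; `out_indices` counts stages from 0):

```python
import torch

backbone = resnet(depth=50, out_indices=(0, 1, 2, 3), frozen_stages=1)
backbone.init_weights(pretrained=None)
feats = backbone(torch.randn(1, 3, 224, 224))
# feats is a tuple of 4 maps with strides 4, 8, 16, 32 and
# channels 256, 512, 1024, 2048 (Bottleneck expansion = 4)
```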
+ + Args: + depth (int): depth of resnet, from {18, 34, 50, 101, 152} + num_stages (int): num of resnet stages, normally 4 + strides (list): strides of the first block of each stage + dilations (list): dilation of each stage + out_indices (list): output from which stages + """ + if depth not in resnet_cfg: + raise KeyError('invalid depth {} for resnet'.format(depth)) + block, layers = resnet_cfg[depth] + model = ResNet(block, layers[:num_stages], strides, dilations, out_indices, + frozen_stages, style, sync_bn, with_cp) + return model diff --git a/mmdet/models/bbox_heads/__init__.py b/mmdet/models/bbox_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e6709af6176d5d574bf7f4a5bdf8e67691787536 --- /dev/null +++ b/mmdet/models/bbox_heads/__init__.py @@ -0,0 +1,3 @@ +from .bbox_head import BBoxHead + +__all__ = ['BBoxHead'] diff --git a/mmdet/models/bbox_heads/bbox_head.py b/mmdet/models/bbox_heads/bbox_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9f0c188a459286ee5c0e5ab71f8305da0d1ab761 --- /dev/null +++ b/mmdet/models/bbox_heads/bbox_head.py @@ -0,0 +1,123 @@ +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.core import (bbox_transform_inv, bbox_target, multiclass_nms, + weighted_cross_entropy, weighted_smoothl1, accuracy) + + +class BBoxHead(nn.Module): + """Simplest RoI head, with only two fc layers for classification and + regression respectively""" + + def __init__(self, + exclude_mal_box=True, + with_avg_pool=False, + with_cls=True, + with_reg=True, + roi_feat_size=7, + in_channels=256, + num_classes=81, + target_means=[0., 0., 0., 0.], + target_stds=[0.1, 0.1, 0.2, 0.2], + reg_class_agnostic=False): + super(BBoxHead, self).__init__() + assert with_cls or with_reg + self.with_avg_pool = with_avg_pool + self.with_cls = with_cls + self.with_reg = with_reg + self.roi_feat_size = roi_feat_size + self.in_channels = in_channels + self.num_classes = num_classes + self.target_means = target_means + self.target_stds = target_stds + self.reg_class_agnostic = reg_class_agnostic + self.exclude_mal_box = exclude_mal_box + + in_channels = self.in_channels + if self.with_avg_pool: + self.avg_pool = nn.AvgPool2d(roi_feat_size) + else: + in_channels *= (self.roi_feat_size * self.roi_feat_size) + if self.with_cls: + self.fc_cls = nn.Linear(in_channels, num_classes) + if self.with_reg: + out_dim_reg = 4 if reg_class_agnostic else 4 * num_classes + self.fc_reg = nn.Linear(in_channels, out_dim_reg) + self.debug_imgs = None + + def init_weights(self): + if self.with_cls: + nn.init.normal_(self.fc_cls.weight, 0, 0.01) + nn.init.constant_(self.fc_cls.bias, 0) + if self.with_reg: + nn.init.normal_(self.fc_reg.weight, 0, 0.001) + nn.init.constant_(self.fc_reg.bias, 0) + + def forward(self, x): + if self.with_avg_pool: + x = self.avg_pool(x) + x = x.view(x.size(0), -1) + cls_score = self.fc_cls(x) if self.with_cls else None + bbox_pred = self.fc_reg(x) if self.with_reg else None + return cls_score, bbox_pred + + def bbox_target(self, pos_proposals, neg_proposals, pos_gt_bboxes, + pos_gt_labels, rcnn_train_cfg): + reg_num_classes = 1 if self.reg_class_agnostic else self.num_classes + cls_reg_targets = bbox_target( + pos_proposals, + neg_proposals, + pos_gt_bboxes, + pos_gt_labels, + self.target_means, + self.target_stds, + rcnn_train_cfg, + reg_num_classes, + debug_imgs=self.debug_imgs) + return cls_reg_targets + + def loss(self, cls_score, bbox_pred, labels, label_weights, bbox_targets, + bbox_weights): + losses = dict() + if 
cls_score is not None: + losses['loss_cls'] = weighted_cross_entropy( + cls_score, labels, label_weights) + losses['acc'] = accuracy(cls_score, labels) + if bbox_pred is not None: + losses['loss_reg'] = weighted_smoothl1( + bbox_pred, + bbox_targets, + bbox_weights, + ave_factor=bbox_targets.size(0)) + return losses + + def get_det_bboxes(self, + rois, + cls_score, + bbox_pred, + img_shape, + rescale=False, + nms_cfg=None): + if isinstance(cls_score, list): + cls_score = sum(cls_score) / float(len(cls_score)) + scores = F.softmax(cls_score, dim=1) if cls_score is not None else None + + if bbox_pred is not None: + bboxes = bbox_transform_inv(rois[:, 1:], bbox_pred, + self.target_means, self.target_stds, + img_shape) + else: + bboxes = rois[:, 1:] + # TODO: add clip here + + if rescale: + bboxes /= img_shape[-1] + + if nms_cfg is None: + return bboxes, scores + else: + det_bboxes, det_labels = multiclass_nms( + bboxes, scores, nms_cfg.score_thr, nms_cfg.nms_thr, + nms_cfg.max_per_img) + + return det_bboxes, det_labels diff --git a/mmdet/models/builder.py b/mmdet/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f109d851397a5106c33d173eda8986ee1c0f8b06 --- /dev/null +++ b/mmdet/models/builder.py @@ -0,0 +1,47 @@ +import mmcv +from torch import nn + +from . import (backbones, necks, roi_extractors, rpn_heads, bbox_heads, + mask_heads) + +__all__ = [ + 'build_backbone', 'build_neck', 'build_rpn_head', 'build_roi_extractor', + 'build_bbox_head', 'build_mask_head' +] + + +def _build_module(cfg, parrent=None): + return cfg if isinstance(cfg, nn.Module) else mmcv.obj_from_dict( + cfg, parrent) + + +def build(cfg, parrent=None): + if isinstance(cfg, list): + modules = [_build_module(cfg_, parrent) for cfg_ in cfg] + return nn.Sequential(*modules) + else: + return _build_module(cfg, parrent) + + +def build_backbone(cfg): + return build(cfg, backbones) + + +def build_neck(cfg): + return build(cfg, necks) + + +def build_rpn_head(cfg): + return build(cfg, rpn_heads) + + +def build_roi_extractor(cfg): + return build(cfg, roi_extractors) + + +def build_bbox_head(cfg): + return build(cfg, bbox_heads) + + +def build_mask_head(cfg): + return build(cfg, mask_heads) diff --git a/mmdet/models/common/__init__.py b/mmdet/models/common/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1a611c251065f2addc6c069d61c7e1f18fbd7da2 --- /dev/null +++ b/mmdet/models/common/__init__.py @@ -0,0 +1,4 @@ +from .conv_module import ConvModule +from .norm import build_norm_layer + +__all__ = ['ConvModule', 'build_norm_layer'] diff --git a/mmdet/models/common/conv_module.py b/mmdet/models/common/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..25121972da29d8e4e83fb2301b8f8d25a1727f7e --- /dev/null +++ b/mmdet/models/common/conv_module.py @@ -0,0 +1,95 @@ +import warnings + +import torch.nn as nn + +from .norm import build_norm_layer + + +class ConvModule(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + normalize=None, + activation='relu', + inplace=True, + activate_last=True): + super(ConvModule, self).__init__() + self.with_norm = normalize is not None + self.with_activatation = activation is not None + self.with_bias = bias + self.activation = activation + self.activate_last = activate_last + + if self.with_norm and self.with_bias: + warnings.warn('ConvModule has norm and bias at the same time') + + self.conv = nn.Conv2d( + 
in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation, + groups, + bias=bias) + + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = self.conv.padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_norm: + # self.norm_type, self.norm_params = parse_norm(normalize) + # assert self.norm_type in [None, 'BN', 'SyncBN', 'GN', 'SN'] + # self.Norm2d = norm_cfg[self.norm_type] + if self.activate_last: + self.norm = build_norm_layer(normalize, out_channels) + # self.norm = self.Norm2d(out_channels, **self.norm_params) + else: + self.norm = build_norm_layer(normalize, in_channels) + # self.norm = self.Norm2d(in_channels, **self.norm_params) + + if self.with_activatation: + assert activation in ['relu'], 'Only ReLU supported.' + if self.activation == 'relu': + self.activate = nn.ReLU(inplace=inplace) + + # Default using msra init + self.init_weights() + + def init_weights(self): + nonlinearity = 'relu' if self.activation is None else self.activation + nn.init.kaiming_normal_( + self.conv.weight, mode='fan_out', nonlinearity=nonlinearity) + if self.with_bias: + nn.init.constant_(self.conv.bias, 0) + if self.with_norm: + nn.init.constant_(self.norm.weight, 1) + nn.init.constant_(self.norm.bias, 0) + + def forward(self, x, activate=True, norm=True): + if self.activate_last: + x = self.conv(x) + if norm and self.with_norm: + x = self.norm(x) + if activate and self.with_activatation: + x = self.activate(x) + else: + if norm and self.with_norm: + x = self.norm(x) + if activate and self.with_activatation: + x = self.activate(x) + x = self.conv(x) + return x diff --git a/mmdet/models/common/norm.py b/mmdet/models/common/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..7b82cd046e82e8ece24c5552687ae2952cfd9932 --- /dev/null +++ b/mmdet/models/common/norm.py @@ -0,0 +1,17 @@ +import torch.nn as nn + +norm_cfg = {'BN': nn.BatchNorm2d, 'SyncBN': None, 'GN': None} + + +def build_norm_layer(cfg, num_features): + assert isinstance(cfg, dict) and 'type' in cfg + cfg_ = cfg.copy() + cfg_.setdefault('eps', 1e-5) + layer_type = cfg_.pop('type') + + if layer_type not in norm_cfg: + raise KeyError('Unrecognized norm type {}'.format(layer_type)) + elif norm_cfg[layer_type] is None: + raise NotImplementedError + + return norm_cfg[layer_type](num_features, **cfg_) diff --git a/mmdet/models/detectors/__init__.py b/mmdet/models/detectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mmdet/models/detectors/rpn.py b/mmdet/models/detectors/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..6d80c9d9b10a12c07155f11ab00b24542f805cc6 --- /dev/null +++ b/mmdet/models/detectors/rpn.py @@ -0,0 +1,100 @@ +import torch.nn as nn + +from mmdet.core import tensor2imgs, merge_aug_proposals, bbox_mapping +from .. 
import builder + + +class RPN(nn.Module): + + def __init__(self, + backbone, + neck, + rpn_head, + rpn_train_cfg, + rpn_test_cfg, + pretrained=None): + super(RPN, self).__init__() + self.backbone = builder.build_backbone(backbone) + self.neck = builder.build_neck(neck) if neck is not None else None + self.rpn_head = builder.build_rpn_head(rpn_head) + self.rpn_train_cfg = rpn_train_cfg + self.rpn_test_cfg = rpn_test_cfg + self.init_weights(pretrained=pretrained) + + def init_weights(self, pretrained=None): + if pretrained is not None: + print('load model from: {}'.format(pretrained)) + self.backbone.init_weights(pretrained=pretrained) + if self.neck is not None: + self.neck.init_weights() + self.rpn_head.init_weights() + + def forward(self, + img, + img_meta, + gt_bboxes=None, + return_loss=True, + return_bboxes=False, + rescale=False): + if not return_loss: + return self.test(img, img_meta, rescale) + + img_shapes = img_meta['shape_scale'] + + if self.rpn_train_cfg.get('debug', False): + self.rpn_head.debug_imgs = tensor2imgs(img) + + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + rpn_outs = self.rpn_head(x) + + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_shapes, + self.rpn_train_cfg) + losses = self.rpn_head.loss(*rpn_loss_inputs) + return losses + + def test(self, imgs, img_metas, rescale=False): + """Test w/ or w/o augmentations.""" + assert isinstance(imgs, list) and isinstance(img_metas, list) + assert len(imgs) == len(img_metas) + img_per_gpu = imgs[0].size(0) + assert img_per_gpu == 1 + if len(imgs) == 1: + return self.simple_test(imgs[0], img_metas[0], rescale) + else: + return self.aug_test(imgs, img_metas, rescale) + + def simple_test(self, img, img_meta, rescale=False): + img_shapes = img_meta['shape_scale'] + # get feature maps + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg) + proposals = self.rpn_head.get_proposals(*proposal_inputs)[0] + if rescale: + proposals[:, :4] /= img_shapes[0][-1] + return proposals.cpu().numpy() + + def aug_test(self, imgs, img_metas, rescale=False): + aug_proposals = [] + for img, img_meta in zip(imgs, img_metas): + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_meta['shape_scale'], + self.rpn_test_cfg) + proposal_list = self.rpn_head.get_proposals(*proposal_inputs) + assert len(proposal_list) == 1 + aug_proposals.append(proposal_list[0]) # len(proposal_list) = 1 + merged_proposals = merge_aug_proposals(aug_proposals, img_metas, + self.rpn_test_cfg) + if not rescale: + img_shape = img_metas[0]['shape_scale'][0] + flip = img_metas[0]['flip'][0] + merged_proposals[:, :4] = bbox_mapping(merged_proposals[:, :4], + img_shape, flip) + return merged_proposals.cpu().numpy() diff --git a/mmdet/models/detectors/two_stage.py b/mmdet/models/detectors/two_stage.py new file mode 100644 index 0000000000000000000000000000000000000000..0c057d606fba6c322733490591d5352a42b426a5 --- /dev/null +++ b/mmdet/models/detectors/two_stage.py @@ -0,0 +1,329 @@ +import torch +import torch.nn as nn + +from .. 
import builder
+from mmdet.core.utils import tensor2imgs
+from mmdet.core import (bbox2roi, bbox_mapping, split_combined_gt_polys,
+                        bbox_sampling, multiclass_nms, merge_aug_proposals,
+                        merge_aug_bboxes, merge_aug_masks, bbox2result)
+
+
+class TwoStageDetector(nn.Module):
+
+    def __init__(self,
+                 backbone,
+                 neck,
+                 rpn_head,
+                 roi_block,
+                 bbox_head,
+                 rpn_train_cfg,
+                 rpn_test_cfg,
+                 rcnn_train_cfg,
+                 rcnn_test_cfg,
+                 mask_block=None,
+                 mask_head=None,
+                 pretrained=None):
+        super(TwoStageDetector, self).__init__()
+        self.backbone = builder.build_backbone(backbone)
+        self.neck = builder.build_neck(neck) if neck is not None else None
+        self.rpn_head = builder.build_rpn_head(rpn_head)
+        self.bbox_roi_extractor = builder.build_roi_extractor(roi_block)
+        self.bbox_head = builder.build_bbox_head(bbox_head)
+        self.mask_roi_extractor = builder.build_roi_extractor(mask_block) if (
+            mask_block is not None) else None
+        self.mask_head = builder.build_mask_head(mask_head) if (
+            mask_head is not None) else None
+        self.with_mask = False if self.mask_head is None else True
+
+        self.rpn_train_cfg = rpn_train_cfg
+        self.rpn_test_cfg = rpn_test_cfg
+        self.rcnn_train_cfg = rcnn_train_cfg
+        self.rcnn_test_cfg = rcnn_test_cfg
+        self.init_weights(pretrained=pretrained)
+
+    def init_weights(self, pretrained=None):
+        if pretrained is not None:
+            print('load model from: {}'.format(pretrained))
+        self.backbone.init_weights(pretrained=pretrained)
+        if self.neck is not None:
+            if isinstance(self.neck, nn.Sequential):
+                for m in self.neck:
+                    m.init_weights()
+            else:
+                self.neck.init_weights()
+        self.rpn_head.init_weights()
+        self.bbox_roi_extractor.init_weights()
+        self.bbox_head.init_weights()
+        if self.mask_roi_extractor is not None:
+            self.mask_roi_extractor.init_weights()
+        if self.mask_head is not None:
+            self.mask_head.init_weights()
+
+    def forward(self,
+                img,
+                img_meta,
+                gt_bboxes=None,
+                gt_labels=None,
+                gt_ignore=None,
+                gt_polys=None,
+                gt_poly_lens=None,
+                num_polys_per_mask=None,
+                return_loss=True,
+                return_bboxes=False,
+                rescale=False):
+        if not return_loss:
+            return self.test(img, img_meta, rescale)
+
+        if not self.with_mask:
+            assert (gt_polys is None and gt_poly_lens is None
+                    and num_polys_per_mask is None)
+        else:
+            assert (gt_polys is not None and gt_poly_lens is not None
+                    and num_polys_per_mask is not None)
+            gt_polys = split_combined_gt_polys(gt_polys, gt_poly_lens,
+                                               num_polys_per_mask)
+
+        if self.rpn_train_cfg.get('debug', False):
+            self.rpn_head.debug_imgs = tensor2imgs(img)
+        if self.rcnn_train_cfg.get('debug', False):
+            self.bbox_head.debug_imgs = tensor2imgs(img)
+            if self.mask_head is not None:
+                self.mask_head.debug_imgs = tensor2imgs(img)
+
+        img_shapes = img_meta['shape_scale']
+
+        x = self.backbone(img)
+        if self.neck is not None:
+            x = self.neck(x)
+
+        rpn_outs = self.rpn_head(x)
+        proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg)
+        proposal_list = self.rpn_head.get_proposals(*proposal_inputs)
+
+        (pos_inds, neg_inds, pos_proposals, neg_proposals,
+         pos_assigned_gt_inds, pos_gt_bboxes, pos_gt_labels) = bbox_sampling(
+             proposal_list, gt_bboxes, gt_ignore, gt_labels,
+             self.rcnn_train_cfg)
+
+        labels, label_weights, bbox_targets, bbox_weights = \
+            self.bbox_head.proposal_target(
+                pos_proposals, neg_proposals, pos_gt_bboxes, pos_gt_labels,
+                self.rcnn_train_cfg)
+
+        rois = bbox2roi([
+            torch.cat([pos, neg], dim=0)
+            for pos, neg in zip(pos_proposals, neg_proposals)
+        ])
+        # TODO: a more flexible way to configure feat maps
+        roi_feats = self.bbox_roi_extractor(
x[:self.bbox_roi_extractor.num_inputs], rois) + cls_score, bbox_pred = self.bbox_head(roi_feats) + + losses = dict() + rpn_loss_inputs = rpn_outs + (gt_bboxes, img_shapes, + self.rpn_train_cfg) + rpn_losses = self.rpn_head.loss(*rpn_loss_inputs) + losses.update(rpn_losses) + + loss_bbox = self.bbox_head.loss(cls_score, bbox_pred, labels, + label_weights, bbox_targets, + bbox_weights) + losses.update(loss_bbox) + + if self.with_mask: + mask_targets = self.mask_head.mask_target( + pos_proposals, pos_assigned_gt_inds, gt_polys, img_shapes, + self.rcnn_train_cfg) + pos_rois = bbox2roi(pos_proposals) + mask_feats = self.mask_roi_extractor( + x[:self.mask_roi_extractor.num_inputs], pos_rois) + mask_pred = self.mask_head(mask_feats) + losses['loss_mask'] = self.mask_head.loss(mask_pred, mask_targets, + torch.cat(pos_gt_labels)) + return losses + + def test(self, imgs, img_metas, rescale=False): + """Test w/ or w/o augmentations.""" + assert isinstance(imgs, list) and isinstance(img_metas, list) + assert len(imgs) == len(img_metas) + img_per_gpu = imgs[0].size(0) + assert img_per_gpu == 1 + if len(imgs) == 1: + return self.simple_test(imgs[0], img_metas[0], rescale) + else: + return self.aug_test(imgs, img_metas, rescale) + + def simple_test_bboxes(self, x, img_meta, rescale=False): + """Test only det bboxes without augmentation.""" + + img_shapes = img_meta['shape_scale'] + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_shapes, self.rpn_test_cfg) + proposal_list = self.rpn_head.get_proposals(*proposal_inputs) + + rois = bbox2roi(proposal_list) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + cls_score, bbox_pred = self.bbox_head(roi_feats) + # image shape of the first image in the batch (only one) + img_shape = img_shapes[0] + det_bboxes, det_labels = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + rescale=rescale, + nms_cfg=self.rcnn_test_cfg) + return det_bboxes, det_labels + + def simple_test_mask(self, + x, + img_meta, + det_bboxes, + det_labels, + rescale=False): + # image shape of the first image in the batch (only one) + img_shape = img_meta['shape_scale'][0] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes - 1)] + else: + # if det_bboxes is rescaled to the original image size, we need to + # rescale it back to the testing scale to obtain RoIs. + _bboxes = (det_bboxes[:, :4] * img_shape[-1] + if rescale else det_bboxes) + mask_rois = bbox2roi([_bboxes]) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], mask_rois) + mask_pred = self.mask_head(mask_feats) + segm_result = self.mask_head.get_seg_masks( + mask_pred, det_bboxes, det_labels, img_shape, + self.rcnn_test_cfg, rescale) + return segm_result + + def simple_test(self, img, img_meta, rescale=False): + """Test without augmentation.""" + # get feature maps + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + det_bboxes, det_labels = self.simple_test_bboxes( + x, img_meta, rescale=rescale) + bbox_result = bbox2result(det_bboxes, det_labels, + self.bbox_head.num_classes) + if not self.with_mask: + return bbox_result + + segm_result = self.simple_test_mask( + x, img_meta, det_bboxes, det_labels, rescale=rescale) + + return bbox_result, segm_result + + def aug_test_bboxes(self, imgs, img_metas): + """Test with augmentations for det bboxes.""" + # step 1: get RPN proposals for augmented images, apply NMS to the + # union of all proposals. 
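Note: merge_aug_proposals is imported from mmdet.core but not defined in this patch. A minimal sketch of the step-1 merge described above, assuming every entry of aug_proposals is an (n, 5) tensor of [x1, y1, x2, y2, score] already mapped back to the original image frame (the real helper also consumes img_metas to undo flips and rescaling), reusing the nms wrapper from mmdet.ops:

    import torch

    from mmdet.ops import nms

    def merge_aug_proposals_sketch(aug_proposals, rpn_test_cfg):
        # Union of all augmented proposals, deduplicated by NMS; keep the
        # top max_num survivors by score. Illustrative only.
        merged = torch.cat(aug_proposals, dim=0)
        keep = nms(merged, rpn_test_cfg.nms_thr)
        merged = merged[keep, :]
        num = min(rpn_test_cfg.max_num, merged.shape[0])
        _, order = merged[:, 4].sort(0, descending=True)
        return merged[order[:num], :]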
+ aug_proposals = [] + for img, img_meta in zip(imgs, img_metas): + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + rpn_outs = self.rpn_head(x) + proposal_inputs = rpn_outs + (img_meta['shape_scale'], + self.rpn_test_cfg) + proposal_list = self.rpn_head.get_proposals(*proposal_inputs) + assert len(proposal_list) == 1 + aug_proposals.append(proposal_list[0]) # len(proposal_list) = 1 + # after merging, proposals will be rescaled to the original image size + merged_proposals = merge_aug_proposals(aug_proposals, img_metas, + self.rpn_test_cfg) + # step 2: Given merged proposals, predict bboxes for augmented images, + # output the union of these bboxes. + aug_bboxes = [] + aug_scores = [] + for img, img_meta in zip(imgs, img_metas): + # only one image in the batch + img_shape = img_meta['shape_scale'][0] + flip = img_meta['flip'][0] + proposals = bbox_mapping(merged_proposals[:, :4], img_shape, flip) + rois = bbox2roi([proposals]) + # recompute feature maps to save GPU memory + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + roi_feats = self.bbox_roi_extractor( + x[:len(self.bbox_roi_extractor.featmap_strides)], rois) + cls_score, bbox_pred = self.bbox_head(roi_feats) + bboxes, scores = self.bbox_head.get_det_bboxes( + rois, + cls_score, + bbox_pred, + img_shape, + rescale=False, + nms_cfg=None) + aug_bboxes.append(bboxes) + aug_scores.append(scores) + # after merging, bboxes will be rescaled to the original image size + merged_bboxes, merged_scores = merge_aug_bboxes( + aug_bboxes, aug_scores, img_metas, self.rcnn_test_cfg) + det_bboxes, det_labels = multiclass_nms( + merged_bboxes, merged_scores, self.rcnn_test_cfg.score_thr, + self.rcnn_test_cfg.nms_thr, self.rcnn_test_cfg.max_per_img) + return det_bboxes, det_labels + + def aug_test_mask(self, + imgs, + img_metas, + det_bboxes, + det_labels, + rescale=False): + # step 3: Given merged bboxes, predict masks for augmented images, + # scores of masks are averaged across augmented images. + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= img_metas[0]['shape_scale'][0][-1] + if det_bboxes.shape[0] == 0: + segm_result = [[] for _ in range(self.mask_head.num_classes - 1)] + else: + aug_masks = [] + for img, img_meta in zip(imgs, img_metas): + img_shape = img_meta['shape_scale'][0] + flip = img_meta['flip'][0] + _bboxes = bbox_mapping(det_bboxes[:, :4], img_shape, flip) + mask_rois = bbox2roi([_bboxes]) + x = self.backbone(img) + if self.neck is not None: + x = self.neck(x) + mask_feats = self.mask_roi_extractor( + x[:len(self.mask_roi_extractor.featmap_strides)], + mask_rois) + mask_pred = self.mask_head(mask_feats) + # convert to numpy array to save memory + aug_masks.append(mask_pred.sigmoid().cpu().numpy()) + merged_masks = merge_aug_masks(aug_masks, img_metas, + self.rcnn_test_cfg) + segm_result = self.mask_head.get_seg_masks( + merged_masks, _det_bboxes, det_labels, + img_metas[0]['shape_scale'][0], self.rcnn_test_cfg, rescale) + return segm_result + + def aug_test(self, imgs, img_metas, rescale=False): + """Test with augmentations. + If rescale is False, then returned bboxes and masks will fit the scale + if imgs[0]. 
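Note: bbox_mapping and its inverse, used throughout the augmented test paths above, project boxes between the original frame and an augmented view via a rescale plus an optional horizontal flip. A sketch of the flip half under that convention, assuming img_shape carries (h, w, scale) and using the same inclusive-coordinate (w - 1 - x) arithmetic as the rest of this patch:

    import torch

    def flip_boxes_sketch(bboxes, img_shape):
        # Mirror [x1, y1, x2, y2] boxes horizontally inside an image of
        # width img_shape[1]; x1 and x2 swap roles under the flip.
        w = img_shape[1]
        flipped = bboxes.clone()
        flipped[:, 0] = w - 1 - bboxes[:, 2]
        flipped[:, 2] = w - 1 - bboxes[:, 0]
        return flipped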
+ """ + # aug test det bboxes + det_bboxes, det_labels = self.aug_test_bboxes(imgs, img_metas) + if rescale: + _det_bboxes = det_bboxes + else: + _det_bboxes = det_bboxes.clone() + _det_bboxes[:, :4] *= img_metas[0]['shape_scale'][0][-1] + bbox_result = bbox2result(_det_bboxes, det_labels, + self.bbox_head.num_classes) + if not self.with_mask: + return bbox_result + segm_result = self.aug_test_mask( + imgs, img_metas, det_bboxes, det_labels, rescale=rescale) + return bbox_result, segm_result diff --git a/mmdet/models/mask_heads/__init__.py b/mmdet/models/mask_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a21ae9add5a78d23781bf36a696b28606e19b0ce --- /dev/null +++ b/mmdet/models/mask_heads/__init__.py @@ -0,0 +1,3 @@ +from .fcn_mask_head import FCNMaskHead + +__all__ = ['FCNMaskHead'] diff --git a/mmdet/models/mask_heads/fcn_mask_head.py b/mmdet/models/mask_heads/fcn_mask_head.py new file mode 100644 index 0000000000000000000000000000000000000000..28865a68f006a4cd04753a1eb6caeda9ce3fc284 --- /dev/null +++ b/mmdet/models/mask_heads/fcn_mask_head.py @@ -0,0 +1,175 @@ +import mmcv +import numpy as np +import pycocotools.mask as mask_util +import torch +import torch.nn as nn +import torch.utils.checkpoint as cp + +from ..common import ConvModule +from mmdet.core import mask_target, mask_cross_entropy + + +class FCNMaskHead(nn.Module): + + def __init__(self, + num_convs=4, + roi_feat_size=14, + in_channels=256, + conv_kernel_size=3, + conv_out_channels=256, + upsample_method='deconv', + upsample_ratio=2, + num_classes=81, + class_agnostic=False, + with_cp=False, + normalize=None): + super(FCNMaskHead, self).__init__() + if upsample_method not in [None, 'deconv', 'nearest', 'bilinear']: + raise ValueError( + 'Invalid upsample method {}, accepted methods ' + 'are "deconv", "nearest", "bilinear"'.format(upsample_method)) + self.num_convs = num_convs + self.roi_feat_size = roi_feat_size # WARN: not used and reserved + self.in_channels = in_channels + self.conv_kernel_size = conv_kernel_size + self.conv_out_channels = conv_out_channels + self.upsample_method = upsample_method + self.upsample_ratio = upsample_ratio + self.num_classes = num_classes + self.class_agnostic = class_agnostic + self.normalize = normalize + self.with_bias = normalize is None + self.with_cp = with_cp + + self.convs = nn.ModuleList() + for i in range(self.num_convs): + in_channels = (self.in_channels + if i == 0 else self.conv_out_channels) + padding = (self.conv_kernel_size - 1) // 2 + self.convs.append( + ConvModule( + in_channels, + self.conv_out_channels, + 3, + padding=padding, + normalize=normalize, + bias=self.with_bias)) + if self.upsample_method is None: + self.upsample = None + elif self.upsample_method == 'deconv': + self.upsample = nn.ConvTranspose2d( + self.conv_out_channels, + self.conv_out_channels, + self.upsample_ratio, + stride=self.upsample_ratio) + else: + self.upsample = nn.Upsample( + scale_factor=self.upsample_ratio, mode=self.upsample_method) + + out_channels = 1 if self.class_agnostic else self.num_classes + self.conv_logits = nn.Conv2d(self.conv_out_channels, out_channels, 1) + self.relu = nn.ReLU(inplace=True) + self.debug_imgs = None + + def init_weights(self): + for m in [self.upsample, self.conv_logits]: + if m is None: + continue + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + nn.init.constant_(m.bias, 0) + + def convs_forward(self, x): + + def m_lvl_convs_forward(x): + for conv in self.convs[1:-1]: + x = conv(x) + return x + + 
if self.num_convs > 0: + x = self.convs[0](x) + if self.num_convs > 1: + if self.with_cp and x.requires_grad: + x = cp.checkpoint(m_lvl_convs_forward, x) + else: + x = m_lvl_convs_forward(x) + x = self.convs[-1](x) + return x + + def forward(self, x): + x = self.convs_forward(x) + if self.upsample is not None: + x = self.upsample(x) + if self.upsample_method == 'deconv': + x = self.relu(x) + mask_pred = self.conv_logits(x) + return mask_pred + + def mask_target(self, pos_proposals, pos_assigned_gt_inds, gt_masks, + img_shapes, rcnn_train_cfg): + mask_targets = mask_target(pos_proposals, pos_assigned_gt_inds, + gt_masks, img_shapes, rcnn_train_cfg) + return mask_targets + + def loss(self, mask_pred, mask_targets, labels): + loss_mask = mask_cross_entropy(mask_pred, mask_targets, labels) + return loss_mask + + def get_seg_masks(self, + mask_pred, + det_bboxes, + det_labels, + img_shape, + rcnn_test_cfg, + ori_scale, + rescale=True): + """Get segmentation masks from mask_pred and bboxes + Args: + mask_pred (Tensor or ndarray): shape (n, #class+1, h, w). + For single-scale testing, mask_pred is the direct output of + model, whose type is Tensor, while for multi-scale testing, + it will be converted to numpy array outside of this method. + det_bboxes (Tensor): shape (n, 4/5) + det_labels (Tensor): shape (n, ) + img_shape (Tensor): shape (3, ) + rcnn_test_cfg (dict): rcnn testing config + rescale (bool): whether rescale masks to original image size + Returns: + list[list]: encoded masks + """ + if isinstance(mask_pred, torch.Tensor): + mask_pred = mask_pred.sigmoid().cpu().numpy() + assert isinstance(mask_pred, np.ndarray) + cls_segms = [[] for _ in range(self.num_classes - 1)] + bboxes = det_bboxes.cpu().numpy()[:, :4] + labels = det_labels.cpu().numpy() + 1 + scale_factor = img_shape[-1] if rescale else 1.0 + img_h = ori_scale['height'] if rescale else np.round( + ori_scale['height'].item() * img_shape[-1].item()).astype(np.int32) + img_w = ori_scale['width'] if rescale else np.round( + ori_scale['width'].item() * img_shape[-1].item()).astype(np.int32) + + for i in range(bboxes.shape[0]): + bbox = (bboxes[i, :] / float(scale_factor)).astype(int) + label = labels[i] + w = bbox[2] - bbox[0] + 1 + h = bbox[3] - bbox[1] + 1 + w = max(w, 1) + h = max(h, 1) + + if not self.class_agnostic: + mask_pred_ = mask_pred[i, label, :, :] + else: + mask_pred_ = mask_pred[i, 0, :, :] + + im_mask = np.zeros((img_h, img_w), dtype=np.float32) + + im_mask[bbox[1]:bbox[1] + h, bbox[0]:bbox[0] + w] = mmcv.resize( + mask_pred_, (w, h)) + # im_mask = cv2.resize(im_mask, (img_w, img_h)) + im_mask = np.array( + im_mask > rcnn_test_cfg.mask_thr_binary, dtype=np.uint8) + rle = mask_util.encode( + np.array(im_mask[:, :, np.newaxis], order='F'))[0] + cls_segms[label - 1].append(rle) + return cls_segms diff --git a/mmdet/models/misc.py b/mmdet/models/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..ad52b587ac126ed2cfbf5e2ed5c98356e1499c5f --- /dev/null +++ b/mmdet/models/misc.py @@ -0,0 +1,9 @@ +from functools import partial + +from six.moves import map, zip + + +def multi_apply(func, *args, **kwargs): + pfunc = partial(func, **kwargs) if kwargs else func + map_results = map(pfunc, *args) + return tuple(map(list, zip(*map_results))) diff --git a/mmdet/models/necks/__init__.py b/mmdet/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0093021ebac1e46fbb798ed6ee96a192dbd8604c --- /dev/null +++ b/mmdet/models/necks/__init__.py @@ -0,0 +1,3 @@ +from .fpn 
import FPN + +__all__ = ['FPN'] diff --git a/mmdet/models/necks/fpn.py b/mmdet/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..c4734e18621bec4cdb8e33052935c6d7f3a495e2 --- /dev/null +++ b/mmdet/models/necks/fpn.py @@ -0,0 +1,125 @@ +import torch.nn as nn +import torch.nn.functional as F +from ..common import ConvModule +from ..weight_init import xavier_init + + +class FPN(nn.Module): + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + normalize=None, + activation=None): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.activation = activation + self.with_bias = normalize is None + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = ConvModule( + in_channels[i], + out_channels, + 1, + normalize=normalize, + bias=self.with_bias, + activation=self.activation, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + normalize=normalize, + bias=self.with_bias, + activation=self.activation, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # lvl_id = i - self.start_level + # setattr(self, 'lateral_conv{}'.format(lvl_id), l_conv) + # setattr(self, 'fpn_conv{}'.format(lvl_id), fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + in_channels = (self.in_channels[self.backbone_end_level - 1] + if i == 0 else out_channels) + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + normalize=normalize, + bias=self.with_bias, + activation=self.activation, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + laterals[i - 1] += F.upsample( + laterals[i], scale_factor=2, mode='nearest') + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + orig = 
inputs[self.backbone_end_level - 1] + outs.append(self.fpn_convs[used_backbone_levels](orig)) + for i in range(used_backbone_levels + 1, self.num_outs): + # BUG: we should add relu before each extra conv + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/mmdet/models/roi_extractors/__init__.py b/mmdet/models/roi_extractors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e76e689753f10e87b3f6d9482e880b902f9b747e --- /dev/null +++ b/mmdet/models/roi_extractors/__init__.py @@ -0,0 +1,3 @@ +from .single_level import SingleLevelRoI + +__all__ = ['SingleLevelRoI'] diff --git a/mmdet/models/roi_extractors/single_level.py b/mmdet/models/roi_extractors/single_level.py new file mode 100644 index 0000000000000000000000000000000000000000..3e37ac83d6ffb7beab56926329f71311f7eef116 --- /dev/null +++ b/mmdet/models/roi_extractors/single_level.py @@ -0,0 +1,73 @@ +from __future__ import division + +import torch +import torch.nn as nn + +from mmdet import ops + + +class SingleLevelRoI(nn.Module): + """Extract RoI features from a single level feature map. Each RoI is + mapped to a level according to its scale.""" + + def __init__(self, + roi_layer, + out_channels, + featmap_strides, + finest_scale=56): + super(SingleLevelRoI, self).__init__() + self.roi_layers = self.build_roi_layers(roi_layer, featmap_strides) + self.out_channels = out_channels + self.featmap_strides = featmap_strides + self.finest_scale = finest_scale + + @property + def num_inputs(self): + return len(self.featmap_strides) + + def init_weights(self): + pass + + def build_roi_layers(self, layer_cfg, featmap_strides): + cfg = layer_cfg.copy() + layer_type = cfg.pop('type') + assert hasattr(ops, layer_type) + layer_cls = getattr(ops, layer_type) + roi_layers = nn.ModuleList( + [layer_cls(spatial_scale=1 / s, **cfg) for s in featmap_strides]) + return roi_layers + + def map_roi_levels(self, rois, num_levels): + """Map rois to corresponding feature levels (0-based) by scales. + + scale < finest_scale: level 0 + finest_scale <= scale < finest_scale * 2: level 1 + finest_scale * 2 <= scale < finest_scale * 4: level 2 + scale >= finest_scale * 4: level 3 + """ + scale = torch.sqrt( + (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1)) + target_lvls = torch.floor(torch.log2(scale / self.finest_scale + 1e-6)) + target_lvls = target_lvls.clamp(min=0, max=num_levels - 1).long() + return target_lvls + + def forward(self, feats, rois): + """Extract roi features with the roi layer. If multiple feature levels + are used, then rois are mapped to corresponding levels according to + their scales. 
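Note: to make the level mapping above concrete: with finest_scale=56, a 112x112 RoI has scale sqrt(112 * 112) = 112, and floor(log2(112 / 56)) = 1, so it pools from level 1; a 500x375 RoI (scale ~433) gives floor(log2(433 / 56)) = 2, level 2. A standalone check of the same formula, assuming rois rows are [batch_ind, x1, y1, x2, y2]:

    import torch

    def map_roi_levels_check(rois, num_levels, finest_scale=56):
        # Same formula as SingleLevelRoI.map_roi_levels: assign each RoI to
        # a pyramid level by the log2 of its scale relative to finest_scale.
        scale = torch.sqrt(
            (rois[:, 3] - rois[:, 1] + 1) * (rois[:, 4] - rois[:, 2] + 1))
        target_lvls = torch.floor(torch.log2(scale / finest_scale + 1e-6))
        return target_lvls.clamp(min=0, max=num_levels - 1).long()

    rois = torch.tensor([[0., 0., 0., 111., 111.],   # 112x112 -> level 1
                         [0., 0., 0., 499., 374.]])  # scale ~433 -> level 2
    print(map_roi_levels_check(rois, num_levels=4))  # tensor([1, 2])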
+ """ + if len(feats) == 1: + return self.roi_layers[0](feats[0], rois) + + out_size = self.roi_layers[0].out_size + num_levels = len(feats) + target_lvls = self.map_roi_levels(rois, num_levels) + roi_feats = torch.cuda.FloatTensor(rois.size()[0], self.out_channels, + out_size, out_size).fill_(0) + for i in range(num_levels): + inds = target_lvls == i + if inds.any(): + rois_ = rois[inds, :] + roi_feats_t = self.roi_layers[i](feats[i], rois_) + roi_feats[inds] += roi_feats_t + return roi_feats diff --git a/mmdet/models/rpn_heads/__init__.py b/mmdet/models/rpn_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fbc4b3affbf31059fdcbb1b4b43eeb1544c631f0 --- /dev/null +++ b/mmdet/models/rpn_heads/__init__.py @@ -0,0 +1,3 @@ +from .rpn_head import RPNHead + +__all__ = ['RPNHead'] diff --git a/mmdet/models/rpn_heads/rpn_head.py b/mmdet/models/rpn_heads/rpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..f2fce9ebe7aa5c820139fa0188e2f6a25322ed66 --- /dev/null +++ b/mmdet/models/rpn_heads/rpn_head.py @@ -0,0 +1,237 @@ +from __future__ import division + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmdet.core import (AnchorGenerator, anchor_target, bbox_transform_inv, + weighted_cross_entropy, weighted_smoothl1, + weighted_binary_cross_entropy) +from mmdet.ops import nms +from ..misc import multi_apply +from ..weight_init import normal_init + + +class RPNHead(nn.Module): + + def __init__(self, + in_channels, + feat_channels=512, + coarsest_stride=32, + anchor_scales=[8, 16, 32], + anchor_ratios=[0.5, 1.0, 2.0], + anchor_strides=[4, 8, 16, 32, 64], + anchor_base_sizes=None, + target_means=(.0, .0, .0, .0), + target_stds=(1.0, 1.0, 1.0, 1.0), + use_sigmoid_cls=False): + super(RPNHead, self).__init__() + self.in_channels = in_channels + self.feat_channels = feat_channels + self.coarsest_stride = coarsest_stride + self.anchor_scales = anchor_scales + self.anchor_ratios = anchor_ratios + self.anchor_strides = anchor_strides + self.anchor_base_sizes = anchor_strides.copy( + ) if anchor_base_sizes is None else anchor_base_sizes + self.target_means = target_means + self.target_stds = target_stds + self.use_sigmoid_cls = use_sigmoid_cls + + self.anchor_generators = [] + for anchor_base in self.anchor_base_sizes: + self.anchor_generators.append( + AnchorGenerator(anchor_base, anchor_scales, anchor_ratios)) + self.rpn_conv = nn.Conv2d(in_channels, feat_channels, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales) + out_channels = (self.num_anchors + if self.use_sigmoid_cls else self.num_anchors * 2) + self.rpn_cls = nn.Conv2d(feat_channels, out_channels, 1) + self.rpn_reg = nn.Conv2d(feat_channels, self.num_anchors * 4, 1) + self.debug_imgs = None + + def init_weights(self): + normal_init(self.rpn_conv, std=0.01) + normal_init(self.rpn_cls, std=0.01) + normal_init(self.rpn_reg, std=0.01) + + def forward_single(self, x): + rpn_feat = self.relu(self.rpn_conv(x)) + rpn_cls_score = self.rpn_cls(rpn_feat) + rpn_bbox_pred = self.rpn_reg(rpn_feat) + return rpn_cls_score, rpn_bbox_pred + + def forward(self, feats): + return multi_apply(self.forward_single, feats) + + def get_anchors(self, featmap_sizes, img_shapes): + """Get anchors given a list of feature map sizes, and get valid flags + at the same time. 
(Extra padding regions should be marked as invalid) + """ + # calculate actual image shapes + padded_img_shapes = [] + for img_shape in img_shapes: + h, w = img_shape[:2] + padded_h = int( + np.ceil(h / self.coarsest_stride) * self.coarsest_stride) + padded_w = int( + np.ceil(w / self.coarsest_stride) * self.coarsest_stride) + padded_img_shapes.append((padded_h, padded_w)) + # generate anchors for different feature levels + # len = feature levels + anchor_list = [] + # len = imgs per gpu + valid_flag_list = [[] for _ in range(len(img_shapes))] + for i in range(len(featmap_sizes)): + anchor_stride = self.anchor_strides[i] + anchors = self.anchor_generators[i].grid_anchors( + featmap_sizes[i], anchor_stride) + anchor_list.append(anchors) + # for each image in this feature level, get valid flags + featmap_size = featmap_sizes[i] + for img_id, (h, w) in enumerate(padded_img_shapes): + valid_feat_h = min( + int(np.ceil(h / anchor_stride)), featmap_size[0]) + valid_feat_w = min( + int(np.ceil(w / anchor_stride)), featmap_size[1]) + flags = self.anchor_generators[i].valid_flags( + featmap_size, (valid_feat_h, valid_feat_w)) + valid_flag_list[img_id].append(flags) + return anchor_list, valid_flag_list + + def loss_single(self, rpn_cls_score, rpn_bbox_pred, labels, label_weights, + bbox_targets, bbox_weights, num_total_samples, cfg): + labels = labels.contiguous().view(-1) + label_weights = label_weights.contiguous().view(-1) + bbox_targets = bbox_targets.contiguous().view(-1, 4) + bbox_weights = bbox_weights.contiguous().view(-1, 4) + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.permute(0, 2, 3, + 1).contiguous().view(-1) + loss_cls = weighted_binary_cross_entropy( + rpn_cls_score, + labels, + label_weights, + ave_factor=num_total_samples) + else: + rpn_cls_score = rpn_cls_score.permute(0, 2, 3, + 1).contiguous().view(-1, 2) + loss_cls = weighted_cross_entropy( + rpn_cls_score, + labels, + label_weights, + ave_factor=num_total_samples) + rpn_bbox_pred = rpn_bbox_pred.permute(0, 2, 3, 1).contiguous().view( + -1, 4) + loss_reg = weighted_smoothl1( + rpn_bbox_pred, + bbox_targets, + bbox_weights, + beta=cfg.smoothl1_beta, + ave_factor=num_total_samples) + return loss_cls, loss_reg + + def loss(self, rpn_cls_scores, rpn_bbox_preds, gt_bboxes, img_shapes, cfg): + featmap_sizes = [featmap.size()[-2:] for featmap in rpn_cls_scores] + assert len(featmap_sizes) == len(self.anchor_generators) + + anchor_list, valid_flag_list = self.get_anchors( + featmap_sizes, img_shapes) + cls_reg_targets = anchor_target( + anchor_list, valid_flag_list, featmap_sizes, gt_bboxes, img_shapes, + self.target_means, self.target_stds, cfg) + if cls_reg_targets is None: + return None + (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, + num_total_samples) = cls_reg_targets + losses_cls, losses_reg = multi_apply( + self.loss_single, + rpn_cls_scores, + rpn_bbox_preds, + labels_list, + label_weights_list, + bbox_targets_list, + bbox_weights_list, + num_total_samples=num_total_samples, + cfg=cfg) + return dict(loss_rpn_cls=losses_cls, loss_rpn_reg=losses_reg) + + def get_proposals(self, rpn_cls_scores, rpn_bbox_preds, img_shapes, cfg): + img_per_gpu = len(img_shapes) + featmap_sizes = [featmap.size()[-2:] for featmap in rpn_cls_scores] + mlvl_anchors = [ + self.anchor_generators[idx].grid_anchors(featmap_sizes[idx], + self.anchor_strides[idx]) + for idx in range(len(featmap_sizes)) + ] + proposal_list = [] + for img_id in range(img_per_gpu): + rpn_cls_score_list = [ + 
rpn_cls_scores[idx][img_id].detach() + for idx in range(len(rpn_cls_scores)) + ] + rpn_bbox_pred_list = [ + rpn_bbox_preds[idx][img_id].detach() + for idx in range(len(rpn_bbox_preds)) + ] + assert len(rpn_cls_score_list) == len(rpn_bbox_pred_list) + img_shape = img_shapes[img_id] + proposals = self._get_proposals_single( + rpn_cls_score_list, rpn_bbox_pred_list, mlvl_anchors, + img_shape, cfg) + proposal_list.append(proposals) + return proposal_list + + def _get_proposals_single(self, rpn_cls_scores, rpn_bbox_preds, + mlvl_anchors, img_shape, cfg): + mlvl_proposals = [] + for idx in range(len(rpn_cls_scores)): + rpn_cls_score = rpn_cls_scores[idx] + rpn_bbox_pred = rpn_bbox_preds[idx] + assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:] + anchors = mlvl_anchors[idx] + if self.use_sigmoid_cls: + rpn_cls_score = rpn_cls_score.permute(1, 2, + 0).contiguous().view(-1) + rpn_cls_prob = F.sigmoid(rpn_cls_score) + scores = rpn_cls_prob + else: + rpn_cls_score = rpn_cls_score.permute(1, 2, + 0).contiguous().view( + -1, 2) + rpn_cls_prob = F.softmax(rpn_cls_score, dim=1) + scores = rpn_cls_prob[:, 1] + rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).contiguous().view( + -1, 4) + _, order = scores.sort(0, descending=True) + if cfg.nms_pre > 0: + order = order[:cfg.nms_pre] + rpn_bbox_pred = rpn_bbox_pred[order, :] + anchors = anchors[order, :] + scores = scores[order] + proposals = bbox_transform_inv(anchors, rpn_bbox_pred, + self.target_means, self.target_stds, + img_shape) + w = proposals[:, 2] - proposals[:, 0] + 1 + h = proposals[:, 3] - proposals[:, 1] + 1 + valid_inds = torch.nonzero((w >= cfg.min_bbox_size) & + (h >= cfg.min_bbox_size)).squeeze() + proposals = proposals[valid_inds, :] + scores = scores[valid_inds] + proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1) + nms_keep = nms(proposals, cfg.nms_thr)[:cfg.nms_post] + proposals = proposals[nms_keep, :] + mlvl_proposals.append(proposals) + proposals = torch.cat(mlvl_proposals, 0) + if cfg.nms_across_levels: + nms_keep = nms(proposals, cfg.nms_thr)[:cfg.max_num] + proposals = proposals[nms_keep, :] + else: + scores = proposals[:, 4] + _, order = scores.sort(0, descending=True) + num = min(cfg.max_num, proposals.shape[0]) + order = order[:num] + proposals = proposals[order, :] + return proposals diff --git a/mmdet/models/weight_init.py b/mmdet/models/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..2e9b13b4fbc17d6d1986da876108c1a813190c2d --- /dev/null +++ b/mmdet/models/weight_init.py @@ -0,0 +1,39 @@ +import torch.nn as nn + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def uniform_init(module, a=0, b=1, bias=0): + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, 
mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias'): + nn.init.constant_(module.bias, bias) diff --git a/mmdet/nn/__init__.py b/mmdet/nn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1b627f5e7b807b1c6ae321c775c8fc8d03266238 --- /dev/null +++ b/mmdet/nn/__init__.py @@ -0,0 +1 @@ +from .parallel import MMDataParallel, MMDistributedDataParallel diff --git a/mmdet/nn/parallel/__init__.py b/mmdet/nn/parallel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ea0a58e4a53737372b7995f3f9d570cba50dddb --- /dev/null +++ b/mmdet/nn/parallel/__init__.py @@ -0,0 +1,7 @@ +from .data_parallel import MMDataParallel +from .distributed import MMDistributedDataParallel +from .scatter_gather import scatter, scatter_kwargs + +__all__ = [ + 'MMDataParallel', 'MMDistributedDataParallel', 'scatter', 'scatter_kwargs' +] diff --git a/mmdet/nn/parallel/_functions.py b/mmdet/nn/parallel/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..75bb954dce440f7634c47d4a021360df53f3509e --- /dev/null +++ b/mmdet/nn/parallel/_functions.py @@ -0,0 +1,74 @@ +import torch +from torch.nn.parallel._functions import _get_stream + + +def scatter(input, devices, streams=None): + """Scatters tensor across multiple GPUs. + """ + if streams is None: + streams = [None] * len(devices) + + if isinstance(input, list): + chunk_size = (len(input) - 1) // len(devices) + 1 + outputs = [ + scatter(input[i], [devices[i // chunk_size]], + [streams[i // chunk_size]]) for i in range(len(input)) + ] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + # TODO: copy to a pinned buffer first (if copying from CPU) + stream = streams[0] if output.numel() > 0 else None + with torch.cuda.device(devices[0]), torch.cuda.stream(stream): + output = output.cuda(devices[0], non_blocking=True) + return output + else: + raise Exception('Unknown type {}.'.format(type(input))) + + +def synchronize_stream(output, devices, streams): + if isinstance(output, list): + chunk_size = len(output) // len(devices) + for i in range(len(devices)): + for j in range(chunk_size): + synchronize_stream(output[i * chunk_size + j], [devices[i]], + [streams[i]]) + elif isinstance(output, torch.Tensor): + if output.numel() != 0: + with torch.cuda.device(devices[0]): + main_stream = torch.cuda.current_stream() + main_stream.wait_stream(streams[0]) + output.record_stream(main_stream) + else: + raise Exception('Unknown type {}.'.format(type(output))) + + +def get_input_device(input): + if isinstance(input, list): + for item in input: + input_device = get_input_device(item) + if input_device != -1: + return input_device + return -1 + elif isinstance(input, torch.Tensor): + return input.get_device() if input.is_cuda else -1 + else: + raise Exception('Unknown type {}.'.format(type(input))) + + +class Scatter(object): + + @staticmethod + def forward(target_gpus, input): + input_device = get_input_device(input) + streams = None + if input_device == -1: + # Perform CPU to GPU copies in a background stream + streams = [_get_stream(device) for device in target_gpus] + + outputs = scatter(input, target_gpus, streams) + # Synchronize with the copy stream + if streams is not None: + synchronize_stream(outputs, target_gpus, streams) + + return tuple(outputs) diff --git a/mmdet/nn/parallel/data_parallel.py b/mmdet/nn/parallel/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..6735cb4afb7b512c5e9f757e962612ad1073ae12 --- 
/dev/null +++ b/mmdet/nn/parallel/data_parallel.py @@ -0,0 +1,9 @@ +from torch.nn.parallel import DataParallel + +from .scatter_gather import scatter_kwargs + + +class MMDataParallel(DataParallel): + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) diff --git a/mmdet/nn/parallel/distributed.py b/mmdet/nn/parallel/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..2809778ad93951650677a546b57190cb7659302d --- /dev/null +++ b/mmdet/nn/parallel/distributed.py @@ -0,0 +1,9 @@ +from torch.nn.parallel import DistributedDataParallel + +from .scatter_gather import scatter_kwargs + + +class MMDistributedDataParallel(DistributedDataParallel): + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) diff --git a/mmdet/nn/parallel/scatter_gather.py b/mmdet/nn/parallel/scatter_gather.py new file mode 100644 index 0000000000000000000000000000000000000000..82511fd1db12774e1df1468e93353f2a963ed962 --- /dev/null +++ b/mmdet/nn/parallel/scatter_gather.py @@ -0,0 +1,48 @@ +import torch +from ._functions import Scatter +from torch.nn.parallel._functions import Scatter as OrigScatter +from detkit.datasets.utils import DataContainer + + +def scatter(inputs, target_gpus, dim=0): + """Scatter inputs to target gpus. + + The only difference from original :func:`scatter` is to add support for + :type:`~mmdet.DataContainer`. + """ + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + return OrigScatter.apply(target_gpus, None, dim, obj) + if isinstance(obj, DataContainer) and isinstance(obj.data, list): + return Scatter.forward(target_gpus, obj.data) + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + return list(map(list, zip(*map(scatter_map, obj)))) + if isinstance(obj, dict) and len(obj) > 0: + return list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return [obj for targets in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). 
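Note: MMDataParallel and MMDistributedDataParallel above change nothing but the scatter hook, so they are drop-in replacements for the stock torch wrappers; the hook exists so DataContainer fields, which hold a pre-chunked list with one chunk per GPU, are copied across devices instead of being stacked and sliced. A usage sketch, where detector, img, img_meta and gt_bboxes are hypothetical placeholders standing in for the detectors defined earlier in this patch:

    from mmdet.nn import MMDataParallel

    # Plain tensors in the batch are sliced along dim 0 as usual;
    # DataContainer-wrapped metadata is scattered chunk-by-chunk by the
    # custom Scatter defined above.
    model = MMDataParallel(detector.cuda(), device_ids=[0, 1])
    losses = model(img, img_meta, gt_bboxes=gt_bboxes)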
To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0): + """Scatter with support for kwargs dictionary""" + inputs = scatter(inputs, target_gpus, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52e5808016cb94e63a7501cef7b1292805eb3491 --- /dev/null +++ b/mmdet/ops/__init__.py @@ -0,0 +1,3 @@ +from .nms import nms, soft_nms +from .roi_align import RoIAlign, roi_align +from .roi_pool import RoIPool, roi_pool diff --git a/mmdet/ops/nms/.gitignore b/mmdet/ops/nms/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ce1da4c53c0301615c1f0ba3b01a859ad68259cb --- /dev/null +++ b/mmdet/ops/nms/.gitignore @@ -0,0 +1 @@ +*.cpp diff --git a/mmdet/ops/nms/Makefile b/mmdet/ops/nms/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..39556dd28ba76300d0f491cd5e66d4a4d19fc8ee --- /dev/null +++ b/mmdet/ops/nms/Makefile @@ -0,0 +1,8 @@ +PYTHON=${PYTHON:-python} + +all: + echo "Compiling nms kernels..." + $(PYTHON) setup.py build_ext --inplace + +clean: + rm *.so diff --git a/mmdet/ops/nms/__init__.py b/mmdet/ops/nms/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1cf8569b97b3a568458428776b1dbd6737882389 --- /dev/null +++ b/mmdet/ops/nms/__init__.py @@ -0,0 +1 @@ +from .nms_wrapper import nms, soft_nms diff --git a/mmdet/ops/nms/cpu_nms.pyx b/mmdet/ops/nms/cpu_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..1d0bef3321d78fc73556906649ab61eaaea60d86 --- /dev/null +++ b/mmdet/ops/nms/cpu_nms.pyx @@ -0,0 +1,68 @@ +# -------------------------------------------------------- +# Fast R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +cdef inline np.float32_t max(np.float32_t a, np.float32_t b): + return a if a >= b else b + +cdef inline np.float32_t min(np.float32_t a, np.float32_t b): + return a if a <= b else b + +def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): + cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] + cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] + cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] + cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] + cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] + + cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) + cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] + + cdef int ndets = dets.shape[0] + cdef np.ndarray[np.int_t, ndim=1] suppressed = \ + np.zeros((ndets), dtype=np.int) + + # nominal indices + cdef int _i, _j + # sorted indices + cdef int i, j + # temp variables for box i's (the box currently under consideration) + cdef np.float32_t ix1, iy1, ix2, iy2, iarea + # variables for computing overlap with box j (lower scoring box) + cdef np.float32_t xx1, yy1, xx2, yy2 + cdef np.float32_t w, h + cdef 
np.float32_t inter, ovr + + keep = [] + for _i in range(ndets): + i = order[_i] + if suppressed[i] == 1: + continue + keep.append(i) + ix1 = x1[i] + iy1 = y1[i] + ix2 = x2[i] + iy2 = y2[i] + iarea = areas[i] + for _j in range(_i + 1, ndets): + j = order[_j] + if suppressed[j] == 1: + continue + xx1 = max(ix1, x1[j]) + yy1 = max(iy1, y1[j]) + xx2 = min(ix2, x2[j]) + yy2 = min(iy2, y2[j]) + w = max(0.0, xx2 - xx1 + 1) + h = max(0.0, yy2 - yy1 + 1) + inter = w * h + ovr = inter / (iarea + areas[j] - inter) + if ovr >= thresh: + suppressed[j] = 1 + + return keep diff --git a/mmdet/ops/nms/cpu_soft_nms.pyx b/mmdet/ops/nms/cpu_soft_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..05ec5a5446221d3593a10edfd4d714bfa6192309 --- /dev/null +++ b/mmdet/ops/nms/cpu_soft_nms.pyx @@ -0,0 +1,123 @@ +# ---------------------------------------------------------- +# Soft-NMS: Improving Object Detection With One Line of Code +# Copyright (c) University of Maryland, College Park +# Licensed under The MIT License [see LICENSE for details] +# Written by Navaneeth Bodla and Bharat Singh +# ---------------------------------------------------------- + +import numpy as np +cimport numpy as np + + +cdef inline np.float32_t max(np.float32_t a, np.float32_t b): + return a if a >= b else b + +cdef inline np.float32_t min(np.float32_t a, np.float32_t b): + return a if a <= b else b + +def cpu_soft_nms( + np.ndarray[float, ndim=2] boxes_in, + float sigma=0.5, + float Nt=0.3, + float threshold=0.001, + unsigned int method=0 +): + boxes = boxes_in.copy() + cdef unsigned int N = boxes.shape[0] + cdef float iw, ih, box_area + cdef float ua + cdef int pos = 0 + cdef float maxscore = 0 + cdef int maxpos = 0 + cdef float x1, x2, y1, y2, tx1, tx2, ty1, ty2, ts, area, weight, ov + inds = np.arange(N) + + for i in range(N): + maxscore = boxes[i, 4] + maxpos = i + + tx1 = boxes[i,0] + ty1 = boxes[i,1] + tx2 = boxes[i,2] + ty2 = boxes[i,3] + ts = boxes[i,4] + ti = inds[i] + + pos = i + 1 + # get max box + while pos < N: + if maxscore < boxes[pos, 4]: + maxscore = boxes[pos, 4] + maxpos = pos + pos = pos + 1 + + # add max box as a detection + boxes[i,0] = boxes[maxpos,0] + boxes[i,1] = boxes[maxpos,1] + boxes[i,2] = boxes[maxpos,2] + boxes[i,3] = boxes[maxpos,3] + boxes[i,4] = boxes[maxpos,4] + inds[i] = inds[maxpos] + + # swap ith box with position of max box + boxes[maxpos,0] = tx1 + boxes[maxpos,1] = ty1 + boxes[maxpos,2] = tx2 + boxes[maxpos,3] = ty2 + boxes[maxpos,4] = ts + inds[maxpos] = ti + + tx1 = boxes[i,0] + ty1 = boxes[i,1] + tx2 = boxes[i,2] + ty2 = boxes[i,3] + ts = boxes[i,4] + + pos = i + 1 + # NMS iterations, note that N changes if detection boxes fall below + # threshold + while pos < N: + x1 = boxes[pos, 0] + y1 = boxes[pos, 1] + x2 = boxes[pos, 2] + y2 = boxes[pos, 3] + s = boxes[pos, 4] + + area = (x2 - x1 + 1) * (y2 - y1 + 1) + iw = (min(tx2, x2) - max(tx1, x1) + 1) + if iw > 0: + ih = (min(ty2, y2) - max(ty1, y1) + 1) + if ih > 0: + ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) + ov = iw * ih / ua #iou between max box and detection box + + if method == 1: # linear + if ov > Nt: + weight = 1 - ov + else: + weight = 1 + elif method == 2: # gaussian + weight = np.exp(-(ov * ov)/sigma) + else: # original NMS + if ov > Nt: + weight = 0 + else: + weight = 1 + + boxes[pos, 4] = weight*boxes[pos, 4] + + # if box score falls below threshold, discard the box by + # swapping with last box update N + if boxes[pos, 4] < threshold: + boxes[pos,0] = boxes[N-1, 0] + boxes[pos,1] = 
boxes[N-1, 1] + boxes[pos,2] = boxes[N-1, 2] + boxes[pos,3] = boxes[N-1, 3] + boxes[pos,4] = boxes[N-1, 4] + inds[pos] = inds[N-1] + N = N - 1 + pos = pos - 1 + + pos = pos + 1 + + return boxes[:N], inds[:N] \ No newline at end of file diff --git a/mmdet/ops/nms/gpu_nms.hpp b/mmdet/ops/nms/gpu_nms.hpp new file mode 100644 index 0000000000000000000000000000000000000000..2d45e344aeb93c00262f98153dd3e1300a9adcce --- /dev/null +++ b/mmdet/ops/nms/gpu_nms.hpp @@ -0,0 +1,3 @@ +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id, size_t base); +size_t nms_Malloc(); diff --git a/mmdet/ops/nms/gpu_nms.pyx b/mmdet/ops/nms/gpu_nms.pyx new file mode 100644 index 0000000000000000000000000000000000000000..e5ae72578731c38150bf0c79866fcabfcb936ceb --- /dev/null +++ b/mmdet/ops/nms/gpu_nms.pyx @@ -0,0 +1,43 @@ +# -------------------------------------------------------- +# Faster R-CNN +# Copyright (c) 2015 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ross Girshick +# -------------------------------------------------------- + +import numpy as np +cimport numpy as np + +assert sizeof(int) == sizeof(np.int32_t) + +cdef extern from "gpu_nms.hpp": + void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int, size_t) nogil + size_t nms_Malloc() nogil + +memory_pool = {} + +def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, + np.int32_t device_id=0): + cdef int boxes_num = dets.shape[0] + cdef int boxes_dim = dets.shape[1] + cdef int num_out + cdef size_t base + cdef np.ndarray[np.int32_t, ndim=1] \ + keep = np.zeros(boxes_num, dtype=np.int32) + cdef np.ndarray[np.float32_t, ndim=1] \ + scores = dets[:, 4] + cdef np.ndarray[np.int_t, ndim=1] \ + order = scores.argsort()[::-1] + cdef np.ndarray[np.float32_t, ndim=2] \ + sorted_dets = dets[order, :] + cdef float cthresh = thresh + if device_id not in memory_pool: + with nogil: + base = nms_Malloc() + memory_pool[device_id] = base + # print "malloc", base + base = memory_pool[device_id] + with nogil: + _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, cthresh, device_id, base) + keep = keep[:num_out] + return list(order[keep]) diff --git a/mmdet/ops/nms/nms_kernel.cu b/mmdet/ops/nms/nms_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..4c5f0ec5e1096260e57ff314074f9c36da0a4e72 --- /dev/null +++ b/mmdet/ops/nms/nms_kernel.cu @@ -0,0 +1,188 @@ +// ------------------------------------------------------------------ +// Faster R-CNN +// Copyright (c) 2015 Microsoft +// Licensed under The MIT License [see fast-rcnn/LICENSE for details] +// Written by Shaoqing Ren +// ------------------------------------------------------------------ + +#include <stdio.h> +#include <iostream> +#include <vector> +#include "gpu_nms.hpp" + +#define CUDA_CHECK(condition) \ + /* Code block avoids redefinition of cudaError_t error */ \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + std::cout << cudaGetErrorString(error) << std::endl; \ + } \ + } while (0) + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) +#define MULTIPLIER 16 +#define LONGLONG_SIZE 64 + +int const threadsPerBlock = + sizeof(unsigned long long) * 8 * + MULTIPLIER; // number of bits for a long long variable + +__device__ inline float devIoU(float const* const a, float const* const b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float 
width = max(right - left + 1, 0.f), + height = max(bottom - top + 1, 0.f); + float interS = width * height; + float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return interS / (Sa + Sb - interS); +} + +__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, + const float* dev_boxes, + unsigned long long* dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + // if (row_start > col_start) return; + + const int row_size = + min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); + const int col_size = + min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); + + __shared__ float block_boxes[threadsPerBlock * 5]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 5 + 0] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; + block_boxes[threadIdx.x * 5 + 1] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; + block_boxes[threadIdx.x * 5 + 2] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; + block_boxes[threadIdx.x * 5 + 3] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; + block_boxes[threadIdx.x * 5 + 4] = + dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; + } + __syncthreads(); + + unsigned long long ts[MULTIPLIER]; + + if (threadIdx.x < row_size) { +#pragma unroll + for (int i = 0; i < MULTIPLIER; ++i) { + ts[i] = 0; + } + const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; + const float* cur_box = dev_boxes + cur_box_idx * 5; + int i = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { + ts[i / LONGLONG_SIZE] |= 1ULL << (i % LONGLONG_SIZE); + } + } + const int col_blocks = DIVUP(n_boxes, threadsPerBlock); + +#pragma unroll + for (int i = 0; i < MULTIPLIER; ++i) { + dev_mask[(cur_box_idx * col_blocks + col_start) * MULTIPLIER + i] = + ts[i]; + } + } +} + +void _set_device(int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) { + return; + } + // The call to cudaSetDevice must come before any calls to Get, which + // may perform initialization using the GPU. 
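Note: some arithmetic behind the kernel above: with MULTIPLIER=16, each block covers threadsPerBlock = 64 * 16 = 1024 boxes, and each box's row of dev_mask stores DIVUP(n_boxes, 1024) groups of 16 long longs, one bit per pairwise suppression decision. The IoU test itself treats coordinates as inclusive pixel indices (the +1 terms); the same convention in plain Python, for checking:

    def dev_iou(a, b):
        # Mirrors devIoU in nms_kernel.cu, inclusive +1 convention included.
        left, right = max(a[0], b[0]), min(a[2], b[2])
        top, bottom = max(a[1], b[1]), min(a[3], b[3])
        width = max(right - left + 1, 0.)
        height = max(bottom - top + 1, 0.)
        inter = width * height
        area_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1)
        area_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1)
        return inter / (area_a + area_b - inter)

    print(dev_iou([0, 0, 9, 9], [5, 0, 14, 9]))  # 1/3: 5x10 overlap, two 10x10 boxes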
+ CUDA_CHECK(cudaSetDevice(device_id)); +} + +const size_t MEMORY_SIZE = 500000000; +size_t nms_Malloc() { + float* boxes_dev = NULL; + CUDA_CHECK(cudaMalloc(&boxes_dev, MEMORY_SIZE)); + return size_t(boxes_dev); +} + +void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, + int boxes_dim, float nms_overlap_thresh, int device_id, size_t base) { + _set_device(device_id); + + float* boxes_dev = NULL; + unsigned long long* mask_dev = NULL; + + const int col_blocks = DIVUP(boxes_num, threadsPerBlock); + + if (base > 0) { + size_t require_mem = + boxes_num * boxes_dim * sizeof(float) + + boxes_num * col_blocks * sizeof(unsigned long long) * MULTIPLIER; + if (require_mem >= MEMORY_SIZE) { + std::cout << "require_mem: " << require_mem << std::endl; + } + boxes_dev = (float*)(base); + mask_dev = + (unsigned long long*)(base + + 512 * ((unsigned long long)(boxes_num * + boxes_dim * + sizeof(float) / + 512) + + 1)); + } else { + CUDA_CHECK( + cudaMalloc(&boxes_dev, boxes_num * boxes_dim * sizeof(float))); + CUDA_CHECK(cudaMalloc(&mask_dev, MULTIPLIER * boxes_num * col_blocks * + sizeof(unsigned long long))); + } + CUDA_CHECK(cudaMemcpy(boxes_dev, boxes_host, + boxes_num * boxes_dim * sizeof(float), + cudaMemcpyHostToDevice)); + + dim3 blocks(DIVUP(boxes_num, threadsPerBlock), + DIVUP(boxes_num, threadsPerBlock)); + dim3 threads(threadsPerBlock); + nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes_dev, + mask_dev); + + std::vector<unsigned long long> mask_host(boxes_num * col_blocks * + MULTIPLIER); + CUDA_CHECK(cudaMemcpy( + &mask_host[0], mask_dev, + sizeof(unsigned long long) * boxes_num * col_blocks * MULTIPLIER, + cudaMemcpyDeviceToHost)); + + std::vector<unsigned long long> remv(col_blocks * MULTIPLIER); + memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks * MULTIPLIER); + + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / threadsPerBlock; + int inblock = i % threadsPerBlock; + int offset = inblock / LONGLONG_SIZE; + int bit_pos = inblock % LONGLONG_SIZE; + + if (!(remv[nblock * MULTIPLIER + offset] & (1ULL << bit_pos))) { + keep_out[num_to_keep++] = i; + unsigned long long* p = &mask_host[0] + i * col_blocks * MULTIPLIER; + for (int j = nblock * MULTIPLIER + offset; + j < col_blocks * MULTIPLIER; j++) { + remv[j] |= p[j]; + } + } + } + *num_out = num_to_keep; + + if (!base) { + CUDA_CHECK(cudaFree(boxes_dev)); + CUDA_CHECK(cudaFree(mask_dev)); + } +} diff --git a/mmdet/ops/nms/nms_wrapper.py b/mmdet/ops/nms/nms_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..43d5e5c6e5c038467f2084d46d85b97bb2a943f1 --- /dev/null +++ b/mmdet/ops/nms/nms_wrapper.py @@ -0,0 +1,46 @@ +import numpy as np +import torch + +from .gpu_nms import gpu_nms +from .cpu_nms import cpu_nms +from .cpu_soft_nms import cpu_soft_nms + + +def nms(dets, thresh, device_id=None): + """Dispatch to either CPU or GPU NMS implementations.""" + + if isinstance(dets, torch.Tensor): + if dets.is_cuda: + device_id = dets.get_device() + dets = dets.detach().cpu().numpy() + assert isinstance(dets, np.ndarray) + + if dets.shape[0] == 0: + inds = [] + else: + inds = (gpu_nms(dets, thresh, device_id=device_id) + if device_id is not None else cpu_nms(dets, thresh)) + + if isinstance(dets, torch.Tensor): + return dets.new_tensor(inds, dtype=torch.long) + else: + return np.array(inds, dtype=np.int) + + +def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0): + if isinstance(dets, torch.Tensor): + _dets = 
+
+
+def soft_nms(dets, Nt=0.3, method=1, sigma=0.5, min_score=0):
+    if isinstance(dets, torch.Tensor):
+        _dets = dets.detach().cpu().numpy()
+    else:
+        _dets = dets.copy()
+    assert isinstance(_dets, np.ndarray)
+
+    new_dets, inds = cpu_soft_nms(
+        _dets, Nt=Nt, method=method, sigma=sigma, threshold=min_score)
+
+    if isinstance(dets, torch.Tensor):
+        return dets.new_tensor(
+            inds, dtype=torch.long), dets.new_tensor(new_dets)
+    else:
+        return np.array(
+            inds, dtype=np.int), np.array(
+                new_dets, dtype=np.float32)
diff --git a/mmdet/ops/nms/setup.py b/mmdet/ops/nms/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..98bf57c8f135805927205ec638d865177b070d8c
--- /dev/null
+++ b/mmdet/ops/nms/setup.py
@@ -0,0 +1,91 @@
+import os
+from distutils.core import setup
+from distutils.extension import Extension
+
+import numpy as np
+from Cython.Build import cythonize
+from Cython.Distutils import build_ext
+
+CUDA_ROOT = '/usr/local/cuda'
+CUDA = {
+    "include": os.path.join(CUDA_ROOT, 'include'),
+    "lib": os.path.join(CUDA_ROOT, 'lib64'),
+    "nvcc": os.path.join(CUDA_ROOT, 'bin', "nvcc")
+}
+
+inc_dirs = [CUDA['include'], np.get_include()]
+
+lib_dirs = [CUDA['lib']]
+
+# extensions
+ext_args = dict(
+    include_dirs=inc_dirs,
+    library_dirs=lib_dirs,
+    language='c++',
+    libraries=['cudart'],
+    extra_compile_args={
+        "cc": ['-Wno-unused-function', '-Wno-write-strings'],
+        "nvcc": [
+            '-arch=sm_52', '--ptxas-options=-v', '-c', '--compiler-options',
+            '-fPIC'
+        ],
+    },
+)
+
+extensions = [
+    Extension('cpu_nms', ['cpu_nms.pyx'], **ext_args),
+    Extension('gpu_nms', ['gpu_nms.pyx', 'nms_kernel.cu'], **ext_args),
+    Extension('cpu_soft_nms', ['cpu_soft_nms.pyx'], **ext_args),
+]
+
+
+def customize_compiler_for_nvcc(self):
+    """Inject deep into distutils to customize how the dispatch
+    to cc/nvcc works.
+    If you subclass UnixCCompiler, it's not trivial to get your subclass
+    injected in, and still have the right customizations (i.e.
+    distutils.sysconfig.customize_compiler) run on it. So instead of going
+    the OO route, I have this. Note, it's kind of like a weird functional
+    subclassing going on."""
+
+    # tell the compiler it can process .cu source files
+    self.src_extensions.append('.cu')
+
+    # save references to the default compiler_so and _compile methods
+    default_compiler_so = self.compiler_so
+    super = self._compile
+
+    # now redefine the _compile method. This gets executed for each
+    # object but distutils doesn't have the ability to change compilers
+    # based on source extension: we add it.
+ def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + if os.path.splitext(src)[1] == '.cu': + # use the cuda for .cu files + self.set_executable('compiler_so', CUDA['nvcc']) + # use only a subset of the extra_postargs, which are 1-1 translated + # from the extra_compile_args in the Extension class + postargs = extra_postargs['nvcc'] + else: + postargs = extra_postargs['cc'] + + super(obj, src, ext, cc_args, postargs, pp_opts) + # reset the default compiler_so, which we might have changed for cuda + self.compiler_so = default_compiler_so + + # inject our redefined _compile method into the class + self._compile = _compile + + +# run the customize_compiler +class custom_build_ext(build_ext): + + def build_extensions(self): + customize_compiler_for_nvcc(self.compiler) + build_ext.build_extensions(self) + + +setup( + name='nms', + cmdclass={'build_ext': custom_build_ext}, + ext_modules=cythonize(extensions), +) diff --git a/mmdet/ops/roi_align/__init__.py b/mmdet/ops/roi_align/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ae27e21d6c78e9ffd8d13e8c71017ef6f365fb5e --- /dev/null +++ b/mmdet/ops/roi_align/__init__.py @@ -0,0 +1,2 @@ +from .functions.roi_align import roi_align +from .modules.roi_align import RoIAlign diff --git a/mmdet/ops/roi_align/functions/__init__.py b/mmdet/ops/roi_align/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mmdet/ops/roi_align/functions/roi_align.py b/mmdet/ops/roi_align/functions/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..0e546fe59527570a2331f6f79bb6113f1cc1abb9 --- /dev/null +++ b/mmdet/ops/roi_align/functions/roi_align.py @@ -0,0 +1,61 @@ +from torch.autograd import Function, Variable + +from .. 
import roi_align_cuda + + +class RoIAlignFunction(Function): + + @staticmethod + def forward(ctx, features, rois, out_size, spatial_scale, sample_num=0): + if isinstance(out_size, int): + out_h = out_size + out_w = out_size + elif isinstance(out_size, tuple): + assert len(out_size) == 2 + assert isinstance(out_size[0], int) + assert isinstance(out_size[1], int) + out_h, out_w = out_size + else: + raise TypeError( + '"out_size" must be an integer or tuple of integers') + ctx.spatial_scale = spatial_scale + ctx.sample_num = sample_num + ctx.save_for_backward(rois) + ctx.feature_size = features.size() + + batch_size, num_channels, data_height, data_width = features.size() + num_rois = rois.size(0) + + output = features.new_zeros(num_rois, num_channels, out_h, out_w) + if features.is_cuda: + roi_align_cuda.forward(features, rois, out_h, out_w, spatial_scale, + sample_num, output) + else: + raise NotImplementedError + + return output + + @staticmethod + def backward(ctx, grad_output): + feature_size = ctx.feature_size + spatial_scale = ctx.spatial_scale + sample_num = ctx.sample_num + rois = ctx.saved_tensors[0] + assert (feature_size is not None and grad_output.is_cuda) + + batch_size, num_channels, data_height, data_width = feature_size + out_w = grad_output.size(3) + out_h = grad_output.size(2) + + grad_input = grad_rois = None + if ctx.needs_input_grad[0]: + grad_input = Variable( + rois.new(batch_size, num_channels, data_height, data_width) + .zero_()) + roi_align_cuda.backward(grad_output, rois, out_h, out_w, + spatial_scale, sample_num, grad_input) + + return grad_input, grad_rois, None, None, None + + +roi_align = RoIAlignFunction.apply diff --git a/mmdet/ops/roi_align/gradcheck.py b/mmdet/ops/roi_align/gradcheck.py new file mode 100644 index 0000000000000000000000000000000000000000..e2c51e64bb7b5eba9da3087d83cfa1083f965bbc --- /dev/null +++ b/mmdet/ops/roi_align/gradcheck.py @@ -0,0 +1,29 @@ +import numpy as np +import torch +from torch.autograd import gradcheck + +import os.path as osp +import sys +sys.path.append(osp.abspath(osp.join(__file__, '../../'))) +from roi_align import RoIAlign + +feat_size = 15 +spatial_scale = 1.0 / 8 +img_size = feat_size / spatial_scale +num_imgs = 2 +num_rois = 20 + +batch_ind = np.random.randint(num_imgs, size=(num_rois, 1)) +rois = np.random.rand(num_rois, 4) * img_size * 0.5 +rois[:, 2:] += img_size * 0.5 +rois = np.hstack((batch_ind, rois)) + +feat = torch.randn( + num_imgs, 16, feat_size, feat_size, requires_grad=True, device='cuda:0') +rois = torch.from_numpy(rois).float().cuda() +inputs = (feat, rois) +print('Gradcheck for roi align...') +test = gradcheck(RoIAlign(3, spatial_scale), inputs, atol=1e-3, eps=1e-3) +print(test) +test = gradcheck(RoIAlign(3, spatial_scale, 2), inputs, atol=1e-3, eps=1e-3) +print(test) diff --git a/mmdet/ops/roi_align/modules/__init__.py b/mmdet/ops/roi_align/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mmdet/ops/roi_align/modules/roi_align.py b/mmdet/ops/roi_align/modules/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..b83b74e6b7c151eaf627c2b6d3530823ce8cda05 --- /dev/null +++ b/mmdet/ops/roi_align/modules/roi_align.py @@ -0,0 +1,16 @@ +from torch.nn.modules.module import Module +from ..functions.roi_align import RoIAlignFunction + + +class RoIAlign(Module): + + def __init__(self, out_size, spatial_scale, sample_num=0): + super(RoIAlign, self).__init__() + + self.out_size = out_size + 
self.spatial_scale = float(spatial_scale) + self.sample_num = int(sample_num) + + def forward(self, features, rois): + return RoIAlignFunction.apply(features, rois, self.out_size, + self.spatial_scale, self.sample_num) diff --git a/mmdet/ops/roi_align/setup.py b/mmdet/ops/roi_align/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..f02a5ea30d66f51761038c7802d948f039871c8c --- /dev/null +++ b/mmdet/ops/roi_align/setup.py @@ -0,0 +1,12 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup( + name='roi_align_cuda', + ext_modules=[ + CUDAExtension('roi_align_cuda', [ + 'src/roi_align_cuda.cpp', + 'src/roi_align_kernel.cu', + ]), + ], + cmdclass={'build_ext': BuildExtension}) diff --git a/mmdet/ops/roi_align/src/roi_align_cuda.cpp b/mmdet/ops/roi_align/src/roi_align_cuda.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e4c28c142268d4caf3ff2800dcfe9b24e8e99c66 --- /dev/null +++ b/mmdet/ops/roi_align/src/roi_align_cuda.cpp @@ -0,0 +1,85 @@ +#include <torch/torch.h> + +#include <cmath> +#include <vector> + +int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, + const float spatial_scale, const int sample_num, + const int channels, const int height, + const int width, const int num_rois, + const int pooled_height, const int pooled_width, + at::Tensor output); + +int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, + const float spatial_scale, const int sample_num, + const int channels, const int height, + const int width, const int num_rois, + const int pooled_height, const int pooled_width, + at::Tensor bottom_grad); + +#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDAtensor ") +#define CHECK_CONTIGUOUS(x) \ + AT_ASSERT(x.is_contiguous(), #x " must be contiguous ") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ + CHECK_CONTIGUOUS(x) + +int roi_align_forward_cuda(at::Tensor features, at::Tensor rois, + int pooled_height, int pooled_width, + float spatial_scale, int sample_num, + at::Tensor output) { + CHECK_INPUT(features); + CHECK_INPUT(rois); + CHECK_INPUT(output); + + // Number of ROIs + int num_rois = rois.size(0); + int size_rois = rois.size(1); + + if (size_rois != 5) { + printf("wrong roi size\n"); + return 0; + } + + int num_channels = features.size(1); + int data_height = features.size(2); + int data_width = features.size(3); + + ROIAlignForwardLaucher(features, rois, spatial_scale, sample_num, + num_channels, data_height, data_width, num_rois, + pooled_height, pooled_width, output); + + return 1; +} + +int roi_align_backward_cuda(at::Tensor top_grad, at::Tensor rois, + int pooled_height, int pooled_width, + float spatial_scale, int sample_num, + at::Tensor bottom_grad) { + CHECK_INPUT(top_grad); + CHECK_INPUT(rois); + CHECK_INPUT(bottom_grad); + + // Number of ROIs + int num_rois = rois.size(0); + int size_rois = rois.size(1); + if (size_rois != 5) { + printf("wrong roi size\n"); + return 0; + } + + int num_channels = bottom_grad.size(1); + int data_height = bottom_grad.size(2); + int data_width = bottom_grad.size(3); + + ROIAlignBackwardLaucher(top_grad, rois, spatial_scale, sample_num, + num_channels, data_height, data_width, num_rois, + pooled_height, pooled_width, bottom_grad); + + return 1; +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &roi_align_forward_cuda, "Roi_Align forward (CUDA)"); + m.def("backward", &roi_align_backward_cuda, "Roi_Align backward (CUDA)"); +} diff --git 
a/mmdet/ops/roi_align/src/roi_align_kernel.cu b/mmdet/ops/roi_align/src/roi_align_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..31be093c038872ff0b48c79157e5048d25a416cf --- /dev/null +++ b/mmdet/ops/roi_align/src/roi_align_kernel.cu @@ -0,0 +1,319 @@ +#include <ATen/ATen.h> + +#include <cuda.h> +#include <cuda_runtime.h> + +#include <math.h> +#include <stdio.h> +#include <vector> + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ + i += blockDim.x * gridDim.x) + +#define THREADS_PER_BLOCK 1024 + +inline int GET_BLOCKS(const int N) { + int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + int max_block_num = 65000; + return min(optimal_block_num, max_block_num); +} + +template <typename scalar_t> +__device__ scalar_t bilinear_interpolate(const scalar_t *bottom_data, + const int height, const int width, + scalar_t y, scalar_t x) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + return 0; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + int y_low = (int)y; + int x_low = (int)x; + int y_high; + int x_high; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (scalar_t)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (scalar_t)x_low; + } else { + x_high = x_low + 1; + } + + scalar_t ly = y - y_low; + scalar_t lx = x - x_low; + scalar_t hy = 1. - ly; + scalar_t hx = 1. - lx; + // do bilinear interpolation + scalar_t lt = bottom_data[y_low * width + x_low]; + scalar_t rt = bottom_data[y_low * width + x_high]; + scalar_t lb = bottom_data[y_high * width + x_low]; + scalar_t rb = bottom_data[y_high * width + x_high]; + scalar_t w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + scalar_t val = (w1 * lt + w2 * rt + w3 * lb + w4 * rb); + + return val; +} + +template <typename scalar_t> +__global__ void +ROIAlignForward(const int nthreads, const scalar_t *bottom_data, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, scalar_t *top_data) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the aligned output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; + scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; + + // Force malformed ROIs to be 1x1 + scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); + scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); + + scalar_t bin_size_h = roi_height / pooled_height; + scalar_t bin_size_w = roi_width / pooled_width; + + const scalar_t *offset_bottom_data = + bottom_data + (roi_batch_ind * channels + c) * height * width; + + int sample_num_h = (sample_num > 0) + ? sample_num + : ceil(roi_height / pooled_height); // e.g., = 2 + int sample_num_w = + (sample_num > 0) ? 
sample_num : ceil(roi_width / pooled_width); + + scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h; + scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w; + + int hstart = fminf(floor(h), height - 2); + int wstart = fminf(floor(w), width - 2); + + scalar_t output_val = 0; + for (int iy = 0; iy < sample_num_h; iy++) { + const scalar_t y = roi_start_h + ph * bin_size_h + + (scalar_t)(iy + scalar_t(.5f)) * bin_size_h / + (scalar_t)(sample_num_h); + for (int ix = 0; ix < sample_num_w; ix++) { + const scalar_t x = roi_start_w + pw * bin_size_w + + (scalar_t)(ix + scalar_t(.5f)) * bin_size_w / + (scalar_t)(sample_num_w); + scalar_t val = bilinear_interpolate<scalar_t>(offset_bottom_data, + height, width, y, x); + output_val += val; + } + } + output_val /= (sample_num_h * sample_num_w); + top_data[index] = output_val; + } +} + +int ROIAlignForwardLaucher(const at::Tensor features, const at::Tensor rois, + const float spatial_scale, const int sample_num, + const int channels, const int height, + const int width, const int num_rois, + const int pooled_height, const int pooled_width, + at::Tensor output) { + const int output_size = num_rois * pooled_height * pooled_width * channels; + AT_DISPATCH_FLOATING_TYPES( + features.type(), "ROIAlignLaucherForward", ([&] { + const scalar_t *bottom_data = features.data<scalar_t>(); + const scalar_t *rois_data = rois.data<scalar_t>(); + scalar_t *top_data = output.data<scalar_t>(); + + ROIAlignForward< + scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( + output_size, bottom_data, rois_data, scalar_t(spatial_scale), + sample_num, channels, height, width, pooled_height, pooled_width, + top_data); + })); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } + + return 1; +} + +template <typename scalar_t> +__device__ void +bilinear_interpolate_gradient(const int height, const int width, scalar_t y, + scalar_t x, scalar_t &w1, scalar_t &w2, + scalar_t &w3, scalar_t &w4, int &x_low, + int &x_high, int &y_low, int &y_high) { + // deal with cases that inverse elements are out of feature map boundary + if (y < -1.0 || y > height || x < -1.0 || x > width) { + w1 = w2 = w3 = w4 = 0.; + x_low = x_high = y_low = y_high = -1; + return; + } + + if (y <= 0) + y = 0; + if (x <= 0) + x = 0; + + y_low = (int)y; + x_low = (int)x; + + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = (scalar_t)y_low; + } else { + y_high = y_low + 1; + } + + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = (scalar_t)x_low; + } else { + x_high = x_low + 1; + } + + scalar_t ly = y - y_low; + scalar_t lx = x - x_low; + scalar_t hy = 1. - ly; + scalar_t hx = 1. 
- lx; + + w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + + return; +} + +template <typename scalar_t> +__global__ void +ROIAlignBackward(const int nthreads, const scalar_t *top_diff, + const scalar_t *bottom_rois, const scalar_t spatial_scale, + const int sample_num, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + // (n, c, ph, pw) is an element in the aligned output + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const scalar_t *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + scalar_t roi_start_w = offset_bottom_rois[1] * spatial_scale; + scalar_t roi_start_h = offset_bottom_rois[2] * spatial_scale; + scalar_t roi_end_w = (offset_bottom_rois[3] + 1) * spatial_scale; + scalar_t roi_end_h = (offset_bottom_rois[4] + 1) * spatial_scale; + + // Force malformed ROIs to be 1x1 + scalar_t roi_width = fmaxf((scalar_t)roi_end_w - roi_start_w, 0.); + scalar_t roi_height = fmaxf((scalar_t)roi_end_h - roi_start_h, 0.); + + scalar_t bin_size_h = roi_height / pooled_height; + scalar_t bin_size_w = roi_width / pooled_width; + + scalar_t *offset_bottom_diff = + bottom_diff + (roi_batch_ind * channels + c) * height * width; + int offset_top = (n * channels + c) * pooled_height * pooled_width + + ph * pooled_width + pw; + scalar_t offset_top_diff = top_diff[offset_top]; + + int sample_num_h = (sample_num > 0) + ? sample_num + : ceil(roi_height / pooled_height); // e.g., = 2 + int sample_num_w = + (sample_num > 0) ? sample_num : ceil(roi_width / pooled_width); + + const scalar_t count = (scalar_t)(sample_num_h * sample_num_w); + + scalar_t h = (scalar_t)(ph + 0.5) * bin_size_h + roi_start_h; + scalar_t w = (scalar_t)(pw + 0.5) * bin_size_w + roi_start_w; + + int hstart = fminf(floor(h), height - 2); + int wstart = fminf(floor(w), width - 2); + + for (int iy = 0; iy < sample_num_h; iy++) { + const scalar_t y = + roi_start_h + ph * bin_size_h + + (scalar_t)(iy + .5f) * bin_size_h / (scalar_t)(sample_num_h); + for (int ix = 0; ix < sample_num_w; ix++) { + const scalar_t x = + roi_start_w + pw * bin_size_w + + (scalar_t)(ix + .5f) * bin_size_w / (scalar_t)(sample_num_w); + scalar_t w1, w2, w3, w4; + int x_low, x_high, y_low, y_high; + + bilinear_interpolate_gradient<scalar_t>( + height, width, y, x, w1, w2, w3, w4, x_low, x_high, y_low, y_high); + scalar_t g1 = offset_top_diff * w1 / count; + scalar_t g2 = offset_top_diff * w2 / count; + scalar_t g3 = offset_top_diff * w3 / count; + scalar_t g4 = offset_top_diff * w4 / count; + if (x_low >= 0 && x_high >= 0 && y_low >= 0 && y_high >= 0) { + atomicAdd(offset_bottom_diff + y_low * width + x_low, g1); + atomicAdd(offset_bottom_diff + y_low * width + x_high, g2); + atomicAdd(offset_bottom_diff + y_high * width + x_low, g3); + atomicAdd(offset_bottom_diff + y_high * width + x_high, g4); + } + } + } + } +} + +template <> +__global__ void ROIAlignBackward<double>( + const int nthreads, const double *top_diff, const double *bottom_rois, + const double spatial_scale, const int sample_num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, double *bottom_diff) {} + +int ROIAlignBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, + const float spatial_scale, const int 
sample_num, + const int channels, const int height, + const int width, const int num_rois, + const int pooled_height, const int pooled_width, + at::Tensor bottom_grad) { + const int output_size = num_rois * pooled_height * pooled_width * channels; + + AT_DISPATCH_FLOATING_TYPES( + top_grad.type(), "ROIAlignLaucherBackward", ([&] { + const scalar_t *top_diff = top_grad.data<scalar_t>(); + const scalar_t *rois_data = rois.data<scalar_t>(); + scalar_t *bottom_diff = bottom_grad.data<scalar_t>(); + if (sizeof(scalar_t) == sizeof(double)) { + fprintf(stderr, "double is not supported\n"); + exit(-1); + } + + ROIAlignBackward< + scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( + output_size, top_diff, rois_data, spatial_scale, sample_num, + channels, height, width, pooled_height, pooled_width, bottom_diff); + })); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } + + return 1; +} diff --git a/mmdet/ops/roi_pool/__init__.py b/mmdet/ops/roi_pool/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9c8506d319d3c9c2300860a6c0d64259e43e7916 --- /dev/null +++ b/mmdet/ops/roi_pool/__init__.py @@ -0,0 +1,2 @@ +from .functions.roi_pool import roi_pool +from .modules.roi_pool import RoIPool diff --git a/mmdet/ops/roi_pool/functions/__init__.py b/mmdet/ops/roi_pool/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mmdet/ops/roi_pool/functions/roi_pool.py b/mmdet/ops/roi_pool/functions/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..78ba1395fb9b653673c3ad57d076def78887b5ff --- /dev/null +++ b/mmdet/ops/roi_pool/functions/roi_pool.py @@ -0,0 +1,56 @@ +import torch +from torch.autograd import Function + +from .. 
import roi_pool_cuda
+
+
+class RoIPoolFunction(Function):
+
+    @staticmethod
+    def forward(ctx, features, rois, out_size, spatial_scale):
+        if isinstance(out_size, int):
+            out_h = out_size
+            out_w = out_size
+        elif isinstance(out_size, tuple):
+            assert len(out_size) == 2
+            assert isinstance(out_size[0], int)
+            assert isinstance(out_size[1], int)
+            out_h, out_w = out_size
+        else:
+            raise TypeError(
+                '"out_size" must be an integer or tuple of integers')
+        assert features.is_cuda
+        ctx.save_for_backward(rois)
+        num_channels = features.size(1)
+        num_rois = rois.size(0)
+        out_size = (num_rois, num_channels, out_h, out_w)
+        output = features.new_zeros(*out_size)
+
+        argmax = features.new_zeros(*out_size, dtype=torch.int)
+        roi_pool_cuda.forward(features, rois, out_h, out_w, spatial_scale,
+                              output, argmax)
+        ctx.spatial_scale = spatial_scale
+        ctx.feature_size = features.size()
+        ctx.argmax = argmax
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        assert grad_output.is_cuda
+        spatial_scale = ctx.spatial_scale
+        feature_size = ctx.feature_size
+        argmax = ctx.argmax
+        rois = ctx.saved_tensors[0]
+        assert feature_size is not None
+
+        grad_input = grad_rois = None
+        if ctx.needs_input_grad[0]:
+            grad_input = grad_output.new(feature_size).zero_()
+            roi_pool_cuda.backward(grad_output, rois, argmax, spatial_scale,
+                                   grad_input)
+
+        return grad_input, grad_rois, None, None
+
+
+roi_pool = RoIPoolFunction.apply
diff --git a/mmdet/ops/roi_pool/gradcheck.py b/mmdet/ops/roi_pool/gradcheck.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfc08b2e138855e913a2ac1f3c365a570aba661d
--- /dev/null
+++ b/mmdet/ops/roi_pool/gradcheck.py
@@ -0,0 +1,15 @@
+import torch
+from torch.autograd import gradcheck
+
+import os.path as osp
+import sys
+sys.path.append(osp.abspath(osp.join(__file__, '../../')))
+from roi_pool import RoIPool
+
+feat = torch.randn(4, 16, 15, 15, requires_grad=True).cuda()
+rois = torch.Tensor([[0, 0, 0, 50, 50], [0, 10, 30, 43, 55],
+                     [1, 67, 40, 110, 120]]).cuda()
+inputs = (feat, rois)
+print('Gradcheck for roi pooling...')
+test = gradcheck(RoIPool(4, 1.0 / 8), inputs, eps=1e-5, atol=1e-3)
+print(test)
diff --git a/mmdet/ops/roi_pool/modules/__init__.py b/mmdet/ops/roi_pool/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/mmdet/ops/roi_pool/modules/roi_pool.py b/mmdet/ops/roi_pool/modules/roi_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7fffd08c656ee7301aeed5a8262714f4be4157d
--- /dev/null
+++ b/mmdet/ops/roi_pool/modules/roi_pool.py
@@ -0,0 +1,14 @@
+from torch.nn.modules.module import Module
+from ..functions.roi_pool import roi_pool
+
+
+class RoIPool(Module):
+
+    def __init__(self, out_size, spatial_scale):
+        super(RoIPool, self).__init__()
+
+        self.out_size = out_size
+        self.spatial_scale = float(spatial_scale)
+
+    def forward(self, features, rois):
+        return roi_pool(features, rois, self.out_size, self.spatial_scale)
diff --git a/mmdet/ops/roi_pool/setup.py b/mmdet/ops/roi_pool/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..16991b889220f9ae4c7763460033754c6ff38f77
--- /dev/null
+++ b/mmdet/ops/roi_pool/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+setup(
+    name='roi_pool',
+    ext_modules=[
+        CUDAExtension('roi_pool_cuda', [
+            'src/roi_pool_cuda.cpp',
+            'src/roi_pool_kernel.cu',
+        ])
+    ],
+    cmdclass={'build_ext': BuildExtension})
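An illustrative usage sketch for the module above (not part of the patch; it assumes the extension was built with `python setup.py build_ext --inplace` and that mmdet.ops is importable). Rows of `rois` are (batch_ind, x1, y1, x2, y2), matching the size_rois != 5 checks in the C++ bindings that follow:

import torch
from mmdet.ops.roi_pool import RoIPool  # exported by the package __init__ above

feat = torch.randn(2, 16, 32, 32, device='cuda')
rois = torch.tensor([[0., 4., 4., 28., 28.],
                     [1., 0., 0., 15., 15.]], device='cuda')
pool = RoIPool(out_size=7, spatial_scale=1.0 / 8)
out = pool(feat, rois)  # shape (2, 16, 7, 7): one max-pooled map per roi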
diff --git a/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..799c151d192911f03e446ea9c1ad7bb18fa3b1d1
--- /dev/null
+++ b/mmdet/ops/roi_pool/src/roi_pool_cuda.cpp
@@ -0,0 +1,86 @@
+#include <torch/torch.h>
+
+#include <cmath>
+#include <vector>
+
+int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois,
+                          const float spatial_scale, const int channels,
+                          const int height, const int width, const int num_rois,
+                          const int pooled_h, const int pooled_w,
+                          at::Tensor output, at::Tensor argmax);
+
+int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois,
+                           const at::Tensor argmax, const float spatial_scale,
+                           const int batch_size, const int channels,
+                           const int height, const int width,
+                           const int num_rois, const int pooled_h,
+                           const int pooled_w, at::Tensor bottom_grad);
+
+#define CHECK_CUDA(x) AT_ASSERT(x.type().is_cuda(), #x " must be a CUDAtensor ")
+#define CHECK_CONTIGUOUS(x) \
+  AT_ASSERT(x.is_contiguous(), #x " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+int roi_pooling_forward_cuda(at::Tensor features, at::Tensor rois,
+                             int pooled_height, int pooled_width,
+                             float spatial_scale, at::Tensor output,
+                             at::Tensor argmax) {
+  CHECK_INPUT(features);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(output);
+  CHECK_INPUT(argmax);
+
+  // Number of ROIs
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+
+  int channels = features.size(1);
+  int height = features.size(2);
+  int width = features.size(3);
+
+  ROIPoolForwardLaucher(features, rois, spatial_scale, channels, height, width,
+                        num_rois, pooled_height, pooled_width, output, argmax);
+
+  return 1;
+}
+
+int roi_pooling_backward_cuda(at::Tensor top_grad, at::Tensor rois,
+                              at::Tensor argmax, float spatial_scale,
+                              at::Tensor bottom_grad) {
+  CHECK_INPUT(top_grad);
+  CHECK_INPUT(rois);
+  CHECK_INPUT(argmax);
+  CHECK_INPUT(bottom_grad);
+
+  int pooled_height = top_grad.size(2);
+  int pooled_width = top_grad.size(3);
+  int num_rois = rois.size(0);
+  int size_rois = rois.size(1);
+
+  if (size_rois != 5) {
+    printf("wrong roi size\n");
+    return 0;
+  }
+  int batch_size = bottom_grad.size(0);
+  int channels = bottom_grad.size(1);
+  int height = bottom_grad.size(2);
+  int width = bottom_grad.size(3);
+
+  ROIPoolBackwardLaucher(top_grad, rois, argmax, spatial_scale, batch_size,
+                         channels, height, width, num_rois, pooled_height,
+                         pooled_width, bottom_grad);
+
+  return 1;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &roi_pooling_forward_cuda, "Roi_Pooling forward (CUDA)");
+  m.def("backward", &roi_pooling_backward_cuda, "Roi_Pooling backward (CUDA)");
+}
diff --git a/mmdet/ops/roi_pool/src/roi_pool_kernel.cu b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c94a9cd78503c19995db88dd71f2b1ce5a36d629
--- /dev/null
+++ b/mmdet/ops/roi_pool/src/roi_pool_kernel.cu
@@ -0,0 +1,193 @@
+#include <ATen/ATen.h>
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include <math.h>
+#include <stdio.h>
+#include <vector>
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+#define THREADS_PER_BLOCK 1024
+
+inline int GET_BLOCKS(const int N) {
+  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int max_block_num = 65000;
+  return min(optimal_block_num, max_block_num);
+}
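The launch helper above and CUDA_1D_KERNEL_LOOP define the indexing scheme used by every kernel in this patch: GET_BLOCKS(N) launches ceil(N / 1024) blocks, capped at 65000, and the grid-stride loop covers any remainder. A worked decode of one flat output index, written in Python for brevity (illustrative, not part of the patch):

pooled_w, pooled_h, channels = 7, 7, 256         # example sizes
index = 13000                                    # one thread's flat output index
pw = index % pooled_w                            # -> 1
ph = (index // pooled_w) % pooled_h              # -> 2
c = (index // (pooled_w * pooled_h)) % channels  # -> 9
n = index // (pooled_w * pooled_h * channels)    # -> 1 (roi index)
assert ((n * channels + c) * pooled_h + ph) * pooled_w + pw == index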
+
+template <typename scalar_t>
+__global__ void ROIPoolForward(const int nthreads, const scalar_t *bottom_data,
+                               const scalar_t *rois,
+                               const scalar_t spatial_scale, const int channels,
+                               const int height, const int width,
+                               const int pooled_h, const int pooled_w,
+                               scalar_t *top_data, int *argmax_data) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    // (n, c, ph, pw) is an element in the pooled output
+    int pw = index % pooled_w;
+    int ph = (index / pooled_w) % pooled_h;
+    int c = (index / pooled_w / pooled_h) % channels;
+    int n = index / pooled_w / pooled_h / channels;
+
+    const scalar_t *offset_rois = rois + n * 5;
+    int roi_batch_ind = offset_rois[0];
+    // calculate the roi region on feature maps
+    scalar_t roi_x1 = offset_rois[1] * spatial_scale;
+    scalar_t roi_y1 = offset_rois[2] * spatial_scale;
+    scalar_t roi_x2 = (offset_rois[3] + 1) * spatial_scale;
+    scalar_t roi_y2 = (offset_rois[4] + 1) * spatial_scale;
+
+    // skip malformed rois (non-positive width or height)
+    scalar_t roi_w = roi_x2 - roi_x1;
+    scalar_t roi_h = roi_y2 - roi_y1;
+    if (roi_w <= 0 || roi_h <= 0)
+      continue;
+
+    scalar_t bin_size_w = roi_w / static_cast<scalar_t>(pooled_w);
+    scalar_t bin_size_h = roi_h / static_cast<scalar_t>(pooled_h);
+
+    // the corresponding bin region
+    int bin_x1 = floor(static_cast<scalar_t>(pw) * bin_size_w + roi_x1);
+    int bin_y1 = floor(static_cast<scalar_t>(ph) * bin_size_h + roi_y1);
+    int bin_x2 = ceil(static_cast<scalar_t>(pw + 1) * bin_size_w + roi_x1);
+    int bin_y2 = ceil(static_cast<scalar_t>(ph + 1) * bin_size_h + roi_y1);
+
+    // add roi offsets and clip to input boundaries
+    bin_x1 = min(max(bin_x1, 0), width);
+    bin_y1 = min(max(bin_y1, 0), height);
+    bin_x2 = min(max(bin_x2, 0), width);
+    bin_y2 = min(max(bin_y2, 0), height);
+    bool is_empty = (bin_y2 <= bin_y1) || (bin_x2 <= bin_x1);
+
+    // If nothing is pooled, argmax = -1 causes nothing to be backprop'd
+    int max_idx = -1;
+    bottom_data += (roi_batch_ind * channels + c) * height * width;
+
+    // Define an empty pooling region to be zero
+    scalar_t max_val = is_empty ?
0 : bottom_data[bin_y1 * width + bin_x1] - 1; + + for (int h = bin_y1; h < bin_y2; ++h) { + for (int w = bin_x1; w < bin_x2; ++w) { + int offset = h * width + w; + if (bottom_data[offset] > max_val) { + max_val = bottom_data[offset]; + max_idx = offset; + } + } + } + top_data[index] = max_val; + if (argmax_data != NULL) + argmax_data[index] = max_idx; + } +} + +int ROIPoolForwardLaucher(const at::Tensor features, const at::Tensor rois, + const float spatial_scale, const int channels, + const int height, const int width, const int num_rois, + const int pooled_h, const int pooled_w, + at::Tensor output, at::Tensor argmax) { + const int output_size = num_rois * channels * pooled_h * pooled_w; + + AT_DISPATCH_FLOATING_TYPES( + features.type(), "ROIPoolLaucherForward", ([&] { + const scalar_t *bottom_data = features.data<scalar_t>(); + const scalar_t *rois_data = rois.data<scalar_t>(); + scalar_t *top_data = output.data<scalar_t>(); + int *argmax_data = argmax.data<int>(); + + ROIPoolForward< + scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>( + output_size, bottom_data, rois_data, scalar_t(spatial_scale), + channels, height, width, pooled_h, pooled_w, top_data, argmax_data); + })); + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err)); + exit(-1); + } + return 1; +} + +template <typename scalar_t> +__global__ void ROIPoolBackward(const int nthreads, const scalar_t *top_diff, + const scalar_t *rois, const int *argmax_data, + const scalar_t spatial_scale, + const int channels, const int height, + const int width, const int pooled_h, + const int pooled_w, scalar_t *bottom_diff) { + CUDA_1D_KERNEL_LOOP(index, nthreads) { + int pw = index % pooled_w; + int ph = (index / pooled_w) % pooled_h; + int c = (index / pooled_w / pooled_h) % channels; + int n = index / pooled_w / pooled_h / channels; + + int roi_batch_ind = rois[n * 5]; + int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w + + ph * pooled_w + pw]; + + atomicAdd(bottom_diff + (roi_batch_ind * channels + c) * height * width + + bottom_index, + top_diff[index]); + } +} + +template <> +__global__ void +ROIPoolBackward<double>(const int nthreads, const double *top_diff, + const double *rois, const int *argmax_data, + const double spatial_scale, const int channels, + const int height, const int width, const int pooled_h, + const int pooled_w, double *bottom_diff) { + // CUDA_1D_KERNEL_LOOP(index, nthreads) { + // int pw = index % pooled_w; + // int ph = (index / pooled_w) % pooled_h; + // int c = (index / pooled_w / pooled_h) % channels; + // int n = index / pooled_w / pooled_h / channels; + + // int roi_batch_ind = rois[n * 5]; + // int bottom_index = argmax_data[(n * channels + c) * pooled_h * pooled_w + + // ph * pooled_w + pw]; + + // *(bottom_diff + (roi_batch_ind * channels + c) * height * width + + // bottom_index) +=top_diff[index]; + // } +} + +int ROIPoolBackwardLaucher(const at::Tensor top_grad, const at::Tensor rois, + const at::Tensor argmax, const float spatial_scale, + const int batch_size, const int channels, + const int height, const int width, + const int num_rois, const int pooled_h, + const int pooled_w, at::Tensor bottom_grad) { + const int output_size = num_rois * pooled_h * pooled_w * channels; + + AT_DISPATCH_FLOATING_TYPES( + top_grad.type(), "ROIPoolLaucherBackward", ([&] { + const scalar_t *top_diff = top_grad.data<scalar_t>(); + const scalar_t *rois_data = rois.data<scalar_t>(); + const int 
*argmax_data = argmax.data<int>();
+        scalar_t *bottom_diff = bottom_grad.data<scalar_t>();
+
+        if (sizeof(scalar_t) == sizeof(double)) {
+          fprintf(stderr, "double is not supported\n");
+          exit(-1);
+        }
+
+        ROIPoolBackward<
+            scalar_t><<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+            output_size, top_diff, rois_data, argmax_data,
+            scalar_t(spatial_scale), channels, height, width, pooled_h,
+            pooled_w, bottom_diff);
+      }));
+  cudaError_t err = cudaGetLastError();
+  if (cudaSuccess != err) {
+    fprintf(stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString(err));
+    exit(-1);
+  }
+
+  return 1;
+}
diff --git a/mmdet/version.py b/mmdet/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b8877c505752cd3aaa805b09b88791d3ca0c9bb
--- /dev/null
+++ b/mmdet/version.py
@@ -0,0 +1 @@
+__version__ = '0.5.0'
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed19bd5a810692f308f99617f20fe2e07e86f5a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,40 @@
+from setuptools import find_packages, setup
+
+
+def readme():
+    with open('README.md') as f:
+        content = f.read()
+    return content
+
+
+def get_version():
+    version_file = 'mmdet/version.py'
+    with open(version_file, 'r') as f:
+        exec(compile(f.read(), version_file, 'exec'))
+    return locals()['__version__']
+
+
+setup(
+    name='mmdet',
+    version=get_version(),
+    description='Open MMLab Detection Toolbox',
+    long_description=readme(),
+    keywords='computer vision, object detection',
+    packages=find_packages(),
+    classifiers=[
+        'Development Status :: 4 - Beta',
+        'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+        'Operating System :: OS Independent',
+        'Programming Language :: Python :: 2',
+        'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.4',
+        'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
+        'Topic :: Utilities',
+    ],
+    license='GPLv3',
+    setup_requires=['pytest-runner'],
+    tests_require=['pytest'],
+    install_requires=['numpy', 'matplotlib', 'six', 'terminaltables'],
+    zip_safe=False)
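To close, a sketch of how the pieces in this patch compose at inference time (illustrative, not part of the patch; it assumes all extensions are built and that mmdet.ops is importable as a package):

import torch
from mmdet.ops.nms.nms_wrapper import nms
from mmdet.ops.roi_align import RoIAlign

feat = torch.randn(1, 256, 50, 50, device='cuda')    # stride-8 feature map
dets = torch.tensor([[  8.,   8., 120., 120., 0.95],
                     [ 12.,  10., 124., 118., 0.90], # near-duplicate of the first
                     [200., 200., 280., 300., 0.80]], device='cuda')
keep = nms(dets, 0.5)                     # drops the near-duplicate box
boxes = dets[keep, :4]
batch_inds = boxes.new_zeros(boxes.size(0), 1)       # all rois from image 0
rois = torch.cat([batch_inds, boxes], dim=1)         # (batch_ind, x1, y1, x2, y2)
align = RoIAlign(out_size=7, spatial_scale=1.0 / 8, sample_num=2)
roi_feats = align(feat, rois)             # -> (2, 256, 7, 7)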