From 89022a260c6c7aeb10113986bacdaabe368dc605 Mon Sep 17 00:00:00 2001
From: Jiaqi Wang <1155098160@link.cuhk.edu.hk>
Date: Thu, 23 May 2019 20:57:11 +0800
Subject: [PATCH] Code of CVPR 2019 Paper: Region Proposal by Guided Anchoring
 (#594)

* add two stage w/o neck and w/ upperneck

* add rpn r50 c4

* update c4 configs

* fix

* config update

* update config

* minor update

* mask rcnn support c4 train and test

* lr fix

* cascade support upper_neck

* add cascade c4 config

* update config

* update

* update res_layer to new interface

* refactoring

* c4 configs update

* refactoring

* update rpn_c4 config

* rename upper_neck as shared_head

* update

* update configs

* update

* update c4 configs

* update according to commits

* update

* add ga rpn

* test bug fix

* test bug fix when loc_filter_thr is large

* update configs

* update configs

* add ga_retinanet

* ga test bug fix

* update configs

* update

* init masked conv

* update

* update masked conv

* update

* support no ga_sampler

* update

* update

* test with masked_conv

* update comment

* fix flake errors

* fix flake8 errors

* refactor bounded iou loss

* refactor ga_retina_head

* update configs

* refactor masked conv

* fix flake8 error

* refactor guided_anchor_head and ga_rpn_head

* update configs

* use_sigmoid_cls -> cls_sigmoid_loss; use_focal_loss -> cls_focal_loss

* refactoring

* cls_sigmoid_loss -> use_sigmoid_cls

* fix flake8 error

* add some docs

* rename normalize to norm_cfg

* update configs

* add readme

* update ga_faster config

* update readme

* update readme

* rename configs as r50_caffe

* merge master

* refactor guided anchor target

* update readme

* update approx max iou assigner

* refactor guided anchor target

* update docstring

* refactor ga heads

* fix flake8 error

* update readme

* update model url

* update comments

* refactor get anchors

* update docstring

* not use_loc_filter during training

* add R-101 results

* update to support build loss api

* fix flake8 error

* update readme with x-101 performances

* update readme

* add a link in project readme

* refactor code about ga shape inside flags

* update

* update

* add x101 config files

* add ga_rpn r101 config

* update some comments

* add comments

* add comments

* update comments

* fix flake8 error
---
 MODEL_ZOO.md                                  |   4 +
 compile.sh                                    |   7 +
 configs/guided_anchoring/README.md            |  42 ++
 .../ga_fast_r50_caffe_fpn_1x.py               | 130 ++++
 .../ga_faster_r50_caffe_fpn_1x.py             | 193 ++++++
 .../ga_faster_x101_32x4d_fpn_1x.py            | 193 ++++++
 .../ga_retinanet_r50_caffe_fpn_1x.py          | 152 +++++
 .../ga_retinanet_x101_32x4d_fpn_1x.py         | 152 +++++
 .../ga_rpn_r101_caffe_fpn_1x.py               | 151 +++++
 .../ga_rpn_r50_caffe_fpn_1x.py                | 151 +++++
 .../ga_rpn_x101_32x4d_fpn_1x.py               | 151 +++++
 mmdet/core/anchor/__init__.py                 |   8 +-
 mmdet/core/anchor/guided_anchor_target.py     | 285 +++++++++
 mmdet/core/bbox/assigners/__init__.py         |   5 +-
 .../bbox/assigners/approx_max_iou_assigner.py | 116 ++++
 mmdet/core/loss/__init__.py                   |  13 +-
 mmdet/core/loss/losses.py                     |  97 ++-
 mmdet/models/anchor_heads/__init__.py         |  10 +-
 mmdet/models/anchor_heads/ga_retina_head.py   | 107 ++++
 mmdet/models/anchor_heads/ga_rpn_head.py      | 127 ++++
 .../models/anchor_heads/guided_anchor_head.py | 589 ++++++++++++++++++
 mmdet/models/losses/__init__.py               |   3 +-
 mmdet/models/losses/iou_loss.py               |  26 +
 mmdet/ops/__init__.py                         |   4 +-
 mmdet/ops/masked_conv/__init__.py             |   4 +
 mmdet/ops/masked_conv/functions/__init__.py   |   0
 .../ops/masked_conv/functions/masked_conv.py  |  55 ++
 mmdet/ops/masked_conv/modules/__init__.py     |   0
 mmdet/ops/masked_conv/modules/masked_conv.py  |  30 +
 mmdet/ops/masked_conv/setup.py                |  12 +
 .../masked_conv/src/masked_conv2d_cuda.cpp    |  74 +++
 .../masked_conv/src/masked_conv2d_kernel.cu   | 113 ++++
 32 files changed, 2983 insertions(+), 21 deletions(-)
 create mode 100644 configs/guided_anchoring/README.md
 create mode 100644 configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py
 create mode 100644 configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py
 create mode 100644 mmdet/core/anchor/guided_anchor_target.py
 create mode 100644 mmdet/core/bbox/assigners/approx_max_iou_assigner.py
 create mode 100644 mmdet/models/anchor_heads/ga_retina_head.py
 create mode 100644 mmdet/models/anchor_heads/ga_rpn_head.py
 create mode 100644 mmdet/models/anchor_heads/guided_anchor_head.py
 create mode 100644 mmdet/models/losses/iou_loss.py
 create mode 100644 mmdet/ops/masked_conv/__init__.py
 create mode 100644 mmdet/ops/masked_conv/functions/__init__.py
 create mode 100644 mmdet/ops/masked_conv/functions/masked_conv.py
 create mode 100644 mmdet/ops/masked_conv/modules/__init__.py
 create mode 100644 mmdet/ops/masked_conv/modules/masked_conv.py
 create mode 100644 mmdet/ops/masked_conv/setup.py
 create mode 100644 mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp
 create mode 100644 mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu

diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md
index 9ba86ab8..41480bd0 100644
--- a/MODEL_ZOO.md
+++ b/MODEL_ZOO.md
@@ -214,6 +214,10 @@ Please refer to [Weight Standardization](configs/gn+ws/README.md) for details.
 
 Please refer to [Deformable Convolutional Networks](configs/dcn/README.md) for details.
 
+### Guided Anchoring
+
+Please refer to [Guided Anchoring](configs/guided_anchoring/README.md) for details.
+
 
 ## Comparison with Detectron and maskrcnn-benchmark
 
diff --git a/compile.sh b/compile.sh
index 335cf51d..c3853f1e 100755
--- a/compile.sh
+++ b/compile.sh
@@ -36,3 +36,10 @@ if [ -d "build" ]; then
     rm -r build
 fi
 $PYTHON setup.py build_ext --inplace
+
+echo "Building masked conv op..."
+cd ../masked_conv
+if [ -d "build" ]; then
+    rm -r build
+fi
+$PYTHON setup.py build_ext --inplace
diff --git a/configs/guided_anchoring/README.md b/configs/guided_anchoring/README.md
new file mode 100644
index 00000000..1d8bb009
--- /dev/null
+++ b/configs/guided_anchoring/README.md
@@ -0,0 +1,42 @@
+# Region Proposal by Guided Anchoring
+
+## Introduction
+
+We provide config files to reproduce the results in the CVPR 2019 paper for [Region Proposal by Guided Anchoring](https://arxiv.org/abs/1901.03278).
+
+```
+@inproceedings{wang2019region,
+    title={Region Proposal by Guided Anchoring},
+    author={Jiaqi Wang and Kai Chen and Shuo Yang and Chen Change Loy and Dahua Lin},
+    booktitle={IEEE Conference on Computer Vision and Pattern Recognition},
+    year={2019}
+}
+```
+
+## Results and Models
+
+The results on COCO 2017 val are shown in the table below. (Results on test-dev are usually slightly higher than those on val.)
+
+| Method |    Backbone     |  Style  | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | AR 1000 |                                                                   Download                                                                    |
+| :----: | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :-----: | :-------------------------------------------------------------------------------------------------------------------------------------------: |
+| GA-RPN |    R-50-FPN     |  caffe  |   1x    |   5.0    |        0.55         |      13.3      |  68.5   | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_rpn_r50_caffe_fpn_1x_20190513-95e91886.pth) |
+| GA-RPN |    R-101-FPN    |  caffe  |   1x    |    -     |          -          |       -        |  69.6   |                                                                       -                                                                       |
+| GA-RPN | X-101-32x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  70.0   |                                                                       -                                                                       |
+| GA-RPN | X-101-64x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  70.5   |                                                                       -                                                                       |
+
+
+|     Method     |    Backbone     |  Style  | Lr schd | Mem (GB) | Train time (s/iter) | Inf time (fps) | box AP |                                                                      Download                                                                       |
+| :------------: | :-------------: | :-----: | :-----: | :------: | :-----------------: | :------------: | :----: | :-------------------------------------------------------------------------------------------------------------------------------------------------: |
+|  GA-Fast RCNN  |    R-50-FPN     |  caffe  |   1x    |   3.3    |        0.23         |      14.9      |  39.5  |   [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_fast_r50_caffe_fpn_1x_20190513-c5af9f8b.pth)    |
+| GA-Faster RCNN |    R-50-FPN     |  caffe  |   1x    |   5.1    |        0.64         |      9.6       |  39.9  |  [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_faster_r50_caffe_fpn_1x_20190513-a52b31fa.pth)   |
+| GA-Faster RCNN |    R-101-FPN    |  caffe  |   1x    |    -     |          -          |       -        |  41.5  |                                                                          -                                                                          |
+| GA-Faster RCNN | X-101-32x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  42.9  |                                                                          -                                                                          |
+| GA-Faster RCNN | X-101-64x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  43.9  |                                                                          -                                                                          |
+|  GA-RetinaNet  |    R-50-FPN     |  caffe  |   1x    |   3.2    |        0.50         |      10.7      |  37.0  | [model](https://s3.ap-northeast-2.amazonaws.com/open-mmlab/mmdetection/models/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x_20190513-29905101.pth) |
+|  GA-RetinaNet  |    R-101-FPN    |  caffe  |   1x    |    -     |          -          |       -        |  38.9  |                                                                          -                                                                          |
+|  GA-RetinaNet  | X-101-32x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  40.3  |                                                                          -                                                                          |
+|  GA-RetinaNet  | X-101-64x4d-FPN | pytorch |   1x    |    -     |          -          |       -        |  40.8  |                                                                          -                                                                          |
+
+
+
+- In the Guided Anchoring paper, `score_thr` is set to 0.001 in Fast/Faster RCNN and 0.05 in RetinaNet for both baselines and Guided Anchoring.
\ No newline at end of file
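
As a usage note, these configs plug into the standard mmdetection entry points. A minimal sketch of loading one of them and building the detector (the config path is from this patch; the printed class name is illustrative):

```python
from mmcv import Config
from mmdet.models import build_detector

# Load a Guided Anchoring config added by this patch and build the model.
cfg = Config.fromfile('configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py')
model = build_detector(cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
print(model.__class__.__name__)  # RPN
```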
diff --git a/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py
new file mode 100644
index 00000000..269967df
--- /dev/null
+++ b/configs/guided_anchoring/ga_fast_r50_caffe_fpn_1x.py
@@ -0,0 +1,130 @@
+# model settings
+model = dict(
+    type='FastRCNN',
+    pretrained='open-mmlab://resnet50_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCBBoxHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.05, 0.05, 0.1, 0.1],
+        reg_class_agnostic=False,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rcnn=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.6,
+            min_pos_iou=0.6,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.25,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=True),
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rcnn=dict(
+        score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        num_max_proposals=300,
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_train2017.pkl',
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        num_max_proposals=300,
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        num_max_proposals=300,
+        proposal_file=data_root + 'proposals/ga_rpn_r50_fpn_1x_val2017.pkl',
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_fast_rcnn_r50_caffe_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
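
The Fast R-CNN config above consumes precomputed GA-RPN proposals via `proposal_file` (capped at `num_max_proposals=300`). A minimal sketch of writing such a file, assuming the usual mmdetection convention of one `(n, 4)` or `(n, 5)` float array of `[x1, y1, x2, y2(, score)]` per image, in the same order as the annotation file:

```python
import mmcv
import numpy as np

num_images = 5  # stands in for len(dataset); illustrative only
# One proposal array per image; the optional 5th column is a score.
proposals = [
    np.array([[10., 20., 100., 200., 0.98]], dtype=np.float32)
    for _ in range(num_images)
]
mmcv.dump(proposals, 'data/coco/proposals/ga_rpn_r50_fpn_1x_train2017.pkl')
```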
diff --git a/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py
new file mode 100644
index 00000000..0b9f7254
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_r50_caffe_fpn_1x.py
@@ -0,0 +1,193 @@
+# model settings
+model = dict(
+    type='FasterRCNN',
+    pretrained='open-mmlab://resnet50_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        octave_base_scale=8,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[0.07, 0.07, 0.14, 0.14],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[0.07, 0.07, 0.11, 0.11],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCBBoxHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.05, 0.05, 0.1, 0.1],
+        reg_class_agnostic=False,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        allowed_border=-1,
+        pos_weight=-1,
+        center_ratio=0.2,
+        ignore_ratio=0.5,
+        debug=False),
+    rpn_proposal=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=300,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.6,
+            min_pos_iou=0.6,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.25,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=True),
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=1000,
+        nms_post=1000,
+        max_num=300,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_faster_rcnn_r50_caffe_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
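
In the `rpn_head` above, `octave_base_scale=8` and `scales_per_octave=3` define the scales of the approx anchors used when assigning shape targets, while Guided Anchoring itself predicts a single anchor shape per location. A quick sketch of the octave convention these settings imply (assuming the usual `2**(i / n)` spacing):

```python
octave_base_scale, scales_per_octave = 8, 3
scales = [octave_base_scale * 2**(i / scales_per_octave)
          for i in range(scales_per_octave)]
print([round(s, 2) for s in scales])  # [8.0, 10.08, 12.7]
# Combined with octave_ratios=[0.5, 1.0, 2.0], this gives 9 approx
# anchors per feature map location.
```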
diff --git a/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py
new file mode 100644
index 00000000..dabdf6c9
--- /dev/null
+++ b/configs/guided_anchoring/ga_faster_x101_32x4d_fpn_1x.py
@@ -0,0 +1,193 @@
+# model settings
+model = dict(
+    type='FasterRCNN',
+    pretrained='open-mmlab://resnext101_32x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        octave_base_scale=8,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[0.07, 0.07, 0.14, 0.14],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[0.07, 0.07, 0.11, 0.11],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)),
+    bbox_roi_extractor=dict(
+        type='SingleRoIExtractor',
+        roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2),
+        out_channels=256,
+        featmap_strides=[4, 8, 16, 32]),
+    bbox_head=dict(
+        type='SharedFCBBoxHead',
+        num_fcs=2,
+        in_channels=256,
+        fc_out_channels=1024,
+        roi_feat_size=7,
+        num_classes=81,
+        target_means=[0., 0., 0., 0.],
+        target_stds=[0.05, 0.05, 0.1, 0.1],
+        reg_class_agnostic=False,
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        allowed_border=-1,
+        pos_weight=-1,
+        center_ratio=0.2,
+        ignore_ratio=0.5,
+        debug=False),
+    rpn_proposal=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=300,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.6,
+            neg_iou_thr=0.6,
+            min_pos_iou=0.6,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.25,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=True),
+        pos_weight=-1,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=1000,
+        nms_post=1000,
+        max_num=300,
+        nms_thr=0.7,
+        min_bbox_size=0),
+    rcnn=dict(
+        score_thr=1e-3, nms=dict(type='nms', iou_thr=0.5), max_per_img=100))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=True,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_faster_rcnn_x101_32x4d_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py
new file mode 100644
index 00000000..63ba9e74
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_r50_caffe_fpn_1x.py
@@ -0,0 +1,152 @@
+# model settings
+model = dict(
+    type='RetinaNet',
+    pretrained='open-mmlab://resnet50_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5),
+    bbox_head=dict(
+        type='GARetinaHead',
+        num_classes=81,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        octave_base_scale=4,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[8, 16, 32, 64, 128],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[1.0, 1.0, 1.0, 1.0],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)))
+# training and testing settings
+train_cfg = dict(
+    ga_assigner=dict(
+        type='ApproxMaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0.4,
+        ignore_iof_thr=-1),
+    ga_sampler=dict(
+        type='RandomSampler',
+        num=256,
+        pos_fraction=0.5,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=False),
+    assigner=dict(
+        type='MaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        min_pos_iou=0.0,
+        ignore_iof_thr=-1),
+    allowed_border=-1,
+    pos_weight=-1,
+    center_ratio=0.2,
+    ignore_ratio=0.5,
+    debug=False)
+test_cfg = dict(
+    nms_pre=1000,
+    min_bbox_size=0,
+    score_thr=0.05,
+    nms=dict(type='nms', iou_thr=0.5),
+    max_per_img=100)
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+device_ids = range(8)
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_retinanet_r50_caffe_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
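
`loc_filter_thr=0.01` above controls test-time filtering: anchors are only kept at locations whose predicted objectness probability clears the threshold, which is what the masked convolution op added by this patch exploits. A rough sketch of the idea (not the exact head code; shapes are hypothetical):

```python
import torch

loc_pred = torch.randn(1, 100, 152)    # hypothetical per-level location logits
loc_mask = loc_pred.sigmoid() >= 0.01  # loc_filter_thr from this config
print(loc_mask.float().mean().item())  # fraction of locations kept
```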
diff --git a/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py
new file mode 100644
index 00000000..bd39bf12
--- /dev/null
+++ b/configs/guided_anchoring/ga_retinanet_x101_32x4d_fpn_1x.py
@@ -0,0 +1,152 @@
+# model settings
+model = dict(
+    type='RetinaNet',
+    pretrained='open-mmlab://resnext101_32x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        start_level=1,
+        add_extra_convs=True,
+        num_outs=5),
+    bbox_head=dict(
+        type='GARetinaHead',
+        num_classes=81,
+        in_channels=256,
+        stacked_convs=4,
+        feat_channels=256,
+        octave_base_scale=4,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[8, 16, 32, 64, 128],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[1.0, 1.0, 1.0, 1.0],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[1.0, 1.0, 1.0, 1.0],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=0.04, loss_weight=1.0)))
+# training and testing settings
+train_cfg = dict(
+    ga_assigner=dict(
+        type='ApproxMaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.4,
+        min_pos_iou=0.4,
+        ignore_iof_thr=-1),
+    ga_sampler=dict(
+        type='RandomSampler',
+        num=256,
+        pos_fraction=0.5,
+        neg_pos_ub=-1,
+        add_gt_as_proposals=False),
+    assigner=dict(
+        type='MaxIoUAssigner',
+        pos_iou_thr=0.5,
+        neg_iou_thr=0.5,
+        min_pos_iou=0.0,
+        ignore_iof_thr=-1),
+    allowed_border=-1,
+    pos_weight=-1,
+    center_ratio=0.2,
+    ignore_ratio=0.5,
+    debug=False)
+test_cfg = dict(
+    nms_pre=1000,
+    min_bbox_size=0,
+    score_thr=0.05,
+    nms=dict(type='nms', iou_thr=0.5),
+    max_per_img=100)
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=True),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=True),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+device_ids = range(8)
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_retinanet_x101_32x4d_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x.py b/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x.py
new file mode 100644
index 00000000..d3acf87e
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_r101_caffe_fpn_1x.py
@@ -0,0 +1,151 @@
+# model settings
+model = dict(
+    type='RPN',
+    pretrained='open-mmlab://resnet101_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=101,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        octave_base_scale=8,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[0.07, 0.07, 0.14, 0.14],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[0.07, 0.07, 0.11, 0.11],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        allowed_border=-1,
+        pos_weight=-1,
+        center_ratio=0.2,
+        ignore_ratio=0.5,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+# runner configs
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_rpn_r101_caffe_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py
new file mode 100644
index 00000000..cea9b76d
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_r50_caffe_fpn_1x.py
@@ -0,0 +1,151 @@
+# model settings
+model = dict(
+    type='RPN',
+    pretrained='open-mmlab://resnet50_caffe',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=False),
+        norm_eval=True,
+        style='caffe'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        octave_base_scale=8,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[0.07, 0.07, 0.14, 0.14],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[0.07, 0.07, 0.11, 0.11],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        allowed_border=-1,
+        pos_weight=-1,
+        center_ratio=0.2,
+        ignore_ratio=0.5,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[102.9801, 115.9465, 122.7717], std=[1.0, 1.0, 1.0], to_rgb=False)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+# runner configs
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_rpn_r50_caffe_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py
new file mode 100644
index 00000000..c0372544
--- /dev/null
+++ b/configs/guided_anchoring/ga_rpn_x101_32x4d_fpn_1x.py
@@ -0,0 +1,151 @@
+# model settings
+model = dict(
+    type='RPN',
+    pretrained='open-mmlab://resnext101_32x4d',
+    backbone=dict(
+        type='ResNeXt',
+        depth=101,
+        groups=32,
+        base_width=4,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='GARPNHead',
+        in_channels=256,
+        feat_channels=256,
+        octave_base_scale=8,
+        scales_per_octave=3,
+        octave_ratios=[0.5, 1.0, 2.0],
+        anchor_strides=[4, 8, 16, 32, 64],
+        anchor_base_sizes=None,
+        anchoring_means=[.0, .0, .0, .0],
+        anchoring_stds=[0.07, 0.07, 0.14, 0.14],
+        target_means=(.0, .0, .0, .0),
+        target_stds=[0.07, 0.07, 0.11, 0.11],
+        loc_filter_thr=0.01,
+        loss_loc=dict(
+            type='FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            loss_weight=1.0),
+        loss_shape=dict(
+            type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)))
+# model training and testing settings
+train_cfg = dict(
+    rpn=dict(
+        ga_assigner=dict(
+            type='ApproxMaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        ga_sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        assigner=dict(
+            type='MaxIoUAssigner',
+            pos_iou_thr=0.7,
+            neg_iou_thr=0.3,
+            min_pos_iou=0.3,
+            ignore_iof_thr=-1),
+        sampler=dict(
+            type='RandomSampler',
+            num=256,
+            pos_fraction=0.5,
+            neg_pos_ub=-1,
+            add_gt_as_proposals=False),
+        allowed_border=-1,
+        pos_weight=-1,
+        center_ratio=0.2,
+        ignore_ratio=0.5,
+        debug=False))
+test_cfg = dict(
+    rpn=dict(
+        nms_across_levels=False,
+        nms_pre=2000,
+        nms_post=2000,
+        max_num=2000,
+        nms_thr=0.7,
+        min_bbox_size=0))
+# dataset settings
+dataset_type = 'CocoDataset'
+data_root = 'data/coco/'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+data = dict(
+    imgs_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_train2017.json',
+        img_prefix=data_root + 'train2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0.5,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    val=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_crowd=False,
+        with_label=False),
+    test=dict(
+        type=dataset_type,
+        ann_file=data_root + 'annotations/instances_val2017.json',
+        img_prefix=data_root + 'val2017/',
+        img_scale=(1333, 800),
+        img_norm_cfg=img_norm_cfg,
+        size_divisor=32,
+        flip_ratio=0,
+        with_mask=False,
+        with_label=False,
+        test_mode=True))
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+# runner configs
+optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=1.0 / 3,
+    step=[8, 11])
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+# runtime settings
+total_epochs = 12
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = './work_dirs/ga_rpn_x101_32x4d_fpn_1x'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
diff --git a/mmdet/core/anchor/__init__.py b/mmdet/core/anchor/__init__.py
index 0ff430a4..304d4938 100644
--- a/mmdet/core/anchor/__init__.py
+++ b/mmdet/core/anchor/__init__.py
@@ -1,4 +1,8 @@
 from .anchor_generator import AnchorGenerator
-from .anchor_target import anchor_target
+from .anchor_target import anchor_target, anchor_inside_flags
+from .guided_anchor_target import ga_loc_target, ga_shape_target
 
-__all__ = ['AnchorGenerator', 'anchor_target']
+__all__ = [
+    'AnchorGenerator', 'anchor_target', 'anchor_inside_flags', 'ga_loc_target',
+    'ga_shape_target'
+]
diff --git a/mmdet/core/anchor/guided_anchor_target.py b/mmdet/core/anchor/guided_anchor_target.py
new file mode 100644
index 00000000..2e954064
--- /dev/null
+++ b/mmdet/core/anchor/guided_anchor_target.py
@@ -0,0 +1,285 @@
+import torch
+
+from ..bbox import build_assigner, build_sampler, PseudoSampler
+from ..utils import unmap, multi_apply
+
+
+def calc_region(bbox, ratio, featmap_size=None):
+    """Calculate a proportional bbox region.
+
+    The bbox center is kept fixed while the width and height shrink to
+    w' = (1 - 2 * ratio) * w and h' = (1 - 2 * ratio) * h.
+
+    Args:
+        bbox (Tensor): A single bbox in (x1, y1, x2, y2) format, shape (4, ).
+        ratio (float): Ratio of the output region.
+        featmap_size (tuple): Feature map size used for clipping the boundary.
+
+    Returns:
+        tuple: x1, y1, x2, y2
+    """
+    x1 = torch.round((1 - ratio) * bbox[0] + ratio * bbox[2]).long()
+    y1 = torch.round((1 - ratio) * bbox[1] + ratio * bbox[3]).long()
+    x2 = torch.round(ratio * bbox[0] + (1 - ratio) * bbox[2]).long()
+    y2 = torch.round(ratio * bbox[1] + (1 - ratio) * bbox[3]).long()
+    if featmap_size is not None:
+        x1 = x1.clamp(min=0, max=featmap_size[1] - 1)
+        y1 = y1.clamp(min=0, max=featmap_size[0] - 1)
+        x2 = x2.clamp(min=0, max=featmap_size[1] - 1)
+        y2 = y2.clamp(min=0, max=featmap_size[0] - 1)
+    return (x1, y1, x2, y2)
+
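+# Example (illustrative): ga_loc_target below passes
+# ratio = (1 - center_ratio) / 2, so center_ratio=0.2 keeps the central
+# 20% of each side:
+#   >>> bbox = torch.tensor([30., 30., 70., 70.])
+#   >>> calc_region(bbox, (1 - 0.2) / 2)
+#   (tensor(46), tensor(46), tensor(54), tensor(54))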
+
+def ga_loc_target(gt_bboxes_list,
+                  featmap_sizes,
+                  anchor_scale,
+                  anchor_strides,
+                  center_ratio=0.2,
+                  ignore_ratio=0.5):
+    """Compute location targets for guided anchoring.
+
+    Each feature map is divided into positive, negative and ignore regions.
+    - positive regions: target 1, weight 1
+    - ignore regions: target 0, weight 0
+    - negative regions: target 0, weight 0.1
+
+    Args:
+        gt_bboxes_list (list[Tensor]): Gt bboxes of each image.
+        featmap_sizes (list[tuple]): Sizes of the multi-level feature maps.
+        anchor_scale (int): Anchor scale.
+        anchor_strides (list[int]): Multi-level anchor strides.
+        center_ratio (float): Ratio of center region.
+        ignore_ratio (float): Ratio of ignore region.
+
+    Returns:
+        tuple: (all_loc_targets, all_loc_weights, loc_avg_factor)
+    """
+    img_per_gpu = len(gt_bboxes_list)
+    num_lvls = len(featmap_sizes)
+    r1 = (1 - center_ratio) / 2
+    r2 = (1 - ignore_ratio) / 2
+    all_loc_targets = []
+    all_loc_weights = []
+    all_ignore_map = []
+    for lvl_id in range(num_lvls):
+        h, w = featmap_sizes[lvl_id]
+        loc_targets = torch.zeros(img_per_gpu,
+                                  1,
+                                  h,
+                                  w,
+                                  device=gt_bboxes_list[0].device,
+                                  dtype=torch.float32)
+        loc_weights = torch.full_like(loc_targets, -1)
+        ignore_map = torch.zeros_like(loc_targets)
+        all_loc_targets.append(loc_targets)
+        all_loc_weights.append(loc_weights)
+        all_ignore_map.append(ignore_map)
+    for img_id in range(img_per_gpu):
+        gt_bboxes = gt_bboxes_list[img_id]
+        scale = torch.sqrt((gt_bboxes[:, 2] - gt_bboxes[:, 0] + 1) *
+                           (gt_bboxes[:, 3] - gt_bboxes[:, 1] + 1))
+        min_anchor_size = scale.new_full(
+            (1, ), float(anchor_scale * anchor_strides[0]))
+        # assign gt bboxes to different feature levels w.r.t. their scales
+        target_lvls = torch.floor(
+            torch.log2(scale) - torch.log2(min_anchor_size) + 0.5)
+        target_lvls = target_lvls.clamp(min=0, max=num_lvls - 1).long()
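+        # e.g. anchor_scale=8 with anchor_strides[0]=4 (as in the FPN
+        # configs above) gives min_anchor_size=32, so a gt of scale 64
+        # maps to level floor(log2(64 / 32) + 0.5) = 1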
+        for gt_id in range(gt_bboxes.size(0)):
+            lvl = target_lvls[gt_id].item()
+            # rescaled to corresponding feature map
+            gt_ = gt_bboxes[gt_id, :4] / anchor_strides[lvl]
+            # calculate ignore regions
+            ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                gt_, r2, featmap_sizes[lvl])
+            # calculate positive (center) regions
+            ctr_x1, ctr_y1, ctr_x2, ctr_y2 = calc_region(
+                gt_, r1, featmap_sizes[lvl])
+            all_loc_targets[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                 ctr_x1:ctr_x2 + 1] = 1
+            all_loc_weights[lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                 ignore_x1:ignore_x2 + 1] = 0
+            all_loc_weights[lvl][img_id, 0, ctr_y1:ctr_y2 + 1,
+                                 ctr_x1:ctr_x2 + 1] = 1
+            # calculate ignore map on nearby low level feature
+            if lvl > 0:
+                d_lvl = lvl - 1
+                # rescaled to corresponding feature map
+                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[d_lvl]
+                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                    gt_, r2, featmap_sizes[d_lvl])
+                all_ignore_map[d_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                      ignore_x1:ignore_x2 + 1] = 1
+            # calculate ignore map on nearby high level feature
+            if lvl < num_lvls - 1:
+                u_lvl = lvl + 1
+                # rescaled to corresponding feature map
+                gt_ = gt_bboxes[gt_id, :4] / anchor_strides[u_lvl]
+                ignore_x1, ignore_y1, ignore_x2, ignore_y2 = calc_region(
+                    gt_, r2, featmap_sizes[u_lvl])
+                all_ignore_map[u_lvl][img_id, 0, ignore_y1:ignore_y2 + 1,
+                                      ignore_x1:ignore_x2 + 1] = 1
+    for lvl_id in range(num_lvls):
+        # ignore negative regions w.r.t. ignore map
+        all_loc_weights[lvl_id][(all_loc_weights[lvl_id] < 0)
+                                & (all_ignore_map[lvl_id] > 0)] = 0
+        # set negative regions with weight 0.1
+        all_loc_weights[lvl_id][all_loc_weights[lvl_id] < 0] = 0.1
+    # loc average factor to balance loss; the divisor 200 is an empirical
+    # normalization constant
+    loc_avg_factor = sum(
+        [t.size(0) * t.size(-1) * t.size(-2) for t in all_loc_targets]) / 200
+    return all_loc_targets, all_loc_weights, loc_avg_factor
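+# Usage sketch (shapes assumed purely for illustration): for two levels of
+# size (100, 152) and (50, 76) with strides [4, 8] and octave_base_scale 8,
+#   targets, weights, avg_factor = ga_loc_target(
+#       gt_bboxes_list, [(100, 152), (50, 76)], 8, [4, 8])
+# returns per-level (num_imgs, 1, H, W) target and weight maps.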
+
+
+def ga_shape_target(approx_list,
+                    inside_flag_list,
+                    square_list,
+                    gt_bboxes_list,
+                    img_metas,
+                    approxs_per_octave,
+                    cfg,
+                    gt_bboxes_ignore_list=None,
+                    sampling=True,
+                    unmap_outputs=True):
+    """Compute guided anchoring targets.
+
+    Args:
+        approx_list (list[list]): Multi level approxs of each image.
+        inside_flag_list (list[list]): Multi level inside flags of each image.
+        square_list (list[list]): Multi level squares of each image.
+        gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image.
+        img_metas (list[dict]): Meta info of each image.
+        approxs_per_octave (int): Number of approxs per octave.
+        cfg (dict): RPN train configs.
+        gt_bboxes_ignore_list (list[Tensor]): Gt bboxes to be ignored of
+            each image.
+        sampling (bool): Whether to sample anchors.
+        unmap_outputs (bool): Whether to map outputs back to the original
+            set of anchors.
+
+    Returns:
+        tuple: bbox anchors, gts and weights of all levels, plus the total
+            numbers of sampled positive and negative anchors.
+    """
+    num_imgs = len(img_metas)
+    assert len(approx_list) == len(inside_flag_list) == len(
+        square_list) == num_imgs
+    # square (base anchor) numbers of multi levels
+    num_level_squares = [squares.size(0) for squares in square_list[0]]
+    # concat multi level approxs, squares and flags of each image
+    inside_flag_flat_list = []
+    approx_flat_list = []
+    square_flat_list = []
+    for i in range(num_imgs):
+        assert len(square_list[i]) == len(inside_flag_list[i])
+        inside_flag_flat_list.append(torch.cat(inside_flag_list[i]))
+        approx_flat_list.append(torch.cat(approx_list[i]))
+        square_flat_list.append(torch.cat(square_list[i]))
+
+    # compute targets for each image
+    if gt_bboxes_ignore_list is None:
+        gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
+    (all_bbox_anchors, all_bbox_gts, all_bbox_weights, pos_inds_list,
+     neg_inds_list) = multi_apply(ga_shape_target_single,
+                                  approx_flat_list,
+                                  inside_flag_flat_list,
+                                  square_flat_list,
+                                  gt_bboxes_list,
+                                  gt_bboxes_ignore_list,
+                                  img_metas,
+                                  approxs_per_octave=approxs_per_octave,
+                                  cfg=cfg,
+                                  sampling=sampling,
+                                  unmap_outputs=unmap_outputs)
+    # no valid anchors
+    if any([bbox_anchors is None for bbox_anchors in all_bbox_anchors]):
+        return None
+    # sampled anchors of all images
+    num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
+    num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
+    # split targets to a list w.r.t. multiple levels
+    bbox_anchors_list = images_to_levels(all_bbox_anchors, num_level_squares)
+    bbox_gts_list = images_to_levels(all_bbox_gts, num_level_squares)
+    bbox_weights_list = images_to_levels(all_bbox_weights, num_level_squares)
+    return (bbox_anchors_list, bbox_gts_list, bbox_weights_list, num_total_pos,
+            num_total_neg)
+
+
+def images_to_levels(target, num_level_anchors):
+    """Convert targets by image to targets by feature level.
+
+    [target_img0, target_img1] -> [target_level0, target_level1, ...]
+    """
+    target = torch.stack(target, 0)
+    level_targets = []
+    start = 0
+    for n in num_level_anchors:
+        end = start + n
+        level_targets.append(target[:, start:end].squeeze(0))
+        start = end
+    return level_targets
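+# e.g. with num_level_anchors = [4, 2] and two images of 6 anchors each
+# (illustrative numbers), the stacked (2, 6, ...) tensor is split into
+# per-level tensors of shape (2, 4, ...) and (2, 2, ...).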
+
+
+def ga_shape_target_single(flat_approxs,
+                           inside_flags,
+                           flat_squares,
+                           gt_bboxes,
+                           gt_bboxes_ignore,
+                           img_meta,
+                           approxs_per_octave,
+                           cfg,
+                           sampling=True,
+                           unmap_outputs=True):
+    """Compute guided anchoring targets.
+
+    This function returns sampled anchors and gt bboxes directly
+    rather than computing regression targets.
+
+    Args:
+        flat_approxs (Tensor): flat approxs of a single image,
+            shape (approxs_per_octave * n, 4)
+        inside_flags (Tensor): inside flags of a single image,
+            shape (n, ).
+        flat_squares (Tensor): flat squares of a single image,
+            shape (n, 4)
+        gt_bboxes (Tensor): Ground truth bboxes of a single image.
+        gt_bboxes_ignore (Tensor): Gt bboxes to be ignored of a single image.
+        img_meta (dict): Meta info of a single image.
+        approxs_per_octave (int): Number of approxs per octave.
+        cfg (dict): RPN train configs.
+        sampling (bool): Whether to sample anchors.
+        unmap_outputs (bool): Whether to map outputs back to the original
+            set of anchors.
+
+    Returns:
+        tuple: bbox anchors, gts and weights, plus the indices of sampled
+            positive and negative anchors.
+    """
+    if not inside_flags.any():
+        return (None, ) * 5
+    # assign gt and sample anchors
+    expand_inside_flags = inside_flags[:, None].expand(
+        -1, approxs_per_octave).reshape(-1)
+    approxs = flat_approxs[expand_inside_flags, :]
+    squares = flat_squares[inside_flags, :]
+
+    bbox_assigner = build_assigner(cfg.ga_assigner)
+    assign_result = bbox_assigner.assign(approxs, squares, approxs_per_octave,
+                                         gt_bboxes, gt_bboxes_ignore)
+    if sampling:
+        bbox_sampler = build_sampler(cfg.ga_sampler)
+    else:
+        bbox_sampler = PseudoSampler()
+    sampling_result = bbox_sampler.sample(assign_result, squares, gt_bboxes)
+
+    bbox_anchors = torch.zeros_like(squares)
+    bbox_gts = torch.zeros_like(squares)
+    bbox_weights = torch.zeros_like(squares)
+
+    pos_inds = sampling_result.pos_inds
+    neg_inds = sampling_result.neg_inds
+    if len(pos_inds) > 0:
+        bbox_anchors[pos_inds, :] = sampling_result.pos_bboxes
+        bbox_gts[pos_inds, :] = sampling_result.pos_gt_bboxes
+        bbox_weights[pos_inds, :] = 1.0
+
+    # map up to original set of anchors
+    if unmap_outputs:
+        num_total_anchors = flat_squares.size(0)
+        bbox_anchors = unmap(bbox_anchors, num_total_anchors, inside_flags)
+        bbox_gts = unmap(bbox_gts, num_total_anchors, inside_flags)
+        bbox_weights = unmap(bbox_weights, num_total_anchors, inside_flags)
+
+    return (bbox_anchors, bbox_gts, bbox_weights, pos_inds, neg_inds)
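+
+
+# Pipeline sketch (illustrative thresholds, mirroring the code above): for a
+# single image without a ga_sampler in cfg (sampling=False),
+#   assign_result = ApproxMaxIoUAssigner(0.7, 0.3).assign(
+#       approxs, squares, approxs_per_octave, gt_bboxes)
+#   sampling_result = PseudoSampler().sample(assign_result, squares, gt_bboxes)
+# yields the positive/negative indices used to fill the shape targets.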
diff --git a/mmdet/core/bbox/assigners/__init__.py b/mmdet/core/bbox/assigners/__init__.py
index 40a89e9d..fafa3fa0 100644
--- a/mmdet/core/bbox/assigners/__init__.py
+++ b/mmdet/core/bbox/assigners/__init__.py
@@ -1,5 +1,8 @@
 from .base_assigner import BaseAssigner
 from .max_iou_assigner import MaxIoUAssigner
+from .approx_max_iou_assigner import ApproxMaxIoUAssigner
 from .assign_result import AssignResult
 
-__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult']
+__all__ = [
+    'BaseAssigner', 'MaxIoUAssigner', 'ApproxMaxIoUAssigner', 'AssignResult'
+]
diff --git a/mmdet/core/bbox/assigners/approx_max_iou_assigner.py b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py
new file mode 100644
index 00000000..1283f7f5
--- /dev/null
+++ b/mmdet/core/bbox/assigners/approx_max_iou_assigner.py
@@ -0,0 +1,116 @@
+import torch
+
+from .max_iou_assigner import MaxIoUAssigner
+from ..geometry import bbox_overlaps
+
+
+class ApproxMaxIoUAssigner(MaxIoUAssigner):
+    """Assign a corresponding gt bbox or background to each bbox.
+
+    Each proposal will be assigned with `-1`, `0`, or a positive integer
+    indicating the ground truth index.
+
+    - -1: don't care
+    - 0: negative sample, no assigned gt
+    - positive integer: positive sample, index (1-based) of assigned gt
+
+    Args:
+        pos_iou_thr (float): IoU threshold for positive bboxes.
+        neg_iou_thr (float or tuple): IoU threshold for negative bboxes.
+        min_pos_iou (float): Minimum iou for a bbox to be considered as a
+            positive bbox. Positive samples can have smaller IoU than
+            pos_iou_thr due to the 4th step (assign max IoU sample to each gt).
+        gt_max_assign_all (bool): Whether to assign all bboxes with the same
+            highest overlap with some gt to that gt.
+        ignore_iof_thr (float): IoF threshold for ignoring bboxes (if
+            `gt_bboxes_ignore` is specified). Negative values mean not
+            ignoring any bboxes.
+        ignore_wrt_candidates (bool): Whether to compute the iof between
+            `bboxes` and `gt_bboxes_ignore`, or the contrary.
+    """
+
+    def __init__(self,
+                 pos_iou_thr,
+                 neg_iou_thr,
+                 min_pos_iou=.0,
+                 gt_max_assign_all=True,
+                 ignore_iof_thr=-1,
+                 ignore_wrt_candidates=True):
+        self.pos_iou_thr = pos_iou_thr
+        self.neg_iou_thr = neg_iou_thr
+        self.min_pos_iou = min_pos_iou
+        self.gt_max_assign_all = gt_max_assign_all
+        self.ignore_iof_thr = ignore_iof_thr
+        self.ignore_wrt_candidates = ignore_wrt_candidates
+
+    def assign(self,
+               approxs,
+               squares,
+               approxs_per_octave,
+               gt_bboxes,
+               gt_bboxes_ignore=None,
+               gt_labels=None):
+        """Assign gt to approxs.
+
+        This method assigns a gt bbox to each group of approxs (bboxes);
+        each group of approxs is represented by a base approx (bbox) and
+        will be assigned with -1, 0, or a positive number.
+        -1 means don't care, 0 means negative sample,
+        a positive number is the index (1-based) of the assigned gt.
+        The assignment is done in the following steps, and the order matters.
+
+        1. assign every bbox to -1
+        2. use the max IoU of each group of approxs to assign
+        3. assign proposals whose iou with all gts < neg_iou_thr to 0
+        4. for each bbox, if the iou with its nearest gt >= pos_iou_thr,
+           assign it to that gt
+        5. for each gt bbox, assign its nearest proposals (may be more than
+           one) to itself
+
+        Args:
+            approxs (Tensor): Bounding boxes to be assigned,
+                shape (approxs_per_octave * n, 4).
+            squares (Tensor): Base bounding boxes to be assigned,
+                shape (n, 4).
+            approxs_per_octave (int): Number of approxs per octave.
+            gt_bboxes (Tensor): Groundtruth boxes, shape (k, 4).
+            gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+                labelled as `ignored`, e.g., crowd boxes in COCO.
+            gt_labels (Tensor, optional): Label of gt_bboxes, shape (k, ).
+
+        Returns:
+            :obj:`AssignResult`: The assign result.
+        """
+
+        if squares.shape[0] == 0 or gt_bboxes.shape[0] == 0:
+            raise ValueError('No gt or approxs')
+        num_squares = squares.size(0)
+        num_gts = gt_bboxes.size(0)
+        # re-organize anchors by approxs_per_octave x num_squares
+        approxs = torch.transpose(
+            approxs.view(num_squares, approxs_per_octave, 4), 0,
+            1).contiguous().view(-1, 4)
+        all_overlaps = bbox_overlaps(approxs, gt_bboxes)
+
+        overlaps, _ = all_overlaps.view(approxs_per_octave, num_squares,
+                                        num_gts).max(dim=0)
+        overlaps = torch.transpose(overlaps, 0, 1)
+
+        bboxes = squares[:, :4]
+
+        if (self.ignore_iof_thr > 0) and (gt_bboxes_ignore is not None) and (
+                gt_bboxes_ignore.numel() > 0):
+            if self.ignore_wrt_candidates:
+                ignore_overlaps = bbox_overlaps(bboxes,
+                                                gt_bboxes_ignore,
+                                                mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=1)
+            else:
+                ignore_overlaps = bbox_overlaps(gt_bboxes_ignore,
+                                                bboxes,
+                                                mode='iof')
+                ignore_max_overlaps, _ = ignore_overlaps.max(dim=0)
+            overlaps[:, ignore_max_overlaps > self.ignore_iof_thr] = -1
+
+        assign_result = self.assign_wrt_overlaps(overlaps, gt_labels)
+        return assign_result
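+
+
+# Grouping sketch: the (n * approxs_per_octave, 4) approxs are laid out with
+# each square's approxs contiguous, so the view/transpose above regroups them
+# such that max(dim=0) picks, for every square, the best IoU among its approxs
+# w.r.t. each gt; assignment then proceeds as in MaxIoUAssigner.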
diff --git a/mmdet/core/loss/__init__.py b/mmdet/core/loss/__init__.py
index 88805187..c73b2211 100644
--- a/mmdet/core/loss/__init__.py
+++ b/mmdet/core/loss/__init__.py
@@ -1,12 +1,13 @@
-from .losses import (
-    weighted_nll_loss, weighted_cross_entropy, weighted_binary_cross_entropy,
-    sigmoid_focal_loss, py_sigmoid_focal_loss, weighted_sigmoid_focal_loss,
-    mask_cross_entropy, smooth_l1_loss, weighted_smoothl1, accuracy, iou_loss)
+from .losses import (weighted_nll_loss, weighted_cross_entropy,
+                     weighted_binary_cross_entropy, sigmoid_focal_loss,
+                     py_sigmoid_focal_loss, weighted_sigmoid_focal_loss,
+                     mask_cross_entropy, smooth_l1_loss, weighted_smoothl1,
+                     bounded_iou_loss, weighted_iou_loss, iou_loss, accuracy)
 
 __all__ = [
     'weighted_nll_loss', 'weighted_cross_entropy',
     'weighted_binary_cross_entropy', 'sigmoid_focal_loss',
     'py_sigmoid_focal_loss', 'weighted_sigmoid_focal_loss',
-    'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1', 'accuracy',
-    'iou_loss'
+    'mask_cross_entropy', 'smooth_l1_loss', 'weighted_smoothl1',
+    'bounded_iou_loss', 'weighted_iou_loss', 'iou_loss', 'accuracy'
 ]
diff --git a/mmdet/core/loss/losses.py b/mmdet/core/loss/losses.py
index e541ec47..6bb09544 100644
--- a/mmdet/core/loss/losses.py
+++ b/mmdet/core/loss/losses.py
@@ -44,8 +44,8 @@ def py_sigmoid_focal_loss(pred,
     pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
     weight = (alpha * target + (1 - alpha) * (1 - target)) * weight
     weight = weight * pt.pow(gamma)
-    loss = F.binary_cross_entropy_with_logits(
-        pred, target, reduction='none') * weight
+    loss = F.binary_cross_entropy_with_logits(pred, target,
+                                              reduction='none') * weight
     reduction_enum = F._Reduction.get_enum(reduction)
     # none: 0, mean:1, sum: 2
     if reduction_enum == 0:
@@ -66,16 +66,17 @@ def weighted_sigmoid_focal_loss(pred,
     if avg_factor is None:
         avg_factor = torch.sum(weight > 0).float().item() / num_classes + 1e-6
     return torch.sum(
-        sigmoid_focal_loss(pred, target, gamma, alpha, 'none') * weight.view(
-            -1, 1))[None] / avg_factor
+        sigmoid_focal_loss(pred, target, gamma, alpha, 'none') *
+        weight.view(-1, 1))[None] / avg_factor
 
 
 def mask_cross_entropy(pred, target, label):
     num_rois = pred.size()[0]
     inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
     pred_slice = pred[inds, label].squeeze(1)
-    return F.binary_cross_entropy_with_logits(
-        pred_slice, target, reduction='mean')[None]
+    return F.binary_cross_entropy_with_logits(pred_slice,
+                                              target,
+                                              reduction='mean')[None]
 
 
 def smooth_l1_loss(pred, target, beta=1.0, reduction='mean'):
@@ -101,6 +102,85 @@ def weighted_smoothl1(pred, target, weight, beta=1.0, avg_factor=None):
     return torch.sum(loss * weight)[None] / avg_factor
 
 
+def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3, reduction='mean'):
+    """Improving Object Localization with Fitness NMS and Bounded IoU Loss,
+    https://arxiv.org/abs/1711.00164.
+
+    Args:
+        pred (Tensor): Predicted bboxes.
+        target (Tensor): Target bboxes.
+        beta (float): Beta parameter of the smooth l1 formulation.
+        eps (float): Epsilon to avoid NaN.
+        reduction (str): Reduction type.
+    """
+    pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
+    pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
+    pred_w = pred[:, 2] - pred[:, 0] + 1
+    pred_h = pred[:, 3] - pred[:, 1] + 1
+    with torch.no_grad():
+        target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
+        target_ctry = (target[:, 1] + target[:, 3]) * 0.5
+        target_w = target[:, 2] - target[:, 0] + 1
+        target_h = target[:, 3] - target[:, 1] + 1
+
+    dx = target_ctrx - pred_ctrx
+    dy = target_ctry - pred_ctry
+
+    loss_dx = 1 - torch.max(
+        (target_w - 2 * dx.abs()) /
+        (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx))
+    loss_dy = 1 - torch.max(
+        (target_h - 2 * dy.abs()) /
+        (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy))
+    loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w /
+                            (target_w + eps))
+    loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h /
+                            (target_h + eps))
+    loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh],
+                            dim=-1).view(loss_dx.size(0), -1)
+
+    loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
+                       loss_comb - 0.5 * beta)
+    reduction_enum = F._Reduction.get_enum(reduction)
+    # none: 0, mean:1, sum: 2
+    if reduction_enum == 0:
+        return loss
+    elif reduction_enum == 1:
+        return loss.sum() / pred.numel()
+    elif reduction_enum == 2:
+        return loss.sum()
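+# Sanity check (illustrative): when pred equals target, dx = dy = 0 and the
+# w/h ratios equal 1, so every term reduces to eps / (side + eps), i.e. the
+# loss approaches 0 as eps goes to 0.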
+
+
+def weighted_iou_loss(pred,
+                      target,
+                      weight,
+                      style='naive',
+                      beta=0.2,
+                      eps=1e-3,
+                      avg_factor=None):
+    if style not in ['bounded', 'naive']:
+        raise ValueError('Only support bounded iou loss and naive iou loss.')
+    inds = torch.nonzero(weight[:, 0] > 0)
+    if avg_factor is None:
+        avg_factor = inds.numel() + 1e-6
+
+    if inds.numel() > 0:
+        inds = inds.squeeze(1)
+    else:
+        return (pred * weight).sum()[None] / avg_factor
+
+    if style == 'bounded':
+        loss = bounded_iou_loss(pred[inds],
+                                target[inds],
+                                beta=beta,
+                                eps=eps,
+                                reduction='sum')
+    else:
+        loss = iou_loss(pred[inds], target[inds], reduction='sum')
+    loss = loss[None] / avg_factor
+    return loss
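+# Usage sketch (shapes assumed): pred and target are (n, 4) decoded bboxes and
+# weight is (n, 4) with positive rows set to 1, e.g.
+#   loss = weighted_iou_loss(pred, target, weight, style='bounded', beta=0.2)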
+
+
 def accuracy(pred, target, topk=1):
     if isinstance(topk, int):
         topk = (topk, )
@@ -125,8 +205,9 @@ def _expand_binary_labels(labels, label_weights, label_channels):
     inds = torch.nonzero(labels >= 1).squeeze()
     if inds.numel() > 0:
         bin_labels[inds, labels[inds] - 1] = 1
-    bin_label_weights = label_weights.view(-1, 1).expand(
-        label_weights.size(0), label_channels)
+    bin_label_weights = label_weights.view(-1,
+                                           1).expand(label_weights.size(0),
+                                                     label_channels)
     return bin_labels, bin_label_weights
 
 
diff --git a/mmdet/models/anchor_heads/__init__.py b/mmdet/models/anchor_heads/__init__.py
index 86877a24..798b1bca 100644
--- a/mmdet/models/anchor_heads/__init__.py
+++ b/mmdet/models/anchor_heads/__init__.py
@@ -1,7 +1,13 @@
 from .anchor_head import AnchorHead
+from .guided_anchor_head import GuidedAnchorHead, FeatureAdaption
 from .fcos_head import FCOSHead
-from .retina_head import RetinaHead
 from .rpn_head import RPNHead
+from .ga_rpn_head import GARPNHead
+from .retina_head import RetinaHead
+from .ga_retina_head import GARetinaHead
 from .ssd_head import SSDHead
 
-__all__ = ['AnchorHead', 'RPNHead', 'RetinaHead', 'SSDHead', 'FCOSHead']
+__all__ = [
+    'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', 'RPNHead',
+    'GARPNHead', 'RetinaHead', 'GARetinaHead', 'SSDHead', 'FCOSHead'
+]
diff --git a/mmdet/models/anchor_heads/ga_retina_head.py b/mmdet/models/anchor_heads/ga_retina_head.py
new file mode 100644
index 00000000..c39ab8d6
--- /dev/null
+++ b/mmdet/models/anchor_heads/ga_retina_head.py
@@ -0,0 +1,107 @@
+import torch.nn as nn
+from mmcv.cnn import normal_init
+
+from .guided_anchor_head import GuidedAnchorHead, FeatureAdaption
+from ..registry import HEADS
+from ..utils import bias_init_with_prob, ConvModule
+from mmdet.ops import MaskedConv2d
+
+
+@HEADS.register_module
+class GARetinaHead(GuidedAnchorHead):
+    """Guided-Anchor-based RetinaNet head."""
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 stacked_convs=4,
+                 conv_cfg=None,
+                 norm_cfg=None,
+                 **kwargs):
+        self.stacked_convs = stacked_convs
+        self.conv_cfg = conv_cfg
+        self.norm_cfg = norm_cfg
+        super(GARetinaHead, self).__init__(num_classes, in_channels, **kwargs)
+
+    def _init_layers(self):
+        self.relu = nn.ReLU(inplace=True)
+        self.cls_convs = nn.ModuleList()
+        self.reg_convs = nn.ModuleList()
+        for i in range(self.stacked_convs):
+            chn = self.in_channels if i == 0 else self.feat_channels
+            self.cls_convs.append(
+                ConvModule(chn,
+                           self.feat_channels,
+                           3,
+                           stride=1,
+                           padding=1,
+                           conv_cfg=self.conv_cfg,
+                           norm_cfg=self.norm_cfg))
+            self.reg_convs.append(
+                ConvModule(chn,
+                           self.feat_channels,
+                           3,
+                           stride=1,
+                           padding=1,
+                           conv_cfg=self.conv_cfg,
+                           norm_cfg=self.norm_cfg))
+
+        self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
+        self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2,
+                                    1)
+        self.feature_adaption_cls = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deformable_groups=self.deformable_groups)
+        self.feature_adaption_reg = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deformable_groups=self.deformable_groups)
+        self.retina_cls = MaskedConv2d(self.feat_channels,
+                                       self.num_anchors *
+                                       self.cls_out_channels,
+                                       3,
+                                       padding=1)
+        self.retina_reg = MaskedConv2d(self.feat_channels,
+                                       self.num_anchors * 4,
+                                       3,
+                                       padding=1)
+
+    def init_weights(self):
+        for m in self.cls_convs:
+            normal_init(m.conv, std=0.01)
+        for m in self.reg_convs:
+            normal_init(m.conv, std=0.01)
+
+        self.feature_adaption_cls.init_weights()
+        self.feature_adaption_reg.init_weights()
+
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.conv_loc, std=0.01, bias=bias_cls)
+        normal_init(self.conv_shape, std=0.01)
+        normal_init(self.retina_cls, std=0.01, bias=bias_cls)
+        normal_init(self.retina_reg, std=0.01)
+
+    def forward_single(self, x):
+        cls_feat = x
+        reg_feat = x
+        for cls_conv in self.cls_convs:
+            cls_feat = cls_conv(cls_feat)
+        for reg_conv in self.reg_convs:
+            reg_feat = reg_conv(reg_feat)
+
+        loc_pred = self.conv_loc(cls_feat)
+        shape_pred = self.conv_shape(reg_feat)
+
+        cls_feat = self.feature_adaption_cls(cls_feat, shape_pred)
+        reg_feat = self.feature_adaption_reg(reg_feat, shape_pred)
+
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        else:
+            mask = None
+        cls_score = self.retina_cls(cls_feat, mask)
+        bbox_pred = self.retina_reg(reg_feat, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
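+
+
+# Inference note (MaskedConv2d behavior assumed from mmdet.ops): at test time
+# only locations whose predicted objectness clears loc_filter_thr are
+# convolved, while during training mask is None and the convs run densely.
+# The loc_pred.sigmoid()[0] indexing assumes single-image (batch size 1)
+# testing.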
diff --git a/mmdet/models/anchor_heads/ga_rpn_head.py b/mmdet/models/anchor_heads/ga_rpn_head.py
new file mode 100644
index 00000000..b7788b6a
--- /dev/null
+++ b/mmdet/models/anchor_heads/ga_rpn_head.py
@@ -0,0 +1,127 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import normal_init
+
+from mmdet.core import delta2bbox
+from mmdet.ops import nms
+from .guided_anchor_head import GuidedAnchorHead
+from ..registry import HEADS
+
+
+@HEADS.register_module
+class GARPNHead(GuidedAnchorHead):
+    """Guided-Anchor-based RPN head."""
+
+    def __init__(self, in_channels, **kwargs):
+        super(GARPNHead, self).__init__(2, in_channels, **kwargs)
+
+    def _init_layers(self):
+        self.rpn_conv = nn.Conv2d(self.in_channels,
+                                  self.feat_channels,
+                                  3,
+                                  padding=1)
+        super(GARPNHead, self)._init_layers()
+
+    def init_weights(self):
+        normal_init(self.rpn_conv, std=0.01)
+        super(GARPNHead, self).init_weights()
+
+    def forward_single(self, x):
+        x = self.rpn_conv(x)
+        x = F.relu(x, inplace=True)
+        (cls_score, bbox_pred, shape_pred,
+         loc_pred) = super(GARPNHead, self).forward_single(x)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             shape_preds,
+             loc_preds,
+             gt_bboxes,
+             img_metas,
+             cfg,
+             gt_bboxes_ignore=None):
+        losses = super(GARPNHead, self).loss(cls_scores,
+                                             bbox_preds,
+                                             shape_preds,
+                                             loc_preds,
+                                             gt_bboxes,
+                                             None,
+                                             img_metas,
+                                             cfg,
+                                             gt_bboxes_ignore=gt_bboxes_ignore)
+        return dict(loss_rpn_cls=losses['loss_cls'],
+                    loss_rpn_bbox=losses['loss_bbox'],
+                    loss_anchor_shape=losses['loss_shape'],
+                    loss_anchor_loc=losses['loss_loc'])
+
+    def get_bboxes_single(self,
+                          cls_scores,
+                          bbox_preds,
+                          mlvl_anchors,
+                          mlvl_masks,
+                          img_shape,
+                          scale_factor,
+                          cfg,
+                          rescale=False):
+        mlvl_proposals = []
+        for idx in range(len(cls_scores)):
+            rpn_cls_score = cls_scores[idx]
+            rpn_bbox_pred = bbox_preds[idx]
+            anchors = mlvl_anchors[idx]
+            mask = mlvl_masks[idx]
+            assert rpn_cls_score.size()[-2:] == rpn_bbox_pred.size()[-2:]
+            # if no location is kept, end.
+            if mask.sum() == 0:
+                continue
+            rpn_cls_score = rpn_cls_score.permute(1, 2, 0)
+            if self.use_sigmoid_cls:
+                rpn_cls_score = rpn_cls_score.reshape(-1)
+                scores = rpn_cls_score.sigmoid()
+            else:
+                rpn_cls_score = rpn_cls_score.reshape(-1, 2)
+                scores = rpn_cls_score.softmax(dim=1)[:, 1]
+            # filter scores, bbox_pred w.r.t. mask.
+            # anchors are filtered in get_anchors() beforehand.
+            scores = scores[mask]
+            rpn_bbox_pred = rpn_bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            rpn_bbox_pred = rpn_bbox_pred[mask, :]
+            if scores.dim() == 0:
+                rpn_bbox_pred = rpn_bbox_pred.unsqueeze(0)
+                anchors = anchors.unsqueeze(0)
+                scores = scores.unsqueeze(0)
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            if cfg.nms_pre > 0 and scores.shape[0] > cfg.nms_pre:
+                _, topk_inds = scores.topk(cfg.nms_pre)
+                rpn_bbox_pred = rpn_bbox_pred[topk_inds, :]
+                anchors = anchors[topk_inds, :]
+                scores = scores[topk_inds]
+            # get proposals w.r.t. anchors and rpn_bbox_pred
+            proposals = delta2bbox(anchors, rpn_bbox_pred, self.target_means,
+                                   self.target_stds, img_shape)
+            # filter out too small bboxes
+            if cfg.min_bbox_size > 0:
+                w = proposals[:, 2] - proposals[:, 0] + 1
+                h = proposals[:, 3] - proposals[:, 1] + 1
+                valid_inds = torch.nonzero((w >= cfg.min_bbox_size) &
+                                           (h >= cfg.min_bbox_size)).squeeze()
+                proposals = proposals[valid_inds, :]
+                scores = scores[valid_inds]
+            proposals = torch.cat([proposals, scores.unsqueeze(-1)], dim=-1)
+            # NMS in current level
+            proposals, _ = nms(proposals, cfg.nms_thr)
+            proposals = proposals[:cfg.nms_post, :]
+            mlvl_proposals.append(proposals)
+        proposals = torch.cat(mlvl_proposals, 0)
+        if cfg.nms_across_levels:
+            # NMS across multi levels
+            proposals, _ = nms(proposals, cfg.nms_thr)
+            proposals = proposals[:cfg.max_num, :]
+        else:
+            scores = proposals[:, 4]
+            num = min(cfg.max_num, proposals.shape[0])
+            _, topk_inds = scores.topk(num)
+            proposals = proposals[topk_inds, :]
+        return proposals
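+
+
+# Test-cfg sketch (values illustrative; the keys are exactly the ones read
+# above):
+#   cfg = dict(nms_across_levels=False, nms_pre=2000, nms_post=2000,
+#              max_num=300, nms_thr=0.7, min_bbox_size=0)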
diff --git a/mmdet/models/anchor_heads/guided_anchor_head.py b/mmdet/models/anchor_heads/guided_anchor_head.py
new file mode 100644
index 00000000..da43aa81
--- /dev/null
+++ b/mmdet/models/anchor_heads/guided_anchor_head.py
@@ -0,0 +1,589 @@
+from __future__ import division
+
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.cnn import normal_init
+
+from mmdet.core import (AnchorGenerator, anchor_target, anchor_inside_flags,
+                        ga_loc_target, ga_shape_target, delta2bbox,
+                        multi_apply, multiclass_nms)
+from mmdet.ops import DeformConv, MaskedConv2d
+from ..builder import build_loss
+from .anchor_head import AnchorHead
+from ..registry import HEADS
+from ..utils import bias_init_with_prob
+
+
+class FeatureAdaption(nn.Module):
+    """Feature Adaption Module.
+
+    Feature Adaption Module is implemented based on DCN v1.
+    It uses anchor shape prediction rather than feature map to
+    predict offsets of deformable conv layer.
+
+    Args:
+        in_channels (int): Number of channels in the input feature map.
+        out_channels (int): Number of channels in the output feature map.
+        kernel_size (int): Deformable conv kernel size.
+        deformable_groups (int): Deformable conv group size.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 deformable_groups=4):
+        super(FeatureAdaption, self).__init__()
+        offset_channels = kernel_size * kernel_size * 2
+        self.conv_offset = nn.Conv2d(2,
+                                     deformable_groups * offset_channels,
+                                     1,
+                                     bias=False)
+        self.conv_adaption = DeformConv(in_channels,
+                                        out_channels,
+                                        kernel_size=kernel_size,
+                                        padding=(kernel_size - 1) // 2,
+                                        deformable_groups=deformable_groups)
+        self.relu = nn.ReLU(inplace=True)
+
+    def init_weights(self):
+        normal_init(self.conv_offset, std=0.1)
+        normal_init(self.conv_adaption, std=0.01)
+
+    def forward(self, x, shape):
+        offset = self.conv_offset(shape.detach())
+        x = self.relu(self.conv_adaption(x, offset))
+        return x
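+# Shape note: `shape` is the (N, 2, H, W) anchor w/h prediction from
+# conv_shape; conv_offset maps it to offsets of shape
+# (N, deformable_groups * 2 * kernel_size**2, H, W) for DeformConv.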
+
+
+@HEADS.register_module
+class GuidedAnchorHead(AnchorHead):
+    """Guided-Anchor-based head (GA-RPN, GA-RetinaNet, etc.).
+
+    This GuidedAnchorHead will predict high-quality feature guided
+    anchors and locations where anchors will be kept in inference.
+    There are mainly 3 categories of bounding boxes.
+
+    - Sampled (9) pairs for target assignment (approxs).
+    - The square boxes that the predicted anchors are based on (squares).
+    - Guided anchors.
+
+    Please refer to https://arxiv.org/abs/1901.03278 for more details.
+
+    Args:
+        num_classes (int): Number of classes.
+        in_channels (int): Number of channels in the input feature map.
+        feat_channels (int): Number of channels of the feature map.
+        octave_base_scale (int): Base octave scale of each level of
+            feature map.
+        scales_per_octave (int): Number of octave scales in each level of
+            feature map.
+        octave_ratios (Iterable): Octave aspect ratios.
+        anchor_strides (Iterable): Anchor strides.
+        anchor_base_sizes (Iterable): Anchor base sizes.
+        anchoring_means (Iterable): Mean values of anchoring targets.
+        anchoring_stds (Iterable): Std values of anchoring targets.
+        target_means (Iterable): Mean values of regression targets.
+        target_stds (Iterable): Std values of regression targets.
+        deformable_groups (int): Group number of DCN in
+            FeatureAdaption module.
+        loc_filter_thr (float): Threshold to filter out unconcerned regions.
+        loss_loc (dict): Config of location loss.
+        loss_shape (dict): Config of anchor shape loss.
+        loss_cls (dict): Config of classification loss.
+        loss_bbox (dict): Config of bbox regression loss.
+    """
+
+    def __init__(self,
+                 num_classes,
+                 in_channels,
+                 feat_channels=256,
+                 octave_base_scale=8,
+                 scales_per_octave=3,
+                 octave_ratios=[0.5, 1.0, 2.0],
+                 anchor_strides=[4, 8, 16, 32, 64],
+                 anchor_base_sizes=None,
+                 anchoring_means=(.0, .0, .0, .0),
+                 anchoring_stds=(1.0, 1.0, 1.0, 1.0),
+                 target_means=(.0, .0, .0, .0),
+                 target_stds=(1.0, 1.0, 1.0, 1.0),
+                 deformable_groups=4,
+                 loc_filter_thr=0.01,
+                 loss_loc=dict(type='FocalLoss',
+                               use_sigmoid=True,
+                               gamma=2.0,
+                               alpha=0.25,
+                               loss_weight=1.0),
+                 loss_shape=dict(type='IoULoss', beta=0.2, loss_weight=1.0),
+                 loss_cls=dict(type='CrossEntropyLoss',
+                               use_sigmoid=True,
+                               loss_weight=1.0),
+                 loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
+                                loss_weight=1.0)):
+        super(AnchorHead, self).__init__()
+        self.in_channels = in_channels
+        self.num_classes = num_classes
+        self.feat_channels = feat_channels
+        self.octave_base_scale = octave_base_scale
+        self.scales_per_octave = scales_per_octave
+        self.octave_scales = octave_base_scale * np.array(
+            [2**(i / scales_per_octave) for i in range(scales_per_octave)])
+        self.approxs_per_octave = len(self.octave_scales) * len(octave_ratios)
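+        # e.g. the defaults octave_base_scale=8, scales_per_octave=3 give
+        # scales of roughly [8.0, 10.08, 12.70], hence 3 scales x 3 ratios
+        # = 9 approxs per location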
+        self.octave_ratios = octave_ratios
+        self.anchor_strides = anchor_strides
+        self.anchor_base_sizes = list(
+            anchor_strides) if anchor_base_sizes is None else anchor_base_sizes
+        self.anchoring_means = anchoring_means
+        self.anchoring_stds = anchoring_stds
+        self.target_means = target_means
+        self.target_stds = target_stds
+        self.deformable_groups = deformable_groups
+        self.loc_filter_thr = loc_filter_thr
+        self.approx_generators = []
+        self.square_generators = []
+        for anchor_base in self.anchor_base_sizes:
+            # Generators for approxs
+            self.approx_generators.append(
+                AnchorGenerator(anchor_base, self.octave_scales,
+                                self.octave_ratios))
+            # Generators for squares
+            self.square_generators.append(
+                AnchorGenerator(anchor_base, [self.octave_base_scale], [1.0]))
+        # one anchor per location
+        self.num_anchors = 1
+        self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
+        self.cls_focal_loss = loss_cls['type'] in ['FocalLoss']
+        self.loc_focal_loss = loss_loc['type'] in ['FocalLoss']
+        if self.use_sigmoid_cls:
+            self.cls_out_channels = self.num_classes - 1
+        else:
+            self.cls_out_channels = self.num_classes
+
+        # build losses
+        self.loss_loc = build_loss(loss_loc)
+        self.loss_shape = build_loss(loss_shape)
+        self.loss_cls = build_loss(loss_cls)
+        self.loss_bbox = build_loss(loss_bbox)
+
+        self._init_layers()
+
+    def _init_layers(self):
+        self.relu = nn.ReLU(inplace=True)
+        self.conv_loc = nn.Conv2d(self.feat_channels, 1, 1)
+        self.conv_shape = nn.Conv2d(self.feat_channels, self.num_anchors * 2,
+                                    1)
+        self.feature_adaption = FeatureAdaption(
+            self.feat_channels,
+            self.feat_channels,
+            kernel_size=3,
+            deformable_groups=self.deformable_groups)
+        self.conv_cls = MaskedConv2d(self.feat_channels,
+                                     self.num_anchors * self.cls_out_channels,
+                                     1)
+        self.conv_reg = MaskedConv2d(self.feat_channels, self.num_anchors * 4,
+                                     1)
+
+    def init_weights(self):
+        normal_init(self.conv_cls, std=0.01)
+        normal_init(self.conv_reg, std=0.01)
+
+        bias_cls = bias_init_with_prob(0.01)
+        normal_init(self.conv_loc, std=0.01, bias=bias_cls)
+        normal_init(self.conv_shape, std=0.01)
+
+        self.feature_adaption.init_weights()
+
+    def forward_single(self, x):
+        loc_pred = self.conv_loc(x)
+        shape_pred = self.conv_shape(x)
+        x = self.feature_adaption(x, shape_pred)
+        # masked conv is only used during inference for speed-up
+        if not self.training:
+            mask = loc_pred.sigmoid()[0] >= self.loc_filter_thr
+        else:
+            mask = None
+        cls_score = self.conv_cls(x, mask)
+        bbox_pred = self.conv_reg(x, mask)
+        return cls_score, bbox_pred, shape_pred, loc_pred
+
+    def forward(self, feats):
+        return multi_apply(self.forward_single, feats)
+
+    def get_sampled_approxs(self, featmap_sizes, img_metas, cfg):
+        """Get sampled approxs and inside flags according to feature map sizes.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            img_metas (list[dict]): Image meta info.
+
+        Returns:
+            tuple: approxes of each image, inside flags of each image
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # approxs once
+        multi_level_approxs = []
+        for i in range(num_levels):
+            approxs = self.approx_generators[i].grid_anchors(
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_approxs.append(approxs)
+        approxs_list = [multi_level_approxs for _ in range(num_imgs)]
+
+        # for each image, we compute inside flags of multi level approxes
+        inside_flag_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_flags = []
+            multi_level_approxs = approxs_list[img_id]
+            for i in range(num_levels):
+                approxs = multi_level_approxs[i]
+                anchor_stride = self.anchor_strides[i]
+                feat_h, feat_w = featmap_sizes[i]
+                h, w, _ = img_meta['pad_shape']
+                valid_feat_h = min(int(np.ceil(h / anchor_stride)), feat_h)
+                valid_feat_w = min(int(np.ceil(w / anchor_stride)), feat_w)
+                flags = self.approx_generators[i].valid_flags(
+                    (feat_h, feat_w), (valid_feat_h, valid_feat_w))
+                inside_flags_list = []
+                for j in range(self.approxs_per_octave):
+                    split_valid_flags = flags[j::self.approxs_per_octave]
+                    split_approxs = approxs[j::self.approxs_per_octave, :]
+                    inside_flags = anchor_inside_flags(
+                        split_approxs, split_valid_flags,
+                        img_meta['img_shape'][:2], cfg.allowed_border)
+                    inside_flags_list.append(inside_flags)
+                # inside_flag for a position is true if any anchor in this
+                # position is true
+                inside_flags = (
+                    torch.stack(inside_flags_list, 0).sum(dim=0) > 0)
+                multi_level_flags.append(inside_flags)
+            inside_flag_list.append(multi_level_flags)
+        return approxs_list, inside_flag_list
+
+    def get_anchors(self,
+                    featmap_sizes,
+                    shape_preds,
+                    loc_preds,
+                    img_metas,
+                    use_loc_filter=False):
+        """Get squares according to feature map sizes and guided
+        anchors.
+
+        Args:
+            featmap_sizes (list[tuple]): Multi-level feature map sizes.
+            shape_preds (list[tensor]): Multi-level shape predictions.
+            loc_preds (list[tensor]): Multi-level location predictions.
+            img_metas (list[dict]): Image meta info.
+            use_loc_filter (bool): Use loc filter or not.
+
+        Returns:
+            tuple: square approxs of each image, guided anchors of each image,
+                loc masks of each image
+        """
+        num_imgs = len(img_metas)
+        num_levels = len(featmap_sizes)
+
+        # since feature map sizes of all images are the same, we only compute
+        # squares once
+        multi_level_squares = []
+        for i in range(num_levels):
+            squares = self.square_generators[i].grid_anchors(
+                featmap_sizes[i], self.anchor_strides[i])
+            multi_level_squares.append(squares)
+        squares_list = [multi_level_squares for _ in range(num_imgs)]
+
+        # for each image, we compute multi level guided anchors
+        guided_anchors_list = []
+        loc_mask_list = []
+        for img_id, img_meta in enumerate(img_metas):
+            multi_level_guided_anchors = []
+            multi_level_loc_mask = []
+            for i in range(num_levels):
+                squares = squares_list[img_id][i]
+                shape_pred = shape_preds[i][img_id]
+                loc_pred = loc_preds[i][img_id]
+                guided_anchors, loc_mask = self.get_guided_anchors_single(
+                    squares,
+                    shape_pred,
+                    loc_pred,
+                    use_loc_filter=use_loc_filter)
+                multi_level_guided_anchors.append(guided_anchors)
+                multi_level_loc_mask.append(loc_mask)
+            guided_anchors_list.append(multi_level_guided_anchors)
+            loc_mask_list.append(multi_level_loc_mask)
+        return squares_list, guided_anchors_list, loc_mask_list
+
+    def get_guided_anchors_single(self,
+                                  squares,
+                                  shape_pred,
+                                  loc_pred,
+                                  use_loc_filter=False):
+        """Get guided anchors and loc masks for a single level.
+
+        Args:
+            squares (Tensor): Squares of a single level.
+            shape_pred (Tensor): Shape predictions of a single level.
+            loc_pred (Tensor): Loc predictions of a single level.
+            use_loc_filter (bool): Whether to filter locations with
+                loc_filter_thr.
+
+        Returns:
+            tuple: guided anchors, location masks
+        """
+        # calculate location filtering mask
+        loc_pred = loc_pred.sigmoid().detach()
+        if use_loc_filter:
+            loc_mask = loc_pred >= self.loc_filter_thr
+        else:
+            loc_mask = loc_pred >= 0.0
+        mask = loc_mask.permute(1, 2, 0).expand(-1, -1, self.num_anchors)
+        mask = mask.contiguous().view(-1)
+        # calculate guided anchors
+        squares = squares[mask]
+        anchor_deltas = shape_pred.permute(1, 2, 0).contiguous().view(
+            -1, 2).detach()[mask]
+        bbox_deltas = anchor_deltas.new_full(squares.size(), 0)
+        bbox_deltas[:, 2:] = anchor_deltas
+        guided_anchors = delta2bbox(squares,
+                                    bbox_deltas,
+                                    self.anchoring_means,
+                                    self.anchoring_stds,
+                                    wh_ratio_clip=1e-6)
+        return guided_anchors, mask
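+
+    # Note on the delta trick in get_guided_anchors_single: only the (dw, dh)
+    # channels of bbox_deltas are filled, so delta2bbox keeps each square's
+    # center fixed and rescales its width/height multiplicatively (standard
+    # log-space delta encoding).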
+
+    def loss_shape_single(self, shape_pred, bbox_anchors, bbox_gts,
+                          anchor_weights, anchor_total_num):
+        shape_pred = shape_pred.permute(0, 2, 3, 1).contiguous().view(-1, 2)
+        bbox_anchors = bbox_anchors.contiguous().view(-1, 4)
+        bbox_gts = bbox_gts.contiguous().view(-1, 4)
+        anchor_weights = anchor_weights.contiguous().view(-1, 4)
+        bbox_deltas = bbox_anchors.new_full(bbox_anchors.size(), 0)
+        bbox_deltas[:, 2:] += shape_pred
+        # filter out negative samples to speed up the weighted iou loss
+        inds = torch.nonzero(anchor_weights[:, 0] > 0).squeeze(1)
+        bbox_deltas_ = bbox_deltas[inds]
+        bbox_anchors_ = bbox_anchors[inds]
+        bbox_gts_ = bbox_gts[inds]
+        anchor_weights_ = anchor_weights[inds]
+        pred_anchors_ = delta2bbox(bbox_anchors_,
+                                   bbox_deltas_,
+                                   self.anchoring_means,
+                                   self.anchoring_stds,
+                                   wh_ratio_clip=1e-6)
+        loss_shape = self.loss_shape(pred_anchors_,
+                                     bbox_gts_,
+                                     anchor_weights_,
+                                     avg_factor=anchor_total_num)
+        return loss_shape
+
+    def loss_loc_single(self, loc_pred, loc_target, loc_weight, loc_avg_factor,
+                        cfg):
+        loss_loc = self.loss_loc(loc_pred.reshape(-1, 1),
+                                 loc_target.reshape(-1, 1).long(),
+                                 loc_weight.reshape(-1, 1),
+                                 avg_factor=loc_avg_factor)
+        return loss_loc
+
+    def loss(self,
+             cls_scores,
+             bbox_preds,
+             shape_preds,
+             loc_preds,
+             gt_bboxes,
+             gt_labels,
+             img_metas,
+             cfg,
+             gt_bboxes_ignore=None):
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        assert len(featmap_sizes) == len(self.approx_generators)
+
+        # get loc targets
+        loc_targets, loc_weights, loc_avg_factor = ga_loc_target(
+            gt_bboxes,
+            featmap_sizes,
+            self.octave_base_scale,
+            self.anchor_strides,
+            center_ratio=cfg.center_ratio,
+            ignore_ratio=cfg.ignore_ratio)
+
+        # get sampled approxes
+        approxs_list, inside_flag_list = self.get_sampled_approxs(
+            featmap_sizes, img_metas, cfg)
+        # get squares and guided anchors
+        squares_list, guided_anchors_list, _ = self.get_anchors(
+            featmap_sizes, shape_preds, loc_preds, img_metas)
+
+        # get shape targets
+        sampling = hasattr(cfg, 'ga_sampler')
+        shape_targets = ga_shape_target(approxs_list,
+                                        inside_flag_list,
+                                        squares_list,
+                                        gt_bboxes,
+                                        img_metas,
+                                        self.approxs_per_octave,
+                                        cfg,
+                                        sampling=sampling)
+        if shape_targets is None:
+            return None
+        (bbox_anchors_list, bbox_gts_list, anchor_weights_list, anchor_fg_num,
+         anchor_bg_num) = shape_targets
+        anchor_total_num = (anchor_fg_num
+                            if not sampling else anchor_fg_num + anchor_bg_num)
+
+        # get anchor targets
+        sampling = not self.cls_focal_loss
+        label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
+        cls_reg_targets = anchor_target(guided_anchors_list,
+                                        inside_flag_list,
+                                        gt_bboxes,
+                                        img_metas,
+                                        self.target_means,
+                                        self.target_stds,
+                                        cfg,
+                                        gt_bboxes_ignore_list=gt_bboxes_ignore,
+                                        gt_labels_list=gt_labels,
+                                        label_channels=label_channels,
+                                        sampling=sampling)
+        if cls_reg_targets is None:
+            return None
+        (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+         num_total_pos, num_total_neg) = cls_reg_targets
+        num_total_samples = (num_total_pos if self.cls_focal_loss else
+                             num_total_pos + num_total_neg)
+
+        # get classification and bbox regression losses
+        losses_cls, losses_bbox = multi_apply(
+            self.loss_single,
+            cls_scores,
+            bbox_preds,
+            labels_list,
+            label_weights_list,
+            bbox_targets_list,
+            bbox_weights_list,
+            num_total_samples=num_total_samples,
+            cfg=cfg)
+
+        # get anchor location loss
+        losses_loc, = multi_apply(self.loss_loc_single,
+                                  loc_preds,
+                                  loc_targets,
+                                  loc_weights,
+                                  loc_avg_factor=loc_avg_factor,
+                                  cfg=cfg)
+
+        # get anchor shape loss
+        losses_shape, = multi_apply(self.loss_shape_single,
+                                    shape_preds,
+                                    bbox_anchors_list,
+                                    bbox_gts_list,
+                                    anchor_weights_list,
+                                    anchor_total_num=anchor_total_num)
+        return dict(loss_cls=losses_cls,
+                    loss_bbox=losses_bbox,
+                    loss_shape=losses_shape,
+                    loss_loc=losses_loc)
+
+    def get_bboxes(self,
+                   cls_scores,
+                   bbox_preds,
+                   shape_preds,
+                   loc_preds,
+                   img_metas,
+                   cfg,
+                   rescale=False):
+        assert len(cls_scores) == len(bbox_preds) == len(shape_preds) == len(
+            loc_preds)
+        num_levels = len(cls_scores)
+        featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
+        # get guided anchors
+        _, guided_anchors, loc_masks = self.get_anchors(
+            featmap_sizes,
+            shape_preds,
+            loc_preds,
+            img_metas,
+            use_loc_filter=not self.training)
+        result_list = []
+        for img_id in range(len(img_metas)):
+            cls_score_list = [
+                cls_scores[i][img_id].detach() for i in range(num_levels)
+            ]
+            bbox_pred_list = [
+                bbox_preds[i][img_id].detach() for i in range(num_levels)
+            ]
+            guided_anchor_list = [
+                guided_anchors[img_id][i].detach() for i in range(num_levels)
+            ]
+            loc_mask_list = [
+                loc_masks[img_id][i].detach() for i in range(num_levels)
+            ]
+            img_shape = img_metas[img_id]['img_shape']
+            scale_factor = img_metas[img_id]['scale_factor']
+            proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list,
+                                               guided_anchor_list,
+                                               loc_mask_list, img_shape,
+                                               scale_factor, cfg, rescale)
+            result_list.append(proposals)
+        return result_list
+
+    def get_bboxes_single(self,
+                          cls_scores,
+                          bbox_preds,
+                          mlvl_anchors,
+                          mlvl_masks,
+                          img_shape,
+                          scale_factor,
+                          cfg,
+                          rescale=False):
+        assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
+        mlvl_bboxes = []
+        mlvl_scores = []
+        for cls_score, bbox_pred, anchors, mask in zip(cls_scores, bbox_preds,
+                                                       mlvl_anchors,
+                                                       mlvl_masks):
+            assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
+            # if no location is kept, end.
+            if mask.sum() == 0:
+                continue
+            # reshape scores and bbox_pred
+            cls_score = cls_score.permute(1, 2,
+                                          0).reshape(-1, self.cls_out_channels)
+            if self.use_sigmoid_cls:
+                scores = cls_score.sigmoid()
+            else:
+                scores = cls_score.softmax(-1)
+            bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, 4)
+            # filter scores, bbox_pred w.r.t. mask.
+            # anchors are filtered in get_anchors() beforehand.
+            scores = scores[mask, :]
+            bbox_pred = bbox_pred[mask, :]
+            if scores.dim() == 0:
+                anchors = anchors.unsqueeze(0)
+                scores = scores.unsqueeze(0)
+                bbox_pred = bbox_pred.unsqueeze(0)
+            # filter anchors, bbox_pred, scores w.r.t. scores
+            nms_pre = cfg.get('nms_pre', -1)
+            if nms_pre > 0 and scores.shape[0] > nms_pre:
+                if self.use_sigmoid_cls:
+                    max_scores, _ = scores.max(dim=1)
+                else:
+                    max_scores, _ = scores[:, 1:].max(dim=1)
+                _, topk_inds = max_scores.topk(nms_pre)
+                anchors = anchors[topk_inds, :]
+                bbox_pred = bbox_pred[topk_inds, :]
+                scores = scores[topk_inds, :]
+            bboxes = delta2bbox(anchors, bbox_pred, self.target_means,
+                                self.target_stds, img_shape)
+            mlvl_bboxes.append(bboxes)
+            mlvl_scores.append(scores)
+        mlvl_bboxes = torch.cat(mlvl_bboxes)
+        if rescale:
+            mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+        mlvl_scores = torch.cat(mlvl_scores)
+        if self.use_sigmoid_cls:
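+            # sigmoid scores carry no explicit background column; pad a zero
+            # column in front so multiclass_nms sees the (bg, fg, ...) layout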
+            padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
+            mlvl_scores = torch.cat([padding, mlvl_scores], dim=1)
+        # multi class NMS
+        det_bboxes, det_labels = multiclass_nms(mlvl_bboxes, mlvl_scores,
+                                                cfg.score_thr, cfg.nms,
+                                                cfg.max_per_img)
+        return det_bboxes, det_labels
diff --git a/mmdet/models/losses/__init__.py b/mmdet/models/losses/__init__.py
index efe40eae..3b002459 100644
--- a/mmdet/models/losses/__init__.py
+++ b/mmdet/models/losses/__init__.py
@@ -1,5 +1,6 @@
 from .cross_entropy_loss import CrossEntropyLoss
 from .focal_loss import FocalLoss
 from .smooth_l1_loss import SmoothL1Loss
+from .iou_loss import IoULoss
 
-__all__ = ['CrossEntropyLoss', 'FocalLoss', 'SmoothL1Loss']
+__all__ = ['CrossEntropyLoss', 'FocalLoss', 'SmoothL1Loss', 'IoULoss']
diff --git a/mmdet/models/losses/iou_loss.py b/mmdet/models/losses/iou_loss.py
new file mode 100644
index 00000000..8c9d602f
--- /dev/null
+++ b/mmdet/models/losses/iou_loss.py
@@ -0,0 +1,26 @@
+import torch.nn as nn
+from mmdet.core import weighted_iou_loss
+
+from ..registry import LOSSES
+
+
+@LOSSES.register_module
+class IoULoss(nn.Module):
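+    """IoU-style localization loss wrapping `weighted_iou_loss`.
+
+    `style` selects the variant passed to `weighted_iou_loss` (e.g. the
+    bounded IoU loss used for anchor shape prediction), `beta` and `eps`
+    parameterize it, and `loss_weight` scales the final loss.
+    """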
+
+    def __init__(self, style='naive', beta=0.2, eps=1e-3, loss_weight=1.0):
+        super(IoULoss, self).__init__()
+        self.style = style
+        self.beta = beta
+        self.eps = eps
+        self.loss_weight = loss_weight
+
+    def forward(self, pred, target, weight, *args, **kwargs):
+        loss = self.loss_weight * weighted_iou_loss(
+            pred,
+            target,
+            weight,
+            style=self.style,
+            beta=self.beta,
+            eps=self.eps,
+            *args,
+            **kwargs)
+        return loss
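+
+
+# Usage sketch (hypothetical config values):
+#   loss_shape = dict(type='IoULoss', style='bounded', beta=0.2, loss_weight=1.0)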
diff --git a/mmdet/ops/__init__.py b/mmdet/ops/__init__.py
index b3cbc266..34467bf4 100644
--- a/mmdet/ops/__init__.py
+++ b/mmdet/ops/__init__.py
@@ -6,11 +6,13 @@ from .nms import nms, soft_nms
 from .roi_align import RoIAlign, roi_align
 from .roi_pool import RoIPool, roi_pool
 from .sigmoid_focal_loss import SigmoidFocalLoss, sigmoid_focal_loss
+from .masked_conv import MaskedConv2d
 
 __all__ = [
     'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool',
     'DeformConv', 'DeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack',
     'ModulatedDeformRoIPoolingPack', 'ModulatedDeformConv',
     'ModulatedDeformConvPack', 'deform_conv', 'modulated_deform_conv',
-    'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss'
+    'deform_roi_pooling', 'SigmoidFocalLoss', 'sigmoid_focal_loss',
+    'MaskedConv2d'
 ]
diff --git a/mmdet/ops/masked_conv/__init__.py b/mmdet/ops/masked_conv/__init__.py
new file mode 100644
index 00000000..feab9531
--- /dev/null
+++ b/mmdet/ops/masked_conv/__init__.py
@@ -0,0 +1,4 @@
+from .functions.masked_conv import masked_conv2d
+from .modules.masked_conv import MaskedConv2d
+
+__all__ = ['masked_conv2d', 'MaskedConv2d']
diff --git a/mmdet/ops/masked_conv/functions/__init__.py b/mmdet/ops/masked_conv/functions/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mmdet/ops/masked_conv/functions/masked_conv.py b/mmdet/ops/masked_conv/functions/masked_conv.py
new file mode 100644
index 00000000..41ba5a75
--- /dev/null
+++ b/mmdet/ops/masked_conv/functions/masked_conv.py
@@ -0,0 +1,55 @@
+import math
+import torch
+from torch.autograd import Function
+from torch.nn.modules.utils import _pair
+from .. import masked_conv2d_cuda
+
+
+class MaskedConv2dFunction(Function):
+
+    @staticmethod
+    def forward(ctx, features, mask, weight, bias, padding=0, stride=1):
+        assert mask.dim() == 3 and mask.size(0) == 1
+        assert features.dim() == 4 and features.size(0) == 1
+        assert features.size()[2:] == mask.size()[1:]
+        pad_h, pad_w = _pair(padding)
+        stride_h, stride_w = _pair(stride)
+        if stride_h != 1 or stride_w != 1:
+            raise ValueError(
+                'masked_conv2d only supports stride=1 currently.')
+        if not features.is_cuda:
+            raise NotImplementedError
+
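+        # masked im2col: gather input patches only at masked locations into
+        # columns, run one GEMM against the flattened kernel, then scatter
+        # the results back into a zero-initialized output map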
+        out_channel, in_channel, kernel_h, kernel_w = weight.size()
+
+        batch_size = features.size(0)
+        out_h = int(
+            math.floor((features.size(2) + 2 * pad_h -
+                        (kernel_h - 1) - 1) / stride_h + 1))
+        out_w = int(
+            math.floor((features.size(3) + 2 * pad_w -
+                        (kernel_w - 1) - 1) / stride_w + 1))
+        mask_inds = torch.nonzero(mask[0] > 0)
+        mask_h_idx = mask_inds[:, 0].contiguous()
+        mask_w_idx = mask_inds[:, 1].contiguous()
+        data_col = features.new_zeros(in_channel * kernel_h * kernel_w,
+                                      mask_inds.size(0))
+        masked_conv2d_cuda.masked_im2col_forward(features, mask_h_idx,
+                                                 mask_w_idx, kernel_h,
+                                                 kernel_w, pad_h, pad_w,
+                                                 data_col)
+
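+        # out = bias + weight @ data_col, computed only over masked columns:
+        # (out_channel, ic*kh*kw) x (ic*kh*kw, n_masked) -> (out_channel, n_masked)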
+        masked_output = torch.addmm(1, bias[:, None], 1,
+                                    weight.view(out_channel, -1), data_col)
+        output = features.new_zeros(batch_size, out_channel, out_h, out_w)
+        masked_conv2d_cuda.masked_col2im_forward(masked_output, mask_h_idx,
+                                                 mask_w_idx, out_h, out_w,
+                                                 out_channel, output)
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # backward is not implemented; return one None per forward input
+        # (features, mask, weight, bias, padding, stride)
+        return (None, ) * 6
+
+
+masked_conv2d = MaskedConv2dFunction.apply
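+
+# Functional entry point, e.g. out = masked_conv2d(x, mask, weight, bias,
+# padding) with x of shape (1, C, H, W) and mask of shape (1, H, W).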
diff --git a/mmdet/ops/masked_conv/modules/__init__.py b/mmdet/ops/masked_conv/modules/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/mmdet/ops/masked_conv/modules/masked_conv.py b/mmdet/ops/masked_conv/modules/masked_conv.py
new file mode 100644
index 00000000..1b8c434a
--- /dev/null
+++ b/mmdet/ops/masked_conv/modules/masked_conv.py
@@ -0,0 +1,30 @@
+import torch.nn as nn
+from ..functions.masked_conv import masked_conv2d
+
+
+class MaskedConv2d(nn.Conv2d):
+    """A MaskedConv2d which inherits the official Conv2d.
+
+    The masked forward doesn't implement the backward function and only
+    supports the stride parameter to be 1 currently.
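+
+    A minimal usage sketch (shapes are illustrative):
+
+        >>> conv = MaskedConv2d(16, 32, 3, padding=1).cuda()
+        >>> x = torch.rand(1, 16, 50, 68).cuda()  # batch size must be 1
+        >>> mask = (torch.rand(1, 50, 68) > 0.9).float().cuda()
+        >>> out = conv(x, mask)  # conv is evaluated only where mask > 0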
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 bias=True):
+        super(MaskedConv2d,
+              self).__init__(in_channels, out_channels, kernel_size, stride,
+                             padding, dilation, groups, bias)
+
+    def forward(self, input, mask=None):
+        if mask is None:  # fallback to the normal Conv2d
+            return super(MaskedConv2d, self).forward(input)
+        else:
+            return masked_conv2d(input, mask, self.weight, self.bias,
+                                 self.padding)
diff --git a/mmdet/ops/masked_conv/setup.py b/mmdet/ops/masked_conv/setup.py
new file mode 100644
index 00000000..fdff5f20
--- /dev/null
+++ b/mmdet/ops/masked_conv/setup.py
@@ -0,0 +1,12 @@
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
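+# Standalone build for the masked_conv2d_cuda extension; in this repo it is
+# normally compiled via compile.sh at the project root.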
+setup(
+    name='masked_conv2d_cuda',
+    ext_modules=[
+        CUDAExtension('masked_conv2d_cuda', [
+            'src/masked_conv2d_cuda.cpp',
+            'src/masked_conv2d_kernel.cu',
+        ]),
+    ],
+    cmdclass={'build_ext': BuildExtension})
diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp b/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp
new file mode 100644
index 00000000..f9d53735
--- /dev/null
+++ b/mmdet/ops/masked_conv/src/masked_conv2d_cuda.cpp
@@ -0,0 +1,74 @@
+#include <torch/extension.h>
+
+#include <cmath>
+#include <vector>
+
+int MaskedIm2colForwardLaucher(const at::Tensor im, const int height,
+                               const int width, const int channels,
+                               const int kernel_h, const int kernel_w,
+                               const int pad_h, const int pad_w,
+                               const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, const int mask_cnt,
+                               at::Tensor col);
+
+int MaskedCol2imForwardLaucher(const at::Tensor col, const int height,
+                               const int width, const int channels,
+                               const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, const int mask_cnt,
+                               at::Tensor im);
+
+#define CHECK_CUDA(x) AT_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
+#define CHECK_CONTIGUOUS(x) \
+  AT_CHECK(x.is_contiguous(), #x, " must be contiguous ")
+#define CHECK_INPUT(x) \
+  CHECK_CUDA(x);       \
+  CHECK_CONTIGUOUS(x)
+
+int masked_im2col_forward_cuda(const at::Tensor im, const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, const int kernel_h,
+                               const int kernel_w, const int pad_h,
+                               const int pad_w, at::Tensor col) {
+  CHECK_INPUT(im);
+  CHECK_INPUT(mask_h_idx);
+  CHECK_INPUT(mask_w_idx);
+  CHECK_INPUT(col);
+  // im: (n, ic, h, w), kernel size (kh, kw)
+  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
+
+  int channels = im.size(1);
+  int height = im.size(2);
+  int width = im.size(3);
+  int mask_cnt = mask_h_idx.size(0);
+
+  MaskedIm2colForwardLaucher(im, height, width, channels, kernel_h, kernel_w,
+                             pad_h, pad_w, mask_h_idx, mask_w_idx, mask_cnt,
+                             col);
+
+  return 1;
+}
+
+int masked_col2im_forward_cuda(const at::Tensor col,
+                               const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, int height,
+                               int width, int channels, at::Tensor im) {
+  CHECK_INPUT(col);
+  CHECK_INPUT(mask_h_idx);
+  CHECK_INPUT(mask_w_idx);
+  CHECK_INPUT(im);
+  // im: (n, ic, h, w), kernel size (kh, kw)
+  // kernel: (oc, ic * kh * kw), col: (kh * kw * ic, ow * oh)
+
+  int mask_cnt = mask_h_idx.size(0);
+
+  MaskedCol2imForwardLaucher(col, height, width, channels, mask_h_idx,
+                             mask_w_idx, mask_cnt, im);
+
+  return 1;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("masked_im2col_forward", &masked_im2col_forward_cuda,
+        "masked_im2col forward (CUDA)");
+  m.def("masked_col2im_forward", &masked_col2im_forward_cuda,
+        "masked_col2im forward (CUDA)");
+}
\ No newline at end of file
diff --git a/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu b/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu
new file mode 100644
index 00000000..394af13e
--- /dev/null
+++ b/mmdet/ops/masked_conv/src/masked_conv2d_kernel.cu
@@ -0,0 +1,113 @@
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                            \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
+       i += blockDim.x * gridDim.x)
+
+#define THREADS_PER_BLOCK 1024
+
+inline int GET_BLOCKS(const int N) {
+  int optimal_block_num = (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+  int max_block_num = 65000;
+  return min(optimal_block_num, max_block_num);
+}
+
+template <typename scalar_t>
+__global__ void MaskedIm2colForward(const int n, const scalar_t *data_im,
+                                    const int height, const int width,
+                                    const int kernel_h, const int kernel_w,
+                                    const int pad_h, const int pad_w,
+                                    const long *mask_h_idx,
+                                    const long *mask_w_idx, const int mask_cnt,
+                                    scalar_t *data_col) {
+  // n = mask_cnt * channels: each thread copies the kh x kw patch of one
+  // channel at one masked location into its column slice of data_col
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    const int m_index = index % mask_cnt;
+    const int h_col = mask_h_idx[m_index];
+    const int w_col = mask_w_idx[m_index];
+    const int c_im = index / mask_cnt;
+    const int c_col = c_im * kernel_h * kernel_w;
+    const int h_offset = h_col - pad_h;
+    const int w_offset = w_col - pad_w;
+    scalar_t *data_col_ptr = data_col + c_col * mask_cnt + m_index;
+    for (int i = 0; i < kernel_h; ++i) {
+      int h_im = h_offset + i;
+      for (int j = 0; j < kernel_w; ++j) {
+        int w_im = w_offset + j;
+        if (h_im >= 0 && w_im >= 0 && h_im < height && w_im < width) {
+          *data_col_ptr =
+              (scalar_t)data_im[(c_im * height + h_im) * width + w_im];
+        } else {
+          *data_col_ptr = 0.0;
+        }
+        data_col_ptr += mask_cnt;
+      }
+    }
+  }
+}
+
+int MaskedIm2colForwardLaucher(const at::Tensor bottom_data, const int height,
+                               const int width, const int channels,
+                               const int kernel_h, const int kernel_w,
+                               const int pad_h, const int pad_w,
+                               const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, const int mask_cnt,
+                               at::Tensor top_data) {
+  const int output_size = mask_cnt * channels;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      bottom_data.type(), "MaskedIm2colLaucherForward", ([&] {
+        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
+        const long *mask_h_idx_ = mask_h_idx.data<long>();
+        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        scalar_t *top_data_ = top_data.data<scalar_t>();
+        MaskedIm2colForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data_, height, width, kernel_h, kernel_w,
+                pad_h, pad_w, mask_h_idx_, mask_w_idx_, mask_cnt, top_data_);
+      }));
+  THCudaCheck(cudaGetLastError());
+  return 1;
+}
+
+template <typename scalar_t>
+__global__ void MaskedCol2imForward(const int n, const scalar_t *data_col,
+                                    const int height, const int width,
+                                    const int channels, const long *mask_h_idx,
+                                    const long *mask_w_idx, const int mask_cnt,
+                                    scalar_t *data_im) {
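+  // n = mask_cnt * channels: one thread writes one convolved value back
+  // into the dense output feature map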
+  CUDA_1D_KERNEL_LOOP(index, n) {
+    const int m_index = index % mask_cnt;
+    const int h_im = mask_h_idx[m_index];
+    const int w_im = mask_w_idx[m_index];
+    const int c_im = index / mask_cnt;
+    data_im[(c_im * height + h_im) * width + w_im] = data_col[index];
+  }
+}
+
+int MaskedCol2imForwardLaucher(const at::Tensor bottom_data, const int height,
+                               const int width, const int channels,
+                               const at::Tensor mask_h_idx,
+                               const at::Tensor mask_w_idx, const int mask_cnt,
+                               at::Tensor top_data) {
+  const int output_size = mask_cnt * channels;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      bottom_data.type(), "MaskedCol2imLaucherForward", ([&] {
+        const scalar_t *bottom_data_ = bottom_data.data<scalar_t>();
+        const long *mask_h_idx_ = mask_h_idx.data<long>();
+        const long *mask_w_idx_ = mask_w_idx.data<long>();
+        scalar_t *top_data_ = top_data.data<scalar_t>();
+
+        MaskedCol2imForward<scalar_t>
+            <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK>>>(
+                output_size, bottom_data_, height, width, channels, mask_h_idx_,
+                mask_w_idx_, mask_cnt, top_data_);
+      }));
+  THCudaCheck(cudaGetLastError());
+  return 1;
+}
-- 
GitLab