Commit d506eff2 authored by Michael Carilli

Macros based on torch.__version__ to compile with 0.4 and 0.5

Parent 61b452e8
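In outline, the commit funnels every 0.4/0.5 API difference through a single compile-time flag. A minimal sketch of the wiring (compiler command lines simplified; the real flags are assembled by setup.py as shown in its hunk below):

```
// For PyTorch 0.4 builds, setup.py appends '-DVERSION_LE_04' to both the host
// and device compiler flags, roughly equivalent to:
//
//   g++  -DVERSION_LE_04 ... csrc/interface.cpp
//   nvcc -DVERSION_LE_04 ... csrc/weight_norm_fwd_cuda.cu
//
// Each point where the 0.4 and 0.5 (master) ATen APIs diverge is then gated:
#ifdef VERSION_LE_04
// PyTorch 0.4 spelling of the API
#else
// PyTorch 0.5 / master spelling of the API
#endif
```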
@@ -5,8 +5,8 @@ This repo is designed to hold PyTorch modules and utilities that are under activ
 # Requirements
 Python 3
-PyTorch 0.3 or newer
 CUDA 9
+PyTorch 0.4 or newer. We recommend using the latest stable release, obtainable from https://pytorch.org/. We also test against the latest master branch, obtainable from https://github.com/pytorch/pytorch. If you have any problems building, please file an issue.
 # [Full Documentation](https://nvidia.github.io/apex)
@@ -23,7 +23,7 @@ import apex
 ```
 and optionally (if required for your use)
 ```
-import apex._C as apex_backend
+import apex_C as apex_backend
 ```
 # What's included
...
@@ -4,7 +4,7 @@ import warnings
 import torch
-from apex._C import scale_check_overflow
+from apex_C import scale_check_overflow
 class AmpHandle(object):
     def __init__(self, enable_caching=True):
...
 import torch
 from torch.autograd import Variable
 from torch.autograd.function import Function, once_differentiable
-import apex._C
+import apex_C
 def check_contig_cuda(tensors, names):
     for tensor, name in zip(tensors, names):
@@ -71,7 +71,7 @@ class Fused_Weight_Norm(Function):
         [output_size(0),1,1,...].
         """
-        apex._C.weight_norm_fwd(output, norms, input, g, dim)
+        apex_C.weight_norm_fwd(output, norms, input, g, dim)
         ctx.save_for_backward(input, g)
         # save_for_backward can only save input or output tensors,
@@ -102,7 +102,7 @@ class Fused_Weight_Norm(Function):
         grad_input = grad_output_contig.new(grad_output.size()).contiguous()
         grad_g = savedg.new(savedg.size()).contiguous()
-        apex._C.weight_norm_bwd(grad_input,
+        apex_C.weight_norm_bwd(grad_input,
                                 grad_g,
                                 grad_output_contig,
                                 savedInput,
...
@@ -4,7 +4,7 @@
 // here, but I can't make nvcc play well with torch.h. For now, use a layer of indirection
 // and separate .cu implementation files.
-// If we want everything to be part of "apex._C", we need all the interface functions defined
+// If we want everything to be part of "apex_C", we need all the interface functions defined
 // in this file, or linker will complain about "multiple definitions of PyInit".
 // TODO: multiple modules?
@@ -54,15 +54,23 @@ void scale_check_overflow_cuda
  float scale,
  const at::Tensor& d_buf);
+#ifdef VERSION_LE_04
+#define VERSION_AGNOSTIC_CHECK AT_ASSERT
+#else
+#define VERSION_AGNOSTIC_CHECK AT_CHECK
+#endif
+
 void scale_check_overflow
 (at::Tensor grads,
  float scale,
  at::Tensor overflow_buf)
 {
-  AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
-  AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
+  VERSION_AGNOSTIC_CHECK
+    (grads.type().is_cuda(), "x must be a CUDA tensor");
+  VERSION_AGNOSTIC_CHECK
+    (overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
   // Make sure we are downscaling the FP32 master grads
-  AT_CHECK
+  VERSION_AGNOSTIC_CHECK
     (grads.type().scalarType() == at::ScalarType::Float,
      "grads supplied to scale_check_overflow should be fp32 (master grads).")
   scale_check_overflow_cuda(grads, scale, overflow_buf);
...
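For concreteness, here is a standalone sketch of what the gated check compiles down to; the AT_* macros are stubbed with plain asserts so the snippet builds without ATen (the stubs assume, as the substitution requires, that 0.4's AT_ASSERT and master's AT_CHECK both take a condition plus a message):

```
#include <cassert>

// Stand-ins for the real ATen macros, for illustration only.
#define AT_ASSERT(cond, msg) assert((cond) && msg)  // 0.4-era check
#define AT_CHECK(cond, msg)  assert((cond) && msg)  // 0.5 / master check

#ifdef VERSION_LE_04
#define VERSION_AGNOSTIC_CHECK AT_ASSERT
#else
#define VERSION_AGNOSTIC_CHECK AT_CHECK
#endif

int main() {
  bool is_cuda = true;  // stand-in for grads.type().is_cuda()
  // Expands to AT_ASSERT(...) under -DVERSION_LE_04, AT_CHECK(...) otherwise.
  VERSION_AGNOSTIC_CHECK(is_cuda, "x must be a CUDA tensor");
  return 0;
}
```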
@@ -13,6 +13,19 @@
 #define __SYNCWARP
 #endif
+
+#ifdef VERSION_LE_04
+#define USING_ACCSCALAR_T using accscalar_t = cuda::acc_type<cuda_scalar_t>;
+#else
+#define USING_ACCSCALAR_T using accscalar_t = acc_type<cuda_scalar_t, true>;
+#endif
+
+#ifdef VERSION_LE_04
+#define REDUCE_ADD ReduceAdd<accscalar_t, accscalar_t>()
+#else
+#define REDUCE_ADD ReduceAdd<accscalar_t>()
+#endif
+
 // Block size for weight_norm_*_first_dim_kernel.
 // Currently, kernels are non-persistent.
 // Dialing up the block size to, say 1024, can improve performance by
...
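The two macros above hide the only kernel-side differences: where the accumulate-type trait lives, and how many template parameters THC's ReduceAdd functor takes. A standalone sketch of the accumulate-type half, with the trait stubbed so it compiles without ATen (in the real headers, fp16 data accumulates as fp32):

```
#include <type_traits>

struct half_stub {};  // stand-in for the CUDA half type

// Minimal stand-in for ATen's acc_type machinery: most types accumulate as
// themselves, but half accumulates as float.
template <typename T> struct acc { using type = T; };
template <> struct acc<half_stub> { using type = float; };

// Shaped like the macro above, with the stub trait substituted.
#define USING_ACCSCALAR_T using accscalar_t = typename acc<cuda_scalar_t>::type;

int main() {
  using cuda_scalar_t = half_stub;
  USING_ACCSCALAR_T
  static_assert(std::is_same<accscalar_t, float>::value,
                "half inputs accumulate in fp32");
  return 0;
}
```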
 #include <ATen/ATen.h>
-#include "ATen/AccumulateType.h"
+// #include "ATen/AccumulateType.h"
 #include "ATen/cuda/CUDATensorMethods.cuh"
 #include "ATen/cuda/CUDATypeConversion.cuh"
 #include <THC/THCTensorMathReduce.cuh>
...
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
......@@ -40,7 +46,7 @@ __global__ void weight_norm_bwd_first_dim_kernel
thread_sum += pLpwi*savedvi; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0];
// Could choose to save reciprocal of norm instead I suppose, but norms is probably
......@@ -99,7 +105,7 @@ __global__ void weight_norm_bwd_last_dim_kernel
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
accscalar_t result = s[threadIdx.x];
// Broadcast load; could use shared memory instead.
......@@ -159,7 +165,7 @@ void weight_norm_bwd_cuda
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
USING_ACCSCALAR_T
weight_norm_bwd_first_dim_kernel
<<<pLpw.size(0),
......@@ -192,7 +198,7 @@ void weight_norm_bwd_cuda
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
USING_ACCSCALAR_T
weight_norm_bwd_last_dim_kernel
<<<(fast_dim_size+TILE_W-1)/TILE_W,
......
#include "kernel_utils.cuh"
#include <ATen/ATen.h>
#ifdef VERSION_LE_04
#include "ATen/cuda/AccumulateType.cuh"
#else
#include "ATen/AccumulateType.h"
#endif
#include "ATen/cuda/CUDATensorMethods.cuh"
#include "ATen/cuda/CUDATypeConversion.cuh"
#include <THC/THCTensorMathReduce.cuh>
......@@ -38,7 +44,7 @@ __global__ void weight_norm_fwd_first_dim_kernel
thread_sum += val_f*val_f; // AccumOp, could do Kahan here
}
reduce_block_into_lanes(s, thread_sum, 1, ReduceAdd<accscalar_t>());
reduce_block_into_lanes(s, thread_sum, 1, REDUCE_ADD);
accscalar_t result = s[0];
result = sqrtf(result);
......@@ -92,7 +98,7 @@ __global__ void weight_norm_fwd_last_dim_kernel
slower_dims_location += blockDim.y;
}
reduce_block_into_lanes(s, thread_sum, blockDim.x, ReduceAdd<accscalar_t>());
reduce_block_into_lanes(s, thread_sum, blockDim.x, REDUCE_ADD);
// Better to pass an EpilogueOp to reduce_block_into_lanes, implement later
if(threadIdx.y == 0)
......@@ -150,7 +156,7 @@ void weight_norm_fwd_cuda
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
USING_ACCSCALAR_T
weight_norm_fwd_first_dim_kernel
<<<v.size(0),
......@@ -181,7 +187,7 @@ void weight_norm_fwd_cuda
[&]
{
using cuda_scalar_t = cuda::type<scalar_t>;
using accscalar_t = acc_type<cuda_scalar_t, true>;
USING_ACCSCALAR_T
// just trying this formatting out to see how it feels...
weight_norm_fwd_last_dim_kernel
......
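Putting both macros together, a toy reduction shows how one call site serves both functor arities; everything here is a stub except the macro shapes, which mirror the kernel_utils.cuh definitions above (compile with and without -DVERSION_LE_04 to exercise both branches):

```
#include <cstdio>

#ifdef VERSION_LE_04
// 0.4-style functor: separate input and accumulator types.
template <typename In, typename Acc>
struct ReduceAdd {
  Acc operator()(Acc a, In b) const { return a + static_cast<Acc>(b); }
};
#define REDUCE_ADD ReduceAdd<accscalar_t, accscalar_t>()
#else
// master-style functor: a single type parameter.
template <typename T>
struct ReduceAdd {
  T operator()(T a, T b) const { return a + b; }
};
#define REDUCE_ADD ReduceAdd<accscalar_t>()
#endif

// Stand-in for reduce_block_into_lanes: folds three values with the functor.
template <typename Op>
float reduce3(float a, float b, float c, Op op) {
  return op(op(a, b), c);
}

int main() {
  using accscalar_t = float;  // what USING_ACCSCALAR_T yields for fp16 input
  std::printf("%f\n", reduce3(1.f, 2.f, 3.f, REDUCE_ADD));  // prints 6.000000
  return 0;
}
```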
@@ -14,6 +14,18 @@ if not torch.cuda.is_available():
     print("Warning: Torch did not find available GPUs on this system.\n",
           "If your intention is to cross-compile, this is not an error.")
+print("torch.__version__ = ", torch.__version__)
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
+    raise RuntimeError("Apex requires PyTorch 0.4 or newer.\n" +
+                       "The latest stable release can be obtained from https://pytorch.org/")
+
+version_le_04 = []
+if TORCH_MAJOR == 0 and TORCH_MINOR == 4:
+    version_le_04 = ['-DVERSION_LE_04']
+
 def find(path, regex_func, collect=False):
     collection = [] if collect else None
     for root, dirs, files in os.walk(path):
@@ -35,37 +47,37 @@ def get_cuda_version():
     CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
     print("Found CUDA_LIB = ", CUDA_LIB)
-    CUDA_MAJOR_VERSION = int(CUDA_LIB.split('.')[0])
-    print("Found CUDA_MAJOR_VERSION = ", CUDA_MAJOR_VERSION)
+    CUDA_MAJOR = int(CUDA_LIB.split('.')[0])
+    print("Found CUDA_MAJOR = ", CUDA_MAJOR)
-    if CUDA_MAJOR_VERSION < 8:
+    if CUDA_MAJOR < 8:
         raise RuntimeError("Apex requires CUDA 8.0 or newer")
-    return CUDA_MAJOR_VERSION
+    return CUDA_MAJOR
 if CUDA_HOME is not None:
     print("Found CUDA_HOME = ", CUDA_HOME)
-    CUDA_MAJOR_VERSION = get_cuda_version()
+    CUDA_MAJOR = get_cuda_version()
     gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
                 '-gencode', 'arch=compute_60,code=sm_60',
                 '-gencode', 'arch=compute_61,code=sm_61',]
-    if CUDA_MAJOR_VERSION > 8:
+    if CUDA_MAJOR > 8:
         gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
                      '-gencode', 'arch=compute_70,code=compute_70',]
     ext_modules = []
     extension = CUDAExtension(
-        'apex._C', [
+        'apex_C', [
             'csrc/interface.cpp',
             'csrc/weight_norm_fwd_cuda.cu',
             'csrc/weight_norm_bwd_cuda.cu',
             'csrc/scale_cuda.cu',
         ],
-        extra_compile_args={'cxx': ['-g'],
-                            'nvcc': ['-O3'] + gencodes})
+        extra_compile_args={'cxx': ['-g'] + version_le_04,
+                            'nvcc': ['-O3'] + version_le_04 + gencodes})
     ext_modules.append(extension)
 else:
     raise RuntimeError("Could not find CUDA install directory")
...
@@ -3,9 +3,9 @@ import numpy as np
 def compare(cuda_out, pt_out, pt_out_control, rows):
-    print( "Pytorch ops in fp16: ", pt_out )
-    print( "Kernel result: ", cuda_out )
-    print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
+    # print( "Pytorch ops in fp16: ", pt_out )
+    # print( "Kernel result: ", cuda_out )
+    # print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)
     # Make upconverted copies for error check against fp32 control
     cuda_out_fp32 = cuda_out.float()
@@ -22,21 +22,21 @@ def compare(cuda_out, pt_out, pt_out_control, rows):
     pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32 ).abs(),1)
     print( "cuda_maxdiffs = ", cuda_maxdiffs )
-    print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
+    # print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
     print( "pt_maxdiffs = ", pt_maxdiffs )
-    print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
+    # print( "pt_maxdiff_locs = ", pt_maxdiff_locs )
     row_indices = torch.LongTensor(np.arange(rows))
-    print("cuda_out at cuda_maxdiff_locs in each row:")
-    # bizarrely, this will work if you do it at the python prompt:
-    # print(cuda_out[row_indices,cuda_maxdiff_locs])
-    # ...but it only seems to work here if you wrap with numpy arrays:
-    print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-    print("pt_out_control at cuda_maxdiff_locs in each row:")
-    print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
-    print("pt_out at pt_maxdiff_locs in each row:" )
-    print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
-    print("pt_out_control at pt_maxdiff_locs in each row:" )
-    print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
+    # print("cuda_out at cuda_maxdiff_locs in each row:")
+    # # bizarrely, this will work if you do it at the python prompt:
+    # # print(cuda_out[row_indices,cuda_maxdiff_locs])
+    # # ...but it only seems to work here if you wrap with numpy arrays:
+    # print( cuda_out[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+    # print("pt_out_control at cuda_maxdiff_locs in each row:")
+    # print(pt_out_control[np.array(row_indices),np.array(cuda_maxdiff_locs)])
+    #
+    # print("pt_out at pt_maxdiff_locs in each row:" )
+    # print( pt_out[np.array(row_indices),np.array(pt_maxdiff_locs)])
+    # print("pt_out_control at pt_maxdiff_locs in each row:" )
+    # print(pt_out_control[np.array(row_indices),np.array(pt_maxdiff_locs)])
 import torch
 from torch.autograd import Variable
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -88,7 +88,7 @@ for rows, cols, fast in sizes:
     pLpg_cuda = pLpg_cuda .half()
     torch.cuda.nvtx.range_push("kernel weight norm backward")
-    apex._C.weight_norm_bwd(pLpInput_cuda,
+    apex_C.weight_norm_bwd(pLpInput_cuda,
                            pLpg_cuda,
                            pLpOutput_fp16,
                            pt_input_fp16,
...
 import torch
 import sys
-import apex._C
+import apex_C
 import numpy as np
 from compare import compare
 from norm import pt_norm, get_norm_shape
@@ -60,7 +60,7 @@ for rows, cols, fast in sizes:
         g = g.half()
         cuda_out = cuda_out.half()
-    apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
+    apex_C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
     torch.cuda.synchronize()
     # quit()
...