From fbf1f065212e28ab1b19a124f2b5ceba8ebd9592 Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke@gmail.com>
Date: Sat, 24 Feb 2018 11:15:43 +0100
Subject: [PATCH 1/8] Implement no-attribute dispatch of ATen ops from the JIT
 (#5298)

---
 tools/jit/gen_jit_dispatch.py         | 167 ++++++++++++++++++--------
 tools/jit/templates/aten_dispatch.cpp |   7 ++
 torch/csrc/jit/tensor_conversions.h   |  89 ++++++++++++--
 3 files changed, 208 insertions(+), 55 deletions(-)

diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py
index 657eedcca570d9..40c0792e37670d 100644
--- a/tools/jit/gen_jit_dispatch.py
+++ b/tools/jit/gen_jit_dispatch.py
@@ -28,19 +28,25 @@
     'IntList': 'std::vector<int64_t>',
 }
 
-ATTR_ASSIGNMENT = CodeTemplate("""\
+KW_ASSIGNMENT = CodeTemplate("""\
 auto ${name} = ${type_cast}(node->${method}(Symbol("${name}")));\
 """)
 
+POS_ASSIGNMENT = CodeTemplate("""\
+auto ${name} = tensor_as<${type}>(std::move(fromLast(stack, ${arg_idx})));\
+""")
+
 CALL_NAMESPACE = CodeTemplate("at::${name}(${args})")
 CALL_METHOD = CodeTemplate("(${first}).${name}(${args})")
 
 CONSTRUCTOR = CodeTemplate("""\
 {"${descriptor}", [](Node *node) {
-  ${assignments}
+  ${kw_assignments}
   return TensorOp([=](Stack & stack) {
     autograd::profiler::RecordFunction record("${name}");
-    AutoGPU device_guard(deviceForInputs(stack, ${num_inputs}));
+    AutoGPU device_guard(deviceForInputs(stack, ${num_inputs} + ${num_dropped_args}));
+    ${pos_assignments}
+    ${pos_arg_drop}
     auto result = ${call};
     drop(stack, ${num_inputs});
     pack(stack, std::move(result));
@@ -51,18 +57,33 @@
 
 
 def is_jit_op(decl):
+    uses_tensors = any(arg['simple_type'] in {'Tensor', 'TensorList'} for arg in decl['arguments']) or \
+        'Tensor' in decl['method_of']
     return (not decl['api_name'].endswith('_') and
             not decl['name'].endswith('_out') and
             not any(arg['simple_type'] == 'Generator' for arg in decl['arguments']) and
             not any(arg['simple_type'] == 'SparseTensor' for arg in decl['arguments']) and
             not any(arg['simple_type'] == 'Storage' for arg in decl['arguments']) and
             not any(arg['simple_type'] == 'Type' for arg in decl['arguments']) and
-            any(arg['simple_type'] in {'Tensor', 'TensorList'} for arg in decl['arguments']) and
-            'Tensor' in decl['return_type'])
+            uses_tensors)
+
+
+skip_scalar_overload = {
+    'lt-2': [1], 'gt-2': [1], 'le-2': [1], 'ge-2': [1], 'eq-2': [1], 'ne-2': [1],
+    'pow-2': [0, 1], 'add-3': [1], 'sub-3': [1], 'mul-2': [1], 'div-2': [1],
+    'fmod-2': [1], 'remainder-2': [1]
+}
 
 
 def gen_jit_dispatch(declarations, out):
-    aten_decls = load_aten_declarations(declarations)
+    # We need to add methods implemented manually in TensorImpl
+    tensor_impl_methods = [{
+        'name': name,
+        'api_name': name,
+        'method_of': ['Tensor'],
+        'arguments': [{'name': 'self', 'simple_type': 'Tensor'}],
+    } for name in ['sizes', 'strides', 'dim']]
+    aten_decls = load_aten_declarations(declarations) + tensor_impl_methods
     jit_decls = [d for d in aten_decls if is_jit_op(d)]
 
     def is_tensor_arg(arg):
@@ -72,54 +93,104 @@ def is_tensor_arg(arg):
     for decl in jit_decls:
         arguments = decl['arguments']
         name = decl['name']
-        scalar_args = [arg for arg in arguments if not is_tensor_arg(arg)]
         has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments)
-
-        # Descriptor is a unique identified for a particular overload of an op
-        attr_names = sorted([arg['name'] for arg in scalar_args])
-        num_inputs = len(arguments) - len(scalar_args) if not has_tensorlist else "*"
-        descriptor = '-'.join([decl['name'], str(num_inputs)] + attr_names)
-
-        # All scalar args need to be assigned, so they can be captured by a lambda
-        assignments = [ATTR_ASSIGNMENT.substitute(type=arg['simple_type'],
-                                                  type_cast=TYPE_CASTS.get(arg['simple_type'], arg['simple_type']),
-                                                  name=arg['name'],
-                                                  method=ATTR_METHOD_MAP[arg['simple_type']])
-                       for arg in scalar_args]
-        if num_inputs == "*":
-            assignments.append('auto varargs_length = node->inputs().size();')
-            num_inputs = 'varargs_length'
-
-        # Generate the actuall ATen call. This gets a bit tricky because of
-        # TensorList arguments, and functions that are only available as methods.
-        if 'namespace' in decl['method_of']:
-            if has_tensorlist:
-                if sum(map(is_tensor_arg, arguments)) != 1:
-                    # TODO: support this
+        scalar_arg_idx = [i for i, arg in enumerate(arguments) if not is_tensor_arg(arg)]
+        num_tensor_args = sum(map(is_tensor_arg, arguments))
+        # TODO: support this
+        if has_tensorlist and (num_tensor_args != 1 or not is_tensor_arg(arguments[0])):
+            continue
+
+        # Right now, we generate dispatch methods that either take all non-tensor arguments
+        # as attributes, or don't use any attributes at all. In the future we might want to
+        # have something in the middle too (might be useful for e.g. constant propagation
+        # into attributes, as that would allow us to avoid reparsing tensors into scalar
+        # args at every invocation).
+        # NB: if there are no scalar args then both options on LHS are equivalent, so deduplicate them.
+        scalar_arg_idx_iter = ([], scalar_arg_idx) if scalar_arg_idx else ([],)
+        for pos_scalar_arg_idx in scalar_arg_idx_iter:
+            num_args = len(arguments)
+            num_inputs = num_tensor_args + len(pos_scalar_arg_idx) if not has_tensorlist else '*'
+
+            # Scatter arguments into positional and keyword, and compute stack offsets
+            # of positional args.
+            pos_scalar_args, kw_scalar_args = [], []
+            scalar_stack_off, tensor_stack_off = [], []
+            for i, arg in enumerate(arguments):
+                # XXX: we currently support only TensorList ops that have a TensorList as
+                # the first argument, that is then followed by a number of positional args.
+                stack_off = (num_args if num_inputs == '*' else num_inputs) - i - 1
+                if is_tensor_arg(arg):
+                    tensor_stack_off.append(stack_off)
+                else:
+                    if i in pos_scalar_arg_idx:
+                        pos_scalar_args.append(arg)
+                        scalar_stack_off.append(stack_off)
+                    else:
+                        kw_scalar_args.append(arg)
+
+            # Descriptor is a unique identifier for a particular overload of an op.
+            attr_names = sorted([arg['name'] for arg in kw_scalar_args])
+            descriptor = '-'.join([decl['name'], str(num_inputs)] + attr_names)
+
+            # If there are two overloads with the same descriptor, that differ only by a type of a
+            # single argument, where one of them takes a tensor, while another one takes an
+            # at::Scalar as a positional scalar arg, then prefer the tensor overload.
+            # It should get broadcasted correctly.
+            if descriptor in skip_scalar_overload:
+                if any(arguments[idx]['simple_type'] == 'Scalar'
+                       for idx in skip_scalar_overload[descriptor]):
                     continue
 
-                args = ['last(stack, varargs_length)' if is_tensor_arg(arg) else arg['name']
-                        for arg in arguments]
+            kw_assignments = [KW_ASSIGNMENT.substitute(type_cast=TYPE_CASTS.get(arg['simple_type'], arg['simple_type']),
+                                                       name=arg['name'],
+                                                       method=ATTR_METHOD_MAP[arg['simple_type']])
+                              for arg in kw_scalar_args]
+            if num_inputs == "*":
+                kw_assignments.append('size_t varargs_length = node->inputs().size();')
+                num_inputs = 'varargs_length'
+            pos_assignments = [POS_ASSIGNMENT.substitute(type=arg['simple_type'],
+                                                         name=arg['name'],
+                                                         arg_idx=arg_idx)
+                               for arg_idx, arg in zip(scalar_stack_off, pos_scalar_args)]
+
+            # Generate the actual ATen call. This gets a bit tricky because of
+            # TensorList arguments, and functions that are only available as methods.
+            pos_arg_drop = ''
+            num_dropped_args = 0
+            if 'namespace' in decl['method_of']:
+                if has_tensorlist:
+                    # We need to drop the scalar args following varargs before we use last
+                    if pos_scalar_args:
+                        num_dropped_args = len(pos_scalar_args)
+                        pos_arg_drop = 'drop(stack, {});'.format(num_dropped_args)
+                    args = ['last(stack, varargs_length)' if is_tensor_arg(arg) else arg['name']
+                            for arg in arguments]
+                else:
+                    tensor_id = iter(tensor_stack_off)
+                    args = ['std::move(fromLast(stack, {}))'.format(1 + next(tensor_id))
+                            if is_tensor_arg(arg) else arg['name']
+                            for arg in arguments]
+                call = CALL_NAMESPACE.substitute(name=name, args=args)
             else:
-                tensor_id = iter(count(start=num_inputs, step=-1))
-                args = ['std::move(fromLast(stack,{}))'.format(
-                    next(tensor_id)) if is_tensor_arg(arg) else arg['name']
-                    for arg in arguments]
-            call = CALL_NAMESPACE.substitute(name=name, args=args)
-        else:
-            tensor_id = iter(count(start=num_inputs, step=-1))
-            args = ['std::move(fromLast(stack,{}))'.format(next(tensor_id)) if is_tensor_arg(arg) else arg['name']
-                    for arg in arguments]
-            call = CALL_METHOD.substitute(name=name, first=args[0], args=args[1:])
-
-        constructor = CONSTRUCTOR.substitute(descriptor=descriptor, name=name, call=call,
-                                             assignments=assignments,
-                                             num_inputs=num_inputs)
-        assert descriptor not in ops, descriptor
-        ops[descriptor] = constructor
+                tensor_id = iter(tensor_stack_off)
+                args = ['std::move(fromLast(stack, {}))'.format(1 + next(tensor_id))
+                        if is_tensor_arg(arg) else arg['name']
+                        for arg in arguments]
+                call = CALL_METHOD.substitute(name=name, first=args[0], args=args[1:])
+
+            constructor = CONSTRUCTOR.substitute(descriptor=descriptor, name=name,
+                                                 num_dropped_args=num_dropped_args,
+                                                 pos_arg_drop=pos_arg_drop,
+                                                 call=call,
+                                                 kw_assignments=kw_assignments,
+                                                 pos_assignments=pos_assignments,
+                                                 num_inputs=num_inputs)
+
+            assert descriptor not in ops, descriptor
+            ops[descriptor] = constructor
 
     # Sort the generated snippets to ensure that the generation is deterministic
-    env = {'constructors': sorted(list(ops.values()))}
+    env = {'constructors': sorted(ops.values())}
     write(out, 'aten_dispatch.h', ATEN_DISPATCH_H, env)
     write(out, 'aten_dispatch.cpp', ATEN_DISPATCH_CPP, env)
 
diff --git a/tools/jit/templates/aten_dispatch.cpp b/tools/jit/templates/aten_dispatch.cpp
index b3122c6eb80f10..67998c9cb0d68c 100644
--- a/tools/jit/templates/aten_dispatch.cpp
+++ b/tools/jit/templates/aten_dispatch.cpp
@@ -1,6 +1,7 @@
 #include "aten_dispatch.h"
 #include "torch/csrc/autograd/profiler.h"
 #include "torch/csrc/jit/interned_strings.h"
+#include "torch/csrc/jit/tensor_conversions.h"
 #include "torch/csrc/utils/functional.h"
 
 #include <unordered_map>
@@ -25,9 +26,15 @@ namespace {
 // copies.
 
 // pack takes the return values of aten functions pushes them onto the stack
+template<typename T>
+void pack(Stack & stack, T&& v) {
+  stack.push_back(as_tensor(std::move(v)));
+}
+template<>
 void pack(Stack & stack, Tensor&& v) {
   stack.push_back(std::move(v));
 }
+template<>
 void pack(Stack & stack, std::vector<Tensor>&& ts) {
   for(auto& t : ts) {
     stack.push_back(std::move(t));
diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h
index d23d96dbaea13d..2eec5f2eadc5a5 100644
--- a/torch/csrc/jit/tensor_conversions.h
+++ b/torch/csrc/jit/tensor_conversions.h
@@ -1,15 +1,90 @@
 #pragma once
 #include "ATen/ATen.h"
 
+#include <array>
+#include <type_traits>
+
+namespace torch { namespace jit {
+
+//////////////////////////////////////////////////////////////////////////////////
+// Tensor -> T conversion
+//////////////////////////////////////////////////////////////////////////////////
+
+namespace detail {
+
+template<typename T, typename EnableIf = void>
+struct tensor_as_impl {};
+
 template<typename T>
-static inline T tensor_as(at::Tensor&& t) = delete;
+struct tensor_as_impl<T, typename std::enable_if<std::is_arithmetic<T>::value>::type> {
+  T operator()(at::Tensor&& t) {
+    // workaround for 1-dim 1-element pytorch tensors until zero-dim
+    // tensors are fully supported
+    if(t.ndimension() == 1 && t.size(0) == 1) {
+      t = t[0];
+    }
+    return at::Scalar(t).to<T>();
+  }
+};
+
+template<size_t N>
+struct tensor_as_impl<std::array<bool, N>> {
+  std::array<bool, N> operator()(at::Tensor&& t) {
+    throw std::runtime_error("tensor_as<std::array<bool, N>>: NYI");
+  }
+};
+
+template<>
+struct tensor_as_impl<at::IntList> {
+  at::IntList operator()(at::Tensor&& t) {
+    if (t.type().scalarType() != at::ScalarType::Long)
+      throw std::runtime_error("Expected a LongTensor");
+    if (t.dim() != 1)
+      throw std::runtime_error("Expected a 1D LongTensor");
+    if (!t.is_contiguous())
+      throw std::runtime_error("Expected a contiguous LongTensor");
+    return at::IntList{t.data<int64_t>(), static_cast<size_t>(t.numel())};
+  }
+};
 
 template<>
-inline int64_t tensor_as(at::Tensor&& t) {
-  // workaround for 1-dim 1-element pytorch tensors until zero-dim
-  // tensors are fully supported
-  if(t.ndimension() == 1 && t.size(0) == 1) {
-    t = t[0];
+struct tensor_as_impl<at::Scalar> {
+  at::Scalar operator()(at::Tensor&& t) {
+    return at::Scalar(t.view({}));
   }
-  return at::Scalar(t).to<int64_t>();
+};
+
+}
+
+template<typename T>
+inline T tensor_as(at::Tensor&& t) {
+  return detail::tensor_as_impl<T>()(std::move(t));
+}
+
+//////////////////////////////////////////////////////////////////////////////////
+// T -> Tensor conversion
+//////////////////////////////////////////////////////////////////////////////////
+
+inline at::Tensor as_tensor(int64_t v) {
+  return at::Scalar(v).toTensor();
+}
+
+inline at::Tensor as_tensor(double v) {
+  return at::Scalar(v).toTensor();
+}
+
+inline at::Tensor as_tensor(bool v) {
+  return at::Scalar(v).toTensor();
+}
+
+inline at::Tensor as_tensor(at::IntList l) {
+  return at::CPU(at::kLong).tensorFromBlob(const_cast<void*>(reinterpret_cast<const void*>(l.data())),
+                                           {static_cast<int64_t>(l.size())}).clone();
 }
+
+
+inline at::Tensor as_tensor(at::Scalar&& s) {
+  return s.toTensor();
+}
+
+}} // namespace torch::jit

From a0118533ef1bebddb8df9f0261ab9ee27f3b157f Mon Sep 17 00:00:00 2001
From: Adam Paszke <adam.paszke@gmail.com>
Date: Sat, 24 Feb 2018 11:15:55 +0100
Subject: [PATCH 2/8] Add a print() function to the JIT script (#5274)

Additionally:
- add support for calling functions that are not methods in the Python frontend
- add an end-to-end test for the Python frontend
- add a capture_stdout helper for checking that `print` actually works
---
 .../TestJit.test_python_frontend.expect       | 10 ++-
 ...Jit.test_python_frontend_run-stdout.expect |  5 ++
 test/test_jit.py                              | 72 ++++++++++++++++++-
 torch/csrc/jit/init.cpp                       |  2 +-
 torch/csrc/jit/interned_strings.h             |  1 +
 torch/csrc/jit/interpreter.cpp                | 21 ++++++
 torch/csrc/jit/script/compiler.cpp            | 10 +++
 torch/csrc/jit/script/compiler.h              |  4 +-
 torch/csrc/jit/script/init.cpp                |  8 +--
 torch/csrc/jit/script/python_tree_views.cpp   |  2 +-
 torch/csrc/jit/script/tree_views.h            |  3 +
 torch/jit/frontend.py                         | 31 ++++++--
 12 files changed, 153 insertions(+), 16 deletions(-)
 create mode 100644 test/expect/TestJit.test_python_frontend_run-stdout.expect

diff --git a/test/expect/TestJit.test_python_frontend.expect b/test/expect/TestJit.test_python_frontend.expect
index 88baca9d5872fb..1e7b39f4a9eb32 100644
--- a/test/expect/TestJit.test_python_frontend.expect
+++ b/test/expect/TestJit.test_python_frontend.expect
@@ -16,7 +16,15 @@
         (+
           (variable (ident x))
           (variable (ident y)))
-        (variable (ident z))))
+        (apply
+          (ident sigmoid)
+          (list (variable (ident z)))
+          (list))))
+    (expression statement
+      (apply
+        (ident print)
+        (list (variable (ident q)))
+        (list)))
     (assign
       (list (ident w))
       (=)
diff --git a/test/expect/TestJit.test_python_frontend_run-stdout.expect b/test/expect/TestJit.test_python_frontend_run-stdout.expect
new file mode 100644
index 00000000000000..9a4852689eb850
--- /dev/null
+++ b/test/expect/TestJit.test_python_frontend_run-stdout.expect
@@ -0,0 +1,5 @@
+ 0.5000
+ 0.9526
+ 0.9975
+ 0.9999
+[ Variable{4} ]
diff --git a/test/test_jit.py b/test/test_jit.py
index e0c6ba353b7189..eceeb1efdbeaf1 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -19,6 +19,7 @@
 except ImportError:
     HAS_TORCHVISION = False
 
+
 skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision")
 
 RUN_CUDA = torch.cuda.is_available()
@@ -32,6 +33,47 @@
 RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1
 
 PY2 = sys.version_info[0] == 2
+WINDOWS = sys.platform == 'win32'
+
+
+@contextmanager
+def capture_stdout():
+    # No idea how to capture stdout from C++ on Windows
+    if WINDOWS:
+        yield ['']
+        return
+    import os
+    import fcntl
+    import errno
+    stdout_fd = os.dup(1)
+    r, w = os.pipe()
+    try:
+        # Override stdout with w - dup is guaranteed to return the lowest free fd
+        os.close(1)
+        os.dup(w)
+
+        captured_stdout = ['']
+        yield captured_stdout
+        sys.stdout.flush()  # Make sure that Python hasn't buffered anything
+
+        # Do the ugly dance to read all the data that was written into the pipe
+        fcntl.fcntl(r, fcntl.F_SETFL, os.O_NONBLOCK)
+        total_stdout = ''
+        while True:
+            try:
+                total_stdout += os.read(r, 1000).decode('ascii')
+            except OSError as e:
+                if e.errno != errno.EAGAIN:
+                    raise
+                break
+        captured_stdout[0] = total_stdout
+    finally:
+        # Revert the change, and clean up all fds
+        os.close(1)
+        os.dup(stdout_fd)
+        os.close(stdout_fd)
+        os.close(r)
+        os.close(w)
 
 
 def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None):
@@ -1309,11 +1351,19 @@ def foo(a, b):
         self.assertEqual(g2result, g2result2)
 
     def checkScript(self, script, inputs, outputs, optimize, name='func'):
-        cu = torch.jit._jit_script_compile(script)
+        if isinstance(script, str):
+            cu = torch.jit._jit_script_compile(script)
+        else:
+            ast = torch.jit.frontend.get_jit_ast(script)
+            cu = torch._C.CompilationUnit()
+            cu.define_function(ast)
         graph = cu.get_graph(name)
         ge = torch._C.GraphExecutor(graph, optimize)
-        outputs_ge = ge(*inputs)
+        with capture_stdout() as captured:
+            outputs_ge = ge(*inputs)
         self.assertEqual(outputs, outputs_ge)
+        if captured[0]:
+            self.assertExpected(captured[0], subname='stdout')
 
     def test_script_add(self):
         script = '''
@@ -1395,7 +1445,8 @@ def to_int(x) -> (y):
 
     def test_python_frontend(self):
         def fn(x, y, z):
-            q = x + y - z
+            q = x + y - z.sigmoid()
+            print(q)
             w = -z
             if not x and not y and z:
                 m = x if not z else y
@@ -1558,5 +1609,20 @@ def test_ternary(a, b) -> (c):
             str(cu2.get_graph('test_ternary')),
         )
 
+    def test_python_frontend_run(self):
+        def func(x, y):
+            q = (x + y).sigmoid()
+            print(q)
+            w = -q
+            return w * w
+
+        x = Variable(torch.arange(4), requires_grad=True)
+        y = Variable(torch.arange(4) * 2, requires_grad=True)
+        with capture_stdout():
+            expected_out = func(x, y)
+        expected_out = (x + y).sigmoid().pow(2)
+        self.checkScript(func, [x, y], [expected_out], False)
+
+
 if __name__ == '__main__':
     run_tests()
diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp
index 5f227f81623987..c8600e00b2adb9 100644
--- a/torch/csrc/jit/init.cpp
+++ b/torch/csrc/jit/init.cpp
@@ -134,8 +134,8 @@ void initJITBindings(PyObject *module) {
   initPythonIRBindings(module);
   initPythonTracerBindings(module);
   python::initCompilerMixin(module);
-  script::initJitScriptBindings(module);
   script::initTreeViewBindings(module);
+  script::initJitScriptBindings(module);
 }
 
 }}
diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h
index 89d774b9b91c9e..bfd3caba1c2cad 100644
--- a/torch/csrc/jit/interned_strings.h
+++ b/torch/csrc/jit/interned_strings.h
@@ -144,6 +144,7 @@ _(device) \
 _(ReplaceIfUndef) \
 _(is_zero) \
 _(GraphExecutor) \
+_(Print) \
 _(mm) \
 _(t) \
 _(Loop) \
diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp
index d438b22207a4e6..73d66128539635 100644
--- a/torch/csrc/jit/interpreter.cpp
+++ b/torch/csrc/jit/interpreter.cpp
@@ -16,6 +16,8 @@
 #include "torch/csrc/jit/tensor_conversions.h"
 #include "torch/csrc/utils/auto_gil.h"
 
+#include <typeinfo>
+
 namespace py = pybind11;
 
 namespace torch { namespace jit {
@@ -528,6 +530,25 @@ Operation getOperation(jit::Node *node, bool constants_are_variables) {
       }
       return 0;
     };
+  IR_ELSEIF(Print)
+    size_t num_inputs = value->inputs().size();
+    return [num_inputs](Stack & stack) {
+      bool first = true;
+      for (at::Tensor i : last(stack, num_inputs)) {
+        if (!first) std::cout << " ";
+        first = false;
+        if (auto tensor_impl = dynamic_cast<at::TensorImpl*>(i.get())) {
+          std::cout << at::Tensor(tensor_impl, true);
+        } else if (!i.defined()) {
+          std::cout << "<undefined tensor>";
+        } else {
+          std::cout << "<" << typeid(*i.get()).name() << " at " << i << ">";
+        }
+      }
+      drop(stack, num_inputs);
+      std::cout << std::endl;
+      return 0;
+    };
   IR_ELSEIF(GraphExecutor)
     GraphExecutor executor(value->g(kSubgraph));
     auto num_inputs = value->inputs().size();
diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp
index 90a7e500fbaa6b..536ef6775c1f30 100644
--- a/torch/csrc/jit/script/compiler.cpp
+++ b/torch/csrc/jit/script/compiler.cpp
@@ -479,6 +479,12 @@ struct to_ir {
         auto apply = Apply(tree);
         if (function_table.count(apply.name().name()) > 0) {
           return emitFunctionCall(apply, output_size);
+        } else if (apply.name().name() == "print") {
+          expectOutputs(tree, output_size, 0);
+          if (!apply.attributes().empty())
+            throw ErrorReport(tree) << "print doesn't accept any keyword arguments";
+          return emitNode(kPrint, getValues(apply.inputs()), 0,
+                          AttributeMap{}, ListAttributeMap{})->outputs();
         } else {
           const auto& inputs = getValues(apply.inputs());
           NodeKind kind{apply.name().name()};
@@ -712,6 +718,10 @@ void CompilationUnit::define(const std::string& script) {
   return pImpl->define(script);
 }
 
+void CompilationUnit::defineFunction(const Def& def) {
+  return pImpl->defineFunction(def);
+}
+
 std::shared_ptr<Graph> CompilationUnit::getGraph(const std::string& func_name) {
   return pImpl->getGraph(func_name);
 }
diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h
index b20124756a7679..c43931f5b00344 100644
--- a/torch/csrc/jit/script/compiler.h
+++ b/torch/csrc/jit/script/compiler.h
@@ -3,6 +3,7 @@
 #include <string>
 
 #include "torch/csrc/jit/ir.h"
+#include "torch/csrc/jit/script/tree_views.h"
 
 namespace torch {
 namespace jit {
@@ -11,7 +12,8 @@ namespace script {
 struct CompilationUnitImpl;
 struct CompilationUnit {
   CompilationUnit();
-  void define(const std::string& str);
+  void define(const std::string& source);
+  void defineFunction(const Def& def);
   std::shared_ptr<Graph> getGraph(const std::string& func_name);
   ~CompilationUnit();
 
diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp
index 00613440d5ef7a..3add566bdb2d62 100644
--- a/torch/csrc/jit/script/init.cpp
+++ b/torch/csrc/jit/script/init.cpp
@@ -8,10 +8,10 @@ namespace script {
 void initJitScriptBindings(PyObject* module) {
   auto m = py::handle(module).cast<py::module>();
   py::class_<CompilationUnit>(m, "CompilationUnit")
-      .def(
-          "get_graph",
-          &CompilationUnit::getGraph,
-          py::return_value_policy::reference);
+    .def(py::init<>())
+    .def("get_graph", &CompilationUnit::getGraph,
+         py::return_value_policy::reference)
+    .def("define_function", &CompilationUnit::defineFunction);
   m.def("_jit_script_compile", jitScriptCompile);
 }
 
diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp
index d25aec3fc9ebe9..915026b3ae6072 100644
--- a/torch/csrc/jit/script/python_tree_views.cpp
+++ b/torch/csrc/jit/script/python_tree_views.cpp
@@ -130,7 +130,7 @@ void initTreeViewBindings(PyObject *module) {
     .def(py::init([](const Ident& name) {
       return Var::create(name.range(), name);
     }))
-    .def("name", [](const Var& var) { return var.name(); });
+    .def_property_readonly("name", [](const Var& var) { return var.name(); });
   py::class_<BinOp, Expr>(m, "BinOp")
     .def(py::init([](std::string kind, const Expr& lhs, const Expr& rhs) {
       return BinOp::create(lhs.range(), stringToKind(kind), lhs, rhs);
diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h
index 95115f0426e840..ae6e0b5d6f1def 100644
--- a/torch/csrc/jit/script/tree_views.h
+++ b/torch/csrc/jit/script/tree_views.h
@@ -135,6 +135,9 @@ struct List : public TreeView {
   iterator end() const {
     return iterator(tree_->trees().end());
   }
+  bool empty() const {
+    return tree_->trees().begin() == tree_->trees().end();
+  }
   T operator[](size_t i) const {
     return T(subtree(i));
   }
diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py
index 1f8cf306e2093d..cffcf6986eb074 100644
--- a/torch/jit/frontend.py
+++ b/torch/jit/frontend.py
@@ -10,9 +10,16 @@
 
 PY2 = sys.version_info[0] == 2
 _reserved_prefix = '__jit'
+_reserved_names = {'print'}
 _identifier_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits)
 
+
+def is_reserved_name(name):
+    return name.startswith(_reserved_prefix) or name in _reserved_names
+
+
 pretty_node_names = {
+    ast.FunctionDef: "function definitions",
     ast.For: "for loops",
     ast.Delete: "del statements",
     ast.ClassDef: "class definitions",
@@ -27,6 +34,7 @@
 }
 
 node_start_tokens = {
+    ast.FunctionDef: "def",
     ast.For: "for",
     ast.Delete: "del",
     ast.ClassDef: "class",
@@ -56,6 +64,7 @@
     })
 else:
     pretty_node_names.update({
+        ast.AsyncFunctionDef: "async function definitions",
         ast.AsyncFor: "async for loops",
         ast.AsyncWith: "async with statements",
         ast.Try: "try blocks",
@@ -63,6 +72,7 @@
     })
 
     node_start_tokens.update({
+        ast.AsyncFunctionDef: "async def",
         ast.AsyncFor: "async for",
         ast.AsyncWith: "async with",
         ast.Try: "try",
@@ -210,7 +220,7 @@ def get_assign_ident(ctx, expr):
         if not isinstance(var, Var):
             raise NotSupportedError("the only expressions allowed on the left hand side of "
                                     "assignments are variable names", var.range())
-        return var.name()
+        return var.name
 
     @staticmethod
     def build_Assign(ctx, stmt):
@@ -247,6 +257,14 @@ def build_If(ctx, stmt):
                   [build_stmt(ctx, s) for s in stmt.body],
                   [build_stmt(ctx, s) for s in stmt.orelse])
 
+    @staticmethod
+    def build_Print(ctx, stmt):
+        r = ctx.make_range(stmt.lineno, stmt.col_offset, stmt.col_offset + len("print"))
+        if stmt.dest:
+            raise NotSupportedError(r, "print statements with non-default destinations aren't supported")
+        args = [build_expr(ctx, val) for val in stmt.values]
+        return ExprStmt(Apply(Ident(r, "print"), args, []))
+
 
 class ExprBuilder(Builder):
     _MethodRef = namedtuple('MethodRef', ['self', 'name'])
@@ -294,15 +312,18 @@ def build_Attribute(ctx, expr):
     @staticmethod
     def build_Call(ctx, expr):
         ref = build_expr(ctx, expr.func, allow_methods=True)
-        if type(ref) is not ExprBuilder._MethodRef:
+        args = [build_expr(ctx, py_arg) for py_arg in expr.args]
+        kwargs = [Attribute(Ident(name), build_expr(ctx, value)) for name, value in expr.keywords]
+        if type(ref) is ExprBuilder._MethodRef:  # Method call
+            return Apply(ref.name, [ref.self] + args, kwargs)
+        elif isinstance(ref, Var):  # Top-level function call
+            return Apply(ref.name, args, kwargs)
+        else:
             ref_range = ref.range()
             parenthesis_range = find_after(ctx, ref_range.end, '(')
             raise FrontendTypeError(
                 ctx.make_raw_range(ref_range.start, parenthesis_range.end),
                 "trying to call a non-function object")
-        args = [build_expr(ctx, py_arg) for py_arg in expr.args]
-        kwargs = [Attribute(Ident(name), build_expr(ctx, value)) for name, value in expr.keywords]
-        return Apply(ref.name, [ref.self] + args, kwargs)
 
     @staticmethod
     def build_Name(ctx, expr):

From c06c6046e39f87a8e086709eed891fac10879615 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@mit.edu>
Date: Sat, 24 Feb 2018 12:24:24 -0500
Subject: [PATCH 3/8] Accept GPU perf test regression. (#5395)

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
---
 .jenkins/perf_test/perf_test_numbers.json | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.jenkins/perf_test/perf_test_numbers.json b/.jenkins/perf_test/perf_test_numbers.json
index c5c91070155e86..c49b1ecfff6522 100644
--- a/.jenkins/perf_test/perf_test_numbers.json
+++ b/.jenkins/perf_test/perf_test_numbers.json
@@ -22,8 +22,8 @@
 	},
 
 	"test_gpu_speed_word_language_model": {
-		"mean": "5.65807",
-		"sigma": "0.1132"
+		"mean": "5.9411499999999995",
+		"sigma": "0.02134777505971057"
 	},
 
 	"test_gpu_speed_cudnn_lstm": {

From 1ff537ca7198ddef7e76d0f75c001d0b3f41d7fd Mon Sep 17 00:00:00 2001
From: Tongzhou Wang <SsnL@users.noreply.github.com>
Date: Sat, 24 Feb 2018 13:32:13 -0500
Subject: [PATCH 4/8] Ignore FileNotFoundError when shutting down in
 data_queue.get (#5380)

* Ignore FileNotFoundError when shutting down in data_queue.get

* Address @apaszke comments
---
 torch/utils/data/dataloader.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py
index 43f4cb50d77e9b..bc2d8b213083bd 100644
--- a/torch/utils/data/dataloader.py
+++ b/torch/utils/data/dataloader.py
@@ -314,9 +314,15 @@ def _shutdown_workers(self):
             if not self.shutdown:
                 self.shutdown = True
                 self.done_event.set()
-                # if worker_manager_thread is waiting to put
-                while not self.data_queue.empty():
-                    self.data_queue.get()
+                # if worker_manager_thread is blocked on a put, drain the queue to make room for it
+                try:
+                    while not self.data_queue.empty():
+                        self.data_queue.get()
+                except FileNotFoundError:
+                    # FileNotFoundError can happen when we rebuild the fd
+                    # fetched from the queue but the socket is already closed
+                    # from the worker side (e.g. due to Python shutting down).
+                    pass
                 for _ in self.workers:
                     self.index_queue.put(None)
                 # done_event should be sufficient to exit worker_manager_thread,

From 40d79e4447acf218836132eabb9d4b0d7bfede77 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@mit.edu>
Date: Sat, 24 Feb 2018 17:04:25 -0500
Subject: [PATCH 5/8] Turn on ASAN in continuous integration. (#5271)

I know this works because I had to squelch a bunch of ASAN
errors in multiprocessing.

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
---
 .jenkins/build.sh             | 28 +++++++++++++++++++++++++++-
 .jenkins/disabled-configs.txt |  4 ----
 .jenkins/enabled-configs.txt  |  2 ++
 .jenkins/test.sh              | 10 ++++++++++
 test/test_multiprocessing.py  | 15 +++++++++++++++
 5 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 5d2a3b870e9ac7..0ae5c8082cd548 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -47,7 +47,33 @@ python --version
 
 pip install -r requirements.txt || true
 
-time python setup.py install
+if [[ "$JOB_NAME" == *asan* ]]; then
+    export ASAN_OPTIONS=detect_leaks=0:symbolize=1
+    # Disable Valgrind tests in run_aten_tests.sh; otherwise
+    # we'll be valgrind'ing an ASAN'ed binary!  ASANity.
+    export VALGRIND=0
+
+    sudo apt-get update
+    sudo apt-get install -y clang-5.0  # -y: a CI job has no terminal to answer the confirmation prompt
+
+    export PATH="/usr/lib/llvm-5.0/bin:$PATH"
+
+    # TODO: Figure out how to avoid hard-coding these paths
+    LD_LIBRARY_PATH=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux \
+      CC="sccache clang" \
+      CXX="sccache clang++" \
+      LDSHARED="clang --shared" \
+      LDFLAGS="-stdlib=libstdc++" \
+      CFLAGS="-fsanitize=address -shared-libasan" \
+      NO_CUDA=1 \
+      python setup.py install
+
+    export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so
+
+else
+    python setup.py install
+
+fi
 
 if [[ "$JOB_NAME" != *cuda* ]]; then
    echo "Testing ATen"
diff --git a/.jenkins/disabled-configs.txt b/.jenkins/disabled-configs.txt
index 2d37dbd199a605..cdd51d3fb54a56 100644
--- a/.jenkins/disabled-configs.txt
+++ b/.jenkins/disabled-configs.txt
@@ -3,7 +3,3 @@
 # fail.  You can use this to temporarily reserve a test name to
 # turn on CI side before PyTorch repository supports it.  This
 # file has the same format as .jenkins/enabled-configs.txt
-
-pytorch-linux-xenial-py3-clang5-asan
-pytorch-linux-xenial-py3-clang5-asan-build
-pytorch-linux-xenial-py3-clang5-asan-test
diff --git a/.jenkins/enabled-configs.txt b/.jenkins/enabled-configs.txt
index 3b3539abf7c041..7b43e9fcb15f74 100644
--- a/.jenkins/enabled-configs.txt
+++ b/.jenkins/enabled-configs.txt
@@ -11,6 +11,8 @@ pytorch-linux-xenial-cuda9-cudnn7-py2-build
 pytorch-linux-xenial-cuda9-cudnn7-py2-test
 pytorch-linux-xenial-cuda9-cudnn7-py3-build
 pytorch-linux-xenial-cuda9-cudnn7-py3-test
+pytorch-linux-xenial-py3-clang5-asan-build
+pytorch-linux-xenial-py3-clang5-asan-test
 pytorch-linux-trusty-py2.7.9-build
 pytorch-linux-trusty-py2.7.9-test
 pytorch-linux-trusty-py2.7-build
diff --git a/.jenkins/test.sh b/.jenkins/test.sh
index 9ab4c592428d65..6614bcc8d7f46a 100755
--- a/.jenkins/test.sh
+++ b/.jenkins/test.sh
@@ -40,6 +40,12 @@ echo "Testing pytorch"
 export OMP_NUM_THREADS=4
 export MKL_NUM_THREADS=4
 
+if [[ "$JOB_NAME" == *asan* ]]; then
+    export PATH="/usr/lib/llvm-5.0/bin:$PATH"
+    export ASAN_OPTIONS=detect_leaks=0:symbolize=1
+    export PYTORCH_TEST_WITH_ASAN=1
+fi
+
 # JIT C++ extensions require ninja.
 git clone https://github.com/ninja-build/ninja --quiet
 pushd ninja
@@ -47,6 +53,10 @@ python ./configure.py --bootstrap
 export PATH="$PWD:$PATH"
 popd
 
+if [[ "$JOB_NAME" == *asan* ]]; then
+    export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so
+fi
+
 time test/run_test.sh -- -v
 
 rm -rf ninja
diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
index a23af037db6c8c..181c881b73e862 100644
--- a/test/test_multiprocessing.py
+++ b/test/test_multiprocessing.py
@@ -21,6 +21,7 @@
     sys.platform != 'darwin' and \
     sys.platform != 'win32'
 TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
+TEST_WITH_ASAN = os.getenv('PYTORCH_TEST_WITH_ASAN', '0') == '1'  # only the value "1" turns the ASAN skips on
 
 
 class SubProcess(mp.Process):
@@ -246,10 +247,14 @@ def do_test():
                 do_test()
 
     @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS")
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "seems to hang with ASAN, see https://github.com/pytorch/pytorch/issues/5326")
     def test_fd_sharing(self):
         self._test_sharing(repeat=TEST_REPEATS)
 
     @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS")
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fd_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311")
     def test_fd_preserve_sharing(self):
         self._test_preserve_sharing(repeat=TEST_REPEATS)
 
@@ -257,19 +262,27 @@ def test_fd_preserve_sharing(self):
     def test_fd_pool(self):
         self._test_pool(repeat=TEST_REPEATS)
 
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fs_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5325")
     def test_fs_sharing(self):
         with fs_sharing():
             self._test_sharing(repeat=TEST_REPEATS)
 
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fs_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311")
     def test_fs_preserve_sharing(self):
         with fs_sharing():
             self._test_preserve_sharing(repeat=TEST_REPEATS)
 
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fs_pool is known buggy, see https://github.com/pytorch/pytorch/issues/5325")
     def test_fs_pool(self):
         with fs_sharing():
             self._test_pool(repeat=TEST_REPEATS)
 
     @unittest.skipIf(not HAS_SHM_FILES, "don't not how to check if shm files exist")
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fs is known buggy, see https://github.com/pytorch/pytorch/issues/5325")
     def test_fs(self):
         def queue_put():
             x = torch.DoubleStorage(4)
@@ -409,6 +422,8 @@ def _test_is_shared(self):
     def test_is_shared(self):
         self._test_is_shared()
 
+    @unittest.skipIf(TEST_WITH_ASAN,
+                     "test_fs_is_shared is known buggy, see https://github.com/pytorch/pytorch/issues/5325")
     def test_fs_is_shared(self):
         with fs_sharing():
             self._test_is_shared()

From d2f71cbdebe48b64888f27b0c97ffb346dacecc2 Mon Sep 17 00:00:00 2001
From: Soumith Chintala <soumith@gmail.com>
Date: Sat, 24 Feb 2018 19:37:00 -0500
Subject: [PATCH 6/8] make CuDNN finders respect library major version (#5399)

---
 aten/cmake/FindCuDNN.cmake   | 12 ++++++++++--
 setup.py                     |  3 +--
 tools/setup_helpers/cudnn.py | 25 +++++++++++++++++--------
 3 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/aten/cmake/FindCuDNN.cmake b/aten/cmake/FindCuDNN.cmake
index 78a6d0eede1735..f7ceed91da96b8 100644
--- a/aten/cmake/FindCuDNN.cmake
+++ b/aten/cmake/FindCuDNN.cmake
@@ -15,13 +15,21 @@ include(FindPackageHandleStandardArgs)
 
 set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN")
 
-find_path(CUDNN_INCLUDE_DIR cudnn.h
+if(NOT "$ENV{CUDNN_INCLUDE_DIR}" STREQUAL "")  # honor an explicit override from the environment
+  SET(CUDNN_INCLUDE_DIR "$ENV{CUDNN_INCLUDE_DIR}")
+else()
+  find_path(CUDNN_INCLUDE_DIR cudnn.h
     HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
     PATH_SUFFIXES cuda/include include)
+endif()
 
-find_library(CUDNN_LIBRARY cudnn
+if(NOT "$ENV{CUDNN_LIBRARY}" STREQUAL "")  # honor an explicit override from the environment
+  SET(CUDNN_LIBRARY "$ENV{CUDNN_LIBRARY}")
+else()
+  find_library(CUDNN_LIBRARY cudnn
     HINTS ${CUDNN_LIB_DIR} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}
     PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64)
+endif()
 
 find_package_handle_standard_args(
     CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARY)
diff --git a/setup.py b/setup.py
index 0afe1895c56e8c..b9da0241e28251 100644
--- a/setup.py
+++ b/setup.py
@@ -709,8 +709,7 @@ def run(self):
         "torch/csrc/cuda/python_nccl.cpp",
     ]
 if WITH_CUDNN:
-    main_libraries += ['cudnn']
-    library_dirs.insert(0, CUDNN_LIB_DIR)
+    main_libraries += [CUDNN_LIBRARY]
     # NOTE: these are at the front, in case there's another cuDNN in CUDA path
     include_dirs.insert(0, CUDNN_INCLUDE_DIR)
     if not IS_WINDOWS:
diff --git a/tools/setup_helpers/cudnn.py b/tools/setup_helpers/cudnn.py
index 1cc48a7c11179d..566635681171b0 100644
--- a/tools/setup_helpers/cudnn.py
+++ b/tools/setup_helpers/cudnn.py
@@ -35,6 +35,22 @@
     if IS_CONDA:
         lib_paths.append(os.path.join(CONDA_DIR, 'lib'))
         include_paths.append(os.path.join(CONDA_DIR, 'include'))
+    CUDNN_INCLUDE_VERSION = -1  # bound up front so the library glob below never hits an unbound name
+    for path in include_paths:
+        if path is None or not os.path.exists(path):
+            continue
+        include_file_path = os.path.join(path, 'cudnn.h')
+        if os.path.exists(include_file_path):
+            CUDNN_INCLUDE_DIR = path
+            with open(include_file_path) as f:
+                for line in f:
+                    if "#define CUDNN_MAJOR" in line:
+                        CUDNN_INCLUDE_VERSION = int(line.split()[-1])  # last token is the major version
+                        break
+            if CUDNN_INCLUDE_VERSION == -1:
+                raise AssertionError("Could not find #define CUDNN_MAJOR in " + include_file_path)
+            break
+
     for path in lib_paths:
         if path is None or not os.path.exists(path):
             continue
@@ -45,18 +61,11 @@
                 CUDNN_LIB_DIR = path
                 break
         else:
-            libraries = sorted(glob.glob(os.path.join(path, 'libcudnn*')))
+            libraries = sorted(glob.glob(os.path.join(path, 'libcudnn*' + str(CUDNN_INCLUDE_VERSION) + "*")))
             if libraries:
                 CUDNN_LIBRARY = libraries[0]
                 CUDNN_LIB_DIR = path
                 break
-    for path in include_paths:
-        if path is None or not os.path.exists(path):
-            continue
-        if os.path.exists((os.path.join(path, 'cudnn.h'))):
-            CUDNN_INCLUDE_DIR = path
-            break
-
     # Specifying the library directly will overwrite the lib directory
     library = os.getenv('CUDNN_LIBRARY')
     if library is not None and os.path.exists(library):

From d7488f4b59406e47994da587b54fb019a22a0130 Mon Sep 17 00:00:00 2001
From: Rachit Singh <rachitsingh@outlook.com>
Date: Tue, 16 Jan 2018 19:17:09 +0000
Subject: [PATCH 7/8] Refactor standard_gamma and implement CUDA gamma sampling

---
 aten/src/ATen/Declarations.cwrap              | 20 ------
 aten/src/ATen/native/Distributions.cpp        | 27 ++++++++
 aten/src/ATen/native/Distributions.cuh        | 63 +++++++++++++++++++
 aten/src/ATen/native/cuda/Distributions.cu    | 31 +++++++++
 aten/src/ATen/native/native_functions.yaml    |  6 ++
 aten/src/TH/THRandom.cpp                      | 29 ---------
 aten/src/TH/THRandom.h                        |  6 --
 aten/src/TH/generic/THTensorRandom.cpp        | 10 ---
 aten/src/TH/generic/THTensorRandom.h          |  1 -
 test/test_distributions.py                    | 13 +++-
 tools/autograd/derivatives.yaml               |  4 +-
 torch/csrc/Module.cpp                         |  2 -
 torch/csrc/generic/methods/TensorRandom.cwrap | 20 ------
 torch/distributions/dirichlet.py              |  3 +-
 torch/distributions/gamma.py                  |  4 +-
 15 files changed, 145 insertions(+), 94 deletions(-)
 create mode 100644 aten/src/ATen/native/Distributions.cuh

diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap
index a59e1c1a0b6438..d426adaad541d5 100644
--- a/aten/src/ATen/Declarations.cwrap
+++ b/aten/src/ATen/Declarations.cwrap
@@ -3966,26 +3966,6 @@
       kwarg_only: True
     - THTensor* self
 ]]
-[[
-  name: _standard_gamma
-  types:
-    - floating_point
-  backends:
-    - CPU
-  return: argument 0
-  variants:
-    - method
-    - function
-  options:
-    - cname: standard_gamma
-      arguments:
-        - arg: THTensor* output
-          output: True
-        - arg: THGenerator* generator
-          default: nullptr
-          kwarg_only: True
-        - THTensor* self
-]]
 [[
   name: _dirichlet_grad
   types:
diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp
index b1d7a086c6b448..67fc5f607450ab 100644
--- a/aten/src/ATen/native/Distributions.cpp
+++ b/aten/src/ATen/native/Distributions.cpp
@@ -8,6 +8,8 @@
 #include "ATen/CheckGenerator.h"
 #include "ATen/Generator.h"
 
+#include <ATen/native/Distributions.cuh>
+
 #include "TH/THRandom.h"
 
 namespace at {
@@ -155,6 +157,24 @@ namespace dist {
     return gen_->generator;
   }
 
+  template <typename scalar>
+  struct GammaOp {
+    static void apply(Tensor& ret, const Tensor& alpha, THGenerator *generator) {
+      CPU_tensor_apply2<scalar, double>(ret, alpha,
+        [generator](scalar& ret_val, const double& alpha){
+          dist::baseSampler<float> standard_uniform([generator] () {
+            return THRandom_standard_uniform(generator);
+          });
+          dist::baseSampler<float> standard_normal([generator] () {
+            return THRandom_normal(generator, 0.0, 1.0);
+          });
+          auto sample = dist::sample_gamma<float>(alpha, standard_uniform, standard_normal);
+          ret_val = std::max(std::numeric_limits<scalar>::min(), (scalar) sample);
+        }
+      );
+    }
+  };
+
   template <typename scalar>
   struct PoissonOp {
     static int64_t sample_poisson(double lambda, THGenerator *generator) {
@@ -227,5 +247,12 @@ Tensor _s_poisson_cpu(const Tensor& lambda, Generator *gen) {
   return ret;
 }
 
+Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) {
+  Tensor ret = alpha.type().zeros(alpha.sizes());
+  auto alpha_ = alpha.toType(ScalarType::Double);
+  dispatch_floating_types<void, dist::GammaOp>(ret.type(), "gamma", ret, alpha_, dist::get_generator(gen));
+  return ret;
+}
+
 } // at::native
 } // at
diff --git a/aten/src/ATen/native/Distributions.cuh b/aten/src/ATen/native/Distributions.cuh
new file mode 100644
index 00000000000000..7d6da545eea228
--- /dev/null
+++ b/aten/src/ATen/native/Distributions.cuh
@@ -0,0 +1,63 @@
+#include "ATen/Config.h"
+#include <functional>
+#if AT_CUDA_ENABLED()
+#include <nvfunctional>
+#endif
+
+namespace at {
+namespace native {
+namespace dist {
+
+  // this wraps sampling primitives to expose a common interface
+  template<typename precision_t>
+  struct baseSampler {
+#if AT_CUDA_ENABLED()
+    nvstd::function<precision_t(void)> sampler;
+    __device__ baseSampler(nvstd::function<precision_t(void)> sampler): sampler(sampler) {}
+    __device__ precision_t sample() {
+      return sampler();
+    }
+#else
+    std::function<precision_t(void)> sampler;
+    baseSampler(std::function<precision_t(void)> sampler): sampler(sampler) {}
+    precision_t sample() {
+      return sampler();
+    }
+#endif
+  };
+
+  template<typename precision_t>
+#if AT_CUDA_ENABLED()
+  __host__ __device__
+#endif
+  precision_t sample_gamma(precision_t alpha, baseSampler<precision_t>& standard_uniform, baseSampler<precision_t>& standard_normal) {
+		precision_t scale = 1.0;
+
+		// Boost alpha for higher acceptance probability.
+		if (alpha < 1.0) {
+			scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha);
+			alpha += 1.0;
+		}
+
+		// This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
+		// doi:10.1145/358407.358414
+		const precision_t d = alpha - 1.0 / 3.0;
+		const precision_t c = 1.0 / ::sqrt(9.0 * d);
+		for (;;) {
+			precision_t x, y;
+			do {
+				x = standard_normal.sample();
+				y = 1.0 + c * x;
+			} while (y <= 0);
+			const precision_t v = y * y * y;
+			const precision_t u = 1 - standard_uniform.sample();
+			const precision_t xx = x * x;
+			if (u < 1.0 - 0.0331 * xx * xx)
+				return scale * d * v;
+			if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v)))
+				return scale * d * v;
+		}
+	}
+} // dist
+} // native
+} // at
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index 859e97e0e4503a..81f26715dfee95 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -5,6 +5,10 @@
 #include <curand_kernel.h>
 #include <curand_philox4x32_x.h>
 #include <utility>
+#include <functional>
+#include <nvfunctional>
+
+#include "ATen/native/Distributions.cuh"
 
 #include <TH/THAtomic.h>
 
@@ -26,6 +30,26 @@ namespace dist {
     return std::make_pair(gen_->initial_seed, offset);
   }
 
+  template <typename scalar>
+  struct GammaOpCUDA {
+    static void apply(Tensor& ret, const Tensor& alpha, std::pair<uint64_t, uint64_t> seeds) {
+      at::cuda::CUDA_tensor_apply2<scalar, float>(ret, alpha,
+        [seeds] __device__ (scalar& ret_val, const float& alpha, bool early_exit) {
+          curandStatePhilox4_32_10_t state;
+          curand_init(seeds.first, blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state);
+          baseSampler<float> standard_uniform([&state] __device__ () {
+            return curand_uniform(&state);
+          });
+          baseSampler<float> standard_normal([&state] __device__ () {
+            return curand_normal(&state);
+          });
+          auto sample = scalar_cast<scalar>(sample_gamma<float>(alpha, standard_uniform, standard_normal));
+          ret_val = ::max(THCNumerics<scalar>::min(), (scalar) sample);
+        }
+      );
+    }
+  };
+
   template <typename scalar>
   struct PoissonOpCUDA {
     static void apply(Tensor& ret, const Tensor& lambda, std::pair<uint64_t, uint64_t> seeds) {
@@ -48,5 +72,12 @@ Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) {
   return ret;
 }
 
+Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) {
+  Tensor ret = alpha.type().tensor(alpha.sizes());
+  auto alpha_ = alpha.toType(ScalarType::Float);
+  dispatch_floating_types<void, dist::GammaOpCUDA>(ret.type(), "gamma", ret, alpha_, dist::next_philox_seed(gen));
+  return ret;
+}
+
 } // at::native
 } // at
diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 1460ca4bbb0c5e..f3a81814c64895 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -392,6 +392,12 @@
     CPU: _s_poisson_cpu
     CUDA: _s_poisson_cuda
 
+- func: standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor
+  variants: function
+  dispatch:
+    CPU: _s_gamma_cpu
+    CUDA: _s_gamma_cuda
+
 - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor
   variants: function
 
diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp
index 99cf4ff58ba67d..a5c35fdea75ffa 100644
--- a/aten/src/TH/THRandom.cpp
+++ b/aten/src/TH/THRandom.cpp
@@ -290,35 +290,6 @@ double THRandom_exponential(THGenerator *_generator, double lambda)
   return(-1. / lambda * log(1-uniform_double(_generator)));
 }
 
-double THRandom_standard_gamma(THGenerator *_generator, double alpha) {
-  double scale = 1.0;
-
-  // Boost alpha for higher acceptance probability.
-  if(alpha < 1.0) {
-    scale *= pow(1 - uniform_double(_generator), 1.0 / alpha);
-    alpha += 1.0;
-  }
-
-  // This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
-  // doi:10.1145/358407.358414
-  const double d = alpha - 1.0 / 3.0;
-  const double c = 1.0 / sqrt(9.0 * d);
-  for(;;) {
-    double x, y;
-    do {
-      x = THRandom_normal(_generator, 0.0, 1.0);
-      y = 1.0 + c * x;
-    } while(y <= 0);
-    const double v = y * y * y;
-    const double u = 1 - uniform_double(_generator);
-    const double xx = x * x;
-    if(u < 1.0 - 0.0331 * xx * xx)
-      return scale * d * v;
-    if(log(u) < 0.5 * xx + d * (1.0 - v + log(v)))
-      return scale * d * v;
-  }
-}
-
 double THRandom_cauchy(THGenerator *_generator, double median, double sigma)
 {
   return(median + sigma * tan(M_PI*(uniform_double(_generator)-0.5)));
diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h
index 33720af4ea95df..b7a827f49e0dba 100644
--- a/aten/src/TH/THRandom.h
+++ b/aten/src/TH/THRandom.h
@@ -63,12 +63,6 @@ TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv)
 */
 TH_API double THRandom_exponential(THGenerator *_generator, double lambda);
 
-/** Generates a random number from a standard Gamma distribution.
-    The Gamma density is proportional to $x^{alpha-1} exp(-x)$
-    The shape parameter alpha (a.k.a. k) is a positive real number.
-*/
-TH_API double THRandom_standard_gamma(THGenerator *_generator, double alpha);
-
 /** Returns a random number from a Cauchy distribution.
     The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$
 */
diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp
index 746f11073a2de4..01ba6a81e97173 100644
--- a/aten/src/TH/generic/THTensorRandom.cpp
+++ b/aten/src/TH/generic/THTensorRandom.cpp
@@ -138,16 +138,6 @@ void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lamb
   TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_exponential(_generator, lambda););
 }
 
-void THTensor_(standard_gamma)(THTensor *self, THGenerator *_generator, THTensor *alpha)
-{
-  std::lock_guard<std::mutex> lock(_generator->mutex);
-  THTensor_(resizeAs)(self, alpha);
-  TH_TENSOR_APPLY2(real, self, real, alpha, {
-    const real sample = THRandom_standard_gamma(_generator, *alpha_data);
-    *self_data = sample > 0 ? sample : TH_REAL_MIN;
-  });
-}
-
 #undef TH_REAL_MIN
 
 void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma)
diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h
index 646e497f325f81..dc6bdafaae703c 100644
--- a/aten/src/TH/generic/THTensorRandom.h
+++ b/aten/src/TH/generic/THTensorRandom.h
@@ -18,7 +18,6 @@ TH_API void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor *
 TH_API void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs);
 TH_API void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs);
 TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda);
-TH_API void THTensor_(standard_gamma)(THTensor *self, THGenerator *_generator, THTensor *alpha);
 TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma);
 TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv);
 TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement);
diff --git a/test/test_distributions.py b/test/test_distributions.py
index 718986dda45bb4..26a3108ab5cf7b 100644
--- a/test/test_distributions.py
+++ b/test/test_distributions.py
@@ -712,7 +712,7 @@ def test_poisson_sample(self):
     @unittest.skipIf(not TEST_CUDA, "CUDA not found")
     @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
     def test_poisson_gpu_sample(self):
-        set_rng_seed(0)
+        set_rng_seed(1)
         for rate in [0.12, 0.9, 4.0]:
             self._check_sampler_discrete(Poisson(torch.Tensor([rate]).cuda()),
                                          scipy.stats.poisson(rate),
@@ -1089,6 +1089,17 @@ def test_gamma_sample(self):
                                         scipy.stats.gamma(alpha, scale=1.0 / beta),
                                         'Gamma(concentration={}, rate={})'.format(alpha, beta))
 
+    @unittest.skipIf(not TEST_CUDA, "CUDA not found")
+    @unittest.skipIf(not TEST_NUMPY, "Numpy not found")
+    def test_gamma_gpu_sample(self):
+        # Run the sampler check on CUDA Gamma against the matching scipy.stats.gamma reference.
+        set_rng_seed(0)
+        for alpha, beta in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]):
+            a, b = torch.Tensor([alpha]).cuda(), torch.Tensor([beta]).cuda()
+            self._check_sampler_sampler(Gamma(a, b),
+                                        scipy.stats.gamma(alpha, scale=1.0 / beta),
+                                        'Gamma(concentration={}, rate={})'.format(alpha, beta), failure_rate=1e-4)
+
     @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
     def test_pareto(self):
         scale = Variable(torch.randn(2, 3).abs(), requires_grad=True)
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index fbd4efdb08c795..7f867d3b7a2992 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -663,8 +663,8 @@
   self: not_implemented("_sparse_mask")
   mask: not_implemented("_sparse_mask")
 
-- name: _standard_gamma(Tensor self, Generator generator)
-  self: grad * self._standard_gamma_grad(output)
+- name: standard_gamma(Tensor self, Generator generator)
+  self: grad * self._standard_gamma_grad(result)
 
 - name: _standard_gamma_grad(Tensor self, Tensor output)
   self: not_implemented("_standard_gamma_grad")
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index aa957711ac54af..249416edd290f0 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -302,7 +302,6 @@ IMPLEMENT_STATELESS(bmm)
 // TODO: this doesn't implement options that return numbers!
 IMPLEMENT_STATELESS(multinomial)
 IMPLEMENT_STATELESS(normal)
-IMPLEMENT_STATELESS(_standard_gamma)
 IMPLEMENT_STATELESS(_dirichlet_grad)
 IMPLEMENT_STATELESS(bernoulli)
 IMPLEMENT_STATELESS(range)
@@ -752,7 +751,6 @@ static PyMethodDef TorchMethods[] = {
   {"bmm",             (PyCFunction)THPModule_bmm,               METH_VARARGS | METH_KEYWORDS, NULL},
   {"multinomial",     (PyCFunction)THPModule_multinomial,       METH_VARARGS | METH_KEYWORDS, NULL},
   {"normal",          (PyCFunction)THPModule_normal,            METH_VARARGS | METH_KEYWORDS, NULL},
-  {"_standard_gamma", (PyCFunction)THPModule__standard_gamma,   METH_VARARGS | METH_KEYWORDS, NULL},
   {"_dirichlet_grad", (PyCFunction)THPModule__dirichlet_grad,   METH_VARARGS | METH_KEYWORDS, NULL},
   {"bernoulli",       (PyCFunction)THPModule_bernoulli,         METH_VARARGS | METH_KEYWORDS, NULL},
   {"rand",            (PyCFunction)THPModule_rand,              METH_VARARGS | METH_KEYWORDS, NULL},
diff --git a/torch/csrc/generic/methods/TensorRandom.cwrap b/torch/csrc/generic/methods/TensorRandom.cwrap
index a0ee66ec033481..62dc18dd273671 100644
--- a/torch/csrc/generic/methods/TensorRandom.cwrap
+++ b/torch/csrc/generic/methods/TensorRandom.cwrap
@@ -210,26 +210,6 @@
       default: 1
 ]]
 
-[[
-  name: _standard_gamma
-  types:
-    - floating_point
-  backends:
-    - CPU
-  return: argument 0
-  variants:
-    - function
-  options:
-    - cname: standard_gamma
-      arguments:
-        - arg: THTensor* output
-          output: True
-        - arg: THGenerator* generator
-          default: THPGenerator_TH_CData(THPDefaultGenerator)
-          kwarg_only: True
-        - THTensor* alpha
-]]
-
 [[
   name: _dirichlet_grad
   types:
diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py
index 87b7bb9204fc2d..0c945f37d4c10d 100644
--- a/torch/distributions/dirichlet.py
+++ b/torch/distributions/dirichlet.py
@@ -5,11 +5,12 @@
 from torch.autograd.function import once_differentiable
 from torch.distributions import constraints
 from torch.distributions.exp_family import ExponentialFamily
+from torch.distributions.gamma import _standard_gamma
 from torch.distributions.utils import _finfo, broadcast_all
 
 
 def _dirichlet_sample_nograd(concentration):
-    probs = torch._C._standard_gamma(concentration)
+    probs = _standard_gamma(concentration)
     probs /= probs.sum(-1, True)
     eps = _finfo(probs).eps
     return probs.clamp_(min=eps, max=1 - eps)
diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py
index 60ec109ef1a359..8123e4bce46ae6 100644
--- a/torch/distributions/gamma.py
+++ b/torch/distributions/gamma.py
@@ -10,8 +10,8 @@
 
 def _standard_gamma(concentration):
     if not isinstance(concentration, Variable):
-        return torch._C._standard_gamma(concentration)
-    return concentration._standard_gamma()
+        return torch._C._VariableFunctions.standard_gamma(Variable(concentration)).data
+    return torch._C._VariableFunctions.standard_gamma(concentration)
 
 
 class Gamma(ExponentialFamily):

From 6ff5d33ed7e00df70b8914c9e23d49d71077eda5 Mon Sep 17 00:00:00 2001
From: Rachit Singh <rachitsingh@outlook.com>
Date: Wed, 31 Jan 2018 13:21:25 +0000
Subject: [PATCH 8/8] Attempt fixes for AT_CUDA_ENABLED changes

---
 aten/src/ATen/SharedDist.cu                | 24 +++++++++
 aten/src/ATen/native/Distributions.cpp     | 55 +++++++++++++++++--
 aten/src/ATen/native/Distributions.cuh     | 63 ----------------------
 aten/src/ATen/native/cuda/Distributions.cu |  6 ++-
 4 files changed, 80 insertions(+), 68 deletions(-)
 create mode 100644 aten/src/ATen/SharedDist.cu
 delete mode 100644 aten/src/ATen/native/Distributions.cuh

diff --git a/aten/src/ATen/SharedDist.cu b/aten/src/ATen/SharedDist.cu
new file mode 100644
index 00000000000000..3bba673e5b073c
--- /dev/null
+++ b/aten/src/ATen/SharedDist.cu
@@ -0,0 +1,24 @@
+#include "ATen/ATen.h"
+#include "ATen/TensorUtils.h"
+#include "ATen/NativeFunctions.h"
+#include "ATen/Dispatch.h"
+#include "ATen/Config.h"
+
+#include <nvfunctional>
+ 
+namespace at {
+  namespace native {
+    namespace dist {
+      template<typename precision_t>
+      struct baseSampler {
+        nvstd::function<precision_t(void)> sampler;
+        baseSampler(nvstd::function<precision_t(void)> sampler): sampler(sampler) {}
+        precision_t sample() {
+          return sampler();
+        }
+      };
+    }
+  }
+}
+
+// this version is only linked if CUDA is enabled, so we can safely just use CUDA features here
diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp
index 67fc5f607450ab..7ca705e267ceca 100644
--- a/aten/src/ATen/native/Distributions.cpp
+++ b/aten/src/ATen/native/Distributions.cpp
@@ -1,6 +1,7 @@
 #include "ATen/ATen.h"
 #include "ATen/CPUApplyUtils.h"
 #include "ATen/Dispatch.h"
+#include "ATen/Config.h"
 #include "ATen/ExpandUtils.h"
 #include "ATen/NativeFunctions.h"
 
@@ -8,7 +9,7 @@
 #include "ATen/CheckGenerator.h"
 #include "ATen/Generator.h"
 
-#include <ATen/native/Distributions.cuh>
+#include <functional>
 
 #include "TH/THRandom.h"
 
@@ -121,12 +122,23 @@ Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) {
 
 /*
  * This section is a counterpart to Distributions.cu
- *
  */
 
 namespace dist {
-  // The function `sample_poisson`
-  // is adapted from Numpy's distributions.c implementation.
+
+#ifndef __CUDACC__  // host compilers get this std::function flavour; nvcc uses the one in ATen/SharedDist.cu
+  template<typename precision_t>
+  struct baseSampler {  // wraps a sampling primitive behind a common sample() interface
+    std::function<precision_t(void)> sampler;  // the wrapped zero-argument draw function
+    baseSampler(std::function<precision_t(void)> sampler): sampler(sampler) {}
+    precision_t sample() {  // forwards to the wrapped callable and returns its result
+      return sampler();
+    }
+  };
+#endif  // !__CUDACC__
+  
+  // The functions `sample_poisson`, `sample_gamma`
+  // are adapted from NumPy's distributions.c implementation.
   // It is MIT licensed, so here is the copyright:
 
   /* Copyright 2005 Robert Kern (robert.kern@gmail.com)
@@ -151,6 +163,41 @@ namespace dist {
    * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    */
 
+  
+  template<typename precision_t>
+#ifdef __CUDACC__  // the qualifiers below only exist when nvcc compiles this file
+  __host__ __device__
+#endif
+  precision_t sample_gamma(precision_t alpha, baseSampler<precision_t>& standard_uniform, baseSampler<precision_t>& standard_normal) {
+    // Returns one gamma variate with shape `alpha`, drawing from the two supplied samplers.
+    precision_t scale = 1.0;
+
+    // Boost alpha for higher acceptance probability.
+    if (alpha < 1.0) {
+      scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha);
+      alpha += 1.0;  // `alpha` is a by-value parameter, so the caller's value is untouched
+    }
+
+    // This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
+    // doi:10.1145/358407.358414
+    const precision_t d = alpha - 1.0 / 3.0;
+    const precision_t c = 1.0 / ::sqrt(9.0 * d);
+    for (;;) {
+      precision_t x, y;
+      do {
+        x = standard_normal.sample();
+        y = 1.0 + c * x;
+      } while (y <= 0);  // redraw until y is strictly positive (v = y^3 feeds ::log below)
+      const precision_t v = y * y * y;
+      const precision_t u = 1 - standard_uniform.sample();
+      const precision_t xx = x * x;
+      if (u < 1.0 - 0.0331 * xx * xx)  // log-free acceptance test is tried first
+        return scale * d * v;
+      if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v)))
+        return scale * d * v;
+    }
+  }
+
   THGenerator * get_generator(Generator *gen) {
     auto default_gen = &at::globalContext().defaultGenerator(Backend::CPU);
     auto gen_ = check_generator<CPUGenerator>(gen, default_gen);
diff --git a/aten/src/ATen/native/Distributions.cuh b/aten/src/ATen/native/Distributions.cuh
deleted file mode 100644
index 7d6da545eea228..00000000000000
--- a/aten/src/ATen/native/Distributions.cuh
+++ /dev/null
@@ -1,63 +0,0 @@
-#include "ATen/Config.h"
-#include <functional>
-#if AT_CUDA_ENABLED()
-#include <nvfunctional>
-#endif
-
-namespace at {
-namespace native {
-namespace dist {
-
-  // this wraps sampling primitives to expose a common interface
-  template<typename precision_t>
-  struct baseSampler {
-#if AT_CUDA_ENABLED()
-    nvstd::function<precision_t(void)> sampler;
-    __device__ baseSampler(nvstd::function<precision_t(void)> sampler): sampler(sampler) {}
-    __device__ precision_t sample() {
-      return sampler();
-    }
-#else
-    std::function<precision_t(void)> sampler;
-    baseSampler(std::function<precision_t(void)> sampler): sampler(sampler) {}
-    precision_t sample() {
-      return sampler();
-    }
-#endif
-  };
-
-  template<typename precision_t>
-#if AT_CUDA_ENABLED()
-  __host__ __device__
-#endif
-  precision_t sample_gamma(precision_t alpha, baseSampler<precision_t>& standard_uniform, baseSampler<precision_t>& standard_normal) {
-		precision_t scale = 1.0;
-
-		// Boost alpha for higher acceptance probability.
-		if (alpha < 1.0) {
-			scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha);
-			alpha += 1.0;
-		}
-
-		// This implements the acceptance-rejection method of Marsaglia and Tsang (2000)
-		// doi:10.1145/358407.358414
-		const precision_t d = alpha - 1.0 / 3.0;
-		const precision_t c = 1.0 / ::sqrt(9.0 * d);
-		for (;;) {
-			precision_t x, y;
-			do {
-				x = standard_normal.sample();
-				y = 1.0 + c * x;
-			} while (y <= 0);
-			const precision_t v = y * y * y;
-			const precision_t u = 1 - standard_uniform.sample();
-			const precision_t xx = x * x;
-			if (u < 1.0 - 0.0331 * xx * xx)
-				return scale * d * v;
-			if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v)))
-				return scale * d * v;
-		}
-	}
-} // dist
-} // native
-} // at
diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu
index 81f26715dfee95..b6e62c7b6d70a3 100644
--- a/aten/src/ATen/native/cuda/Distributions.cu
+++ b/aten/src/ATen/native/cuda/Distributions.cu
@@ -1,5 +1,8 @@
+#include "ATen/ATen.h"
+#include "ATen/TensorUtils.h"
 #include "ATen/NativeFunctions.h"
 #include "ATen/Dispatch.h"
+#include "ATen/Config.h"
 #include "ATen/cuda/CUDAApplyUtils.cuh"
 #include <curand.h>
 #include <curand_kernel.h>
@@ -8,7 +11,8 @@
 #include <functional>
 #include <nvfunctional>
 
-#include "ATen/native/Distributions.cuh"
+#include "ATen/SharedDist.cu"
+#include "ATen/native/Distributions.cpp"
 
 #include <TH/THAtomic.h>