From fbf1f065212e28ab1b19a124f2b5ceba8ebd9592 Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 24 Feb 2018 11:15:43 +0100 Subject: [PATCH 1/8] Implement no-attribute dispatch of ATen ops from the JIT (#5298) --- tools/jit/gen_jit_dispatch.py | 167 ++++++++++++++++++-------- tools/jit/templates/aten_dispatch.cpp | 7 ++ torch/csrc/jit/tensor_conversions.h | 89 ++++++++++++-- 3 files changed, 208 insertions(+), 55 deletions(-) diff --git a/tools/jit/gen_jit_dispatch.py b/tools/jit/gen_jit_dispatch.py index 657eedcca570d9..40c0792e37670d 100644 --- a/tools/jit/gen_jit_dispatch.py +++ b/tools/jit/gen_jit_dispatch.py @@ -28,19 +28,25 @@ 'IntList': 'std::vector', } -ATTR_ASSIGNMENT = CodeTemplate("""\ +KW_ASSIGNMENT = CodeTemplate("""\ auto ${name} = ${type_cast}(node->${method}(Symbol("${name}")));\ """) +POS_ASSIGNMENT = CodeTemplate("""\ +auto ${name} = tensor_as<${type}>(std::move(fromLast(stack, ${arg_idx})));\ +""") + CALL_NAMESPACE = CodeTemplate("at::${name}(${args})") CALL_METHOD = CodeTemplate("(${first}).${name}(${args})") CONSTRUCTOR = CodeTemplate("""\ {"${descriptor}", [](Node *node) { - ${assignments} + ${kw_assignments} return TensorOp([=](Stack & stack) { autograd::profiler::RecordFunction record("${name}"); - AutoGPU device_guard(deviceForInputs(stack, ${num_inputs})); + AutoGPU device_guard(deviceForInputs(stack, ${num_inputs} + ${num_dropped_args})); + ${pos_assignments} + ${pos_arg_drop} auto result = ${call}; drop(stack, ${num_inputs}); pack(stack, std::move(result)); @@ -51,18 +57,33 @@ def is_jit_op(decl): + uses_tensors = any(arg['simple_type'] in {'Tensor', 'TensorList'} for arg in decl['arguments']) or \ + 'Tensor' in decl['method_of'] return (not decl['api_name'].endswith('_') and not decl['name'].endswith('_out') and not any(arg['simple_type'] == 'Generator' for arg in decl['arguments']) and not any(arg['simple_type'] == 'SparseTensor' for arg in decl['arguments']) and not any(arg['simple_type'] == 'Storage' for arg in decl['arguments']) and not any(arg['simple_type'] == 'Type' for arg in decl['arguments']) and - any(arg['simple_type'] in {'Tensor', 'TensorList'} for arg in decl['arguments']) and - 'Tensor' in decl['return_type']) + uses_tensors) + + +skip_scalar_overload = { + 'lt-2': [1], 'gt-2': [1], 'le-2': [1], 'ge-2': [1], 'eq-2': [1], 'ne-2': [1], + 'pow-2': [0, 1], 'add-3': [1], 'sub-3': [1], 'mul-2': [1], 'div-2': [1], + 'fmod-2': [1], 'remainder-2': [1] +} def gen_jit_dispatch(declarations, out): - aten_decls = load_aten_declarations(declarations) + # We need to add methods implemented manually in TensorImpl + tensor_impl_methods = [{ + 'name': name, + 'api_name': name, + 'method_of': ['Tensor'], + 'arguments': [{'name': 'self', 'simple_type': 'Tensor'}], + } for name in ['sizes', 'strides', 'dim']] + aten_decls = load_aten_declarations(declarations) + tensor_impl_methods jit_decls = [d for d in aten_decls if is_jit_op(d)] def is_tensor_arg(arg): @@ -72,54 +93,104 @@ def is_tensor_arg(arg): for decl in jit_decls: arguments = decl['arguments'] name = decl['name'] - scalar_args = [arg for arg in arguments if not is_tensor_arg(arg)] has_tensorlist = any(arg['simple_type'] == 'TensorList' for arg in arguments) - - # Descriptor is a unique identified for a particular overload of an op - attr_names = sorted([arg['name'] for arg in scalar_args]) - num_inputs = len(arguments) - len(scalar_args) if not has_tensorlist else "*" - descriptor = '-'.join([decl['name'], str(num_inputs)] + attr_names) - - # All scalar args need to be assigned, so they can be captured by a lambda - assignments = [ATTR_ASSIGNMENT.substitute(type=arg['simple_type'], - type_cast=TYPE_CASTS.get(arg['simple_type'], arg['simple_type']), - name=arg['name'], - method=ATTR_METHOD_MAP[arg['simple_type']]) - for arg in scalar_args] - if num_inputs == "*": - assignments.append('auto varargs_length = node->inputs().size();') - num_inputs = 'varargs_length' - - # Generate the actuall ATen call. This gets a bit tricky because of - # TensorList arguments, and functions that are only available as methods. - if 'namespace' in decl['method_of']: - if has_tensorlist: - if sum(map(is_tensor_arg, arguments)) != 1: - # TODO: support this + scalar_arg_idx = [i for i, arg in enumerate(arguments) if not is_tensor_arg(arg)] + num_tensor_args = sum(map(is_tensor_arg, arguments)) + # TODO: support this + if has_tensorlist and (num_tensor_args != 1 or not is_tensor_arg(arguments[0])): + continue + + # Right now, we generate dispatch methods that either take all non-tensor arguments + # as attributes, or don't use any attributes at all. In the future we might want to + # have something in the middle too (might be useful for e.g. constant propagation + # into attributes, as that would allow us to avoid reparsing tensors into scalar + # args at every invocation). + # NB: if there are no scalar args then both options on LHS are equivalent, so deduplicate them. + scalar_arg_idx_iter = ([], scalar_arg_idx) if scalar_arg_idx else ([],) + for pos_scalar_arg_idx in scalar_arg_idx_iter: + num_args = len(arguments) + num_inputs = num_tensor_args + len(pos_scalar_arg_idx) if not has_tensorlist else '*' + + # Scatter arguments into positional and keyword, and compute stack offsets + # of posiitional args. + pos_scalar_args, kw_scalar_args = [], [] + scalar_stack_off, tensor_stack_off = [], [] + for i, arg in enumerate(arguments): + # XXX: we currently support only TensorList ops that have a TensorList as + # the first argument, that is then followed by a number of positional args. + stack_off = (num_args if num_inputs == '*' else num_inputs) - i - 1 + if is_tensor_arg(arg): + tensor_stack_off.append(stack_off) + else: + if i in pos_scalar_arg_idx: + pos_scalar_args.append(arg) + scalar_stack_off.append(stack_off) + else: + kw_scalar_args.append(arg) + + # Descriptor is a unique identifier for a particular overload of an op. + attr_names = sorted([arg['name'] for arg in kw_scalar_args]) + descriptor = '-'.join([decl['name'], str(num_inputs)] + attr_names) + + # If there are two overloads with the same descriptor, that differ only by a type of a + # single argument, where one of them takes a tensor, while another one takes an + # at::Scalar as a positional scalar arg, then prefer the tensor overload. + # It should get broadcasted correctly. + if descriptor in skip_scalar_overload: + if any(arguments[idx]['simple_type'] == 'Scalar' + for idx in skip_scalar_overload[descriptor]): continue - args = ['last(stack, varargs_length)' if is_tensor_arg(arg) else arg['name'] - for arg in arguments] + kw_assignments = [KW_ASSIGNMENT.substitute(type_cast=TYPE_CASTS.get(arg['simple_type'], arg['simple_type']), + name=arg['name'], + method=ATTR_METHOD_MAP[arg['simple_type']]) + for arg in kw_scalar_args] + if num_inputs == "*": + kw_assignments.append('size_t varargs_length = node->inputs().size();') + num_inputs = 'varargs_length' + pos_assignments = [POS_ASSIGNMENT.substitute(type=arg['simple_type'], + name=arg['name'], + arg_idx=arg_idx) + for arg_idx, arg in zip(scalar_stack_off, pos_scalar_args)] + + # Generate the actuall ATen call. This gets a bit tricky because of + # TensorList arguments, and functions that are only available as methods. + pos_arg_drop = '' + num_dropped_args = 0 + if 'namespace' in decl['method_of']: + if has_tensorlist: + # We need to drop the scalar args following varargs before we use last + if pos_scalar_args: + num_dropped_args = len(pos_scalar_args) + pos_arg_drop = 'drop(stack, {});'.format(num_dropped_args) + args = ['last(stack, varargs_length)' if is_tensor_arg(arg) else arg['name'] + for arg in arguments] + else: + tensor_id = iter(tensor_stack_off) + args = ['std::move(fromLast(stack, {}))'.format(1 + next(tensor_id)) + if is_tensor_arg(arg) else arg['name'] + for arg in arguments] + call = CALL_NAMESPACE.substitute(name=name, args=args) else: - tensor_id = iter(count(start=num_inputs, step=-1)) - args = ['std::move(fromLast(stack,{}))'.format( - next(tensor_id)) if is_tensor_arg(arg) else arg['name'] - for arg in arguments] - call = CALL_NAMESPACE.substitute(name=name, args=args) - else: - tensor_id = iter(count(start=num_inputs, step=-1)) - args = ['std::move(fromLast(stack,{}))'.format(next(tensor_id)) if is_tensor_arg(arg) else arg['name'] - for arg in arguments] - call = CALL_METHOD.substitute(name=name, first=args[0], args=args[1:]) - - constructor = CONSTRUCTOR.substitute(descriptor=descriptor, name=name, call=call, - assignments=assignments, - num_inputs=num_inputs) - assert descriptor not in ops, descriptor - ops[descriptor] = constructor + tensor_id = iter(tensor_stack_off) + args = ['std::move(fromLast(stack, {}))'.format(1 + next(tensor_id)) + if is_tensor_arg(arg) else arg['name'] + for arg in arguments] + call = CALL_METHOD.substitute(name=name, first=args[0], args=args[1:]) + + constructor = CONSTRUCTOR.substitute(descriptor=descriptor, name=name, + num_dropped_args=num_dropped_args, + pos_arg_drop=pos_arg_drop, + call=call, + kw_assignments=kw_assignments, + pos_assignments=pos_assignments, + num_inputs=num_inputs) + + assert descriptor not in ops, descriptor + ops[descriptor] = constructor # Sort the generated snippets to ensure that the generation is deterministic - env = {'constructors': sorted(list(ops.values()))} + env = {'constructors': sorted(ops.values())} write(out, 'aten_dispatch.h', ATEN_DISPATCH_H, env) write(out, 'aten_dispatch.cpp', ATEN_DISPATCH_CPP, env) diff --git a/tools/jit/templates/aten_dispatch.cpp b/tools/jit/templates/aten_dispatch.cpp index b3122c6eb80f10..67998c9cb0d68c 100644 --- a/tools/jit/templates/aten_dispatch.cpp +++ b/tools/jit/templates/aten_dispatch.cpp @@ -1,6 +1,7 @@ #include "aten_dispatch.h" #include "torch/csrc/autograd/profiler.h" #include "torch/csrc/jit/interned_strings.h" +#include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/functional.h" #include @@ -25,9 +26,15 @@ namespace { // copies. // pack takes the return values of aten functions pushes them onto the stack +template +void pack(Stack & stack, T&& v) { + stack.push_back(as_tensor(std::move(v))); +} +template<> void pack(Stack & stack, Tensor&& v) { stack.push_back(std::move(v)); } +template<> void pack(Stack & stack, std::vector&& ts) { for(auto& t : ts) { stack.push_back(std::move(t)); diff --git a/torch/csrc/jit/tensor_conversions.h b/torch/csrc/jit/tensor_conversions.h index d23d96dbaea13d..2eec5f2eadc5a5 100644 --- a/torch/csrc/jit/tensor_conversions.h +++ b/torch/csrc/jit/tensor_conversions.h @@ -1,15 +1,90 @@ #pragma once #include "ATen/ATen.h" +#include +#include + +namespace torch { namespace jit { + +////////////////////////////////////////////////////////////////////////////////// +// Tensor -> T conversion +////////////////////////////////////////////////////////////////////////////////// + +namespace detail { + +template +struct tensor_as_impl {}; + template -static inline T tensor_as(at::Tensor&& t) = delete; +struct tensor_as_impl::value>::type> { + T operator()(at::Tensor&& t) { + // workaround for 1-dim 1-element pytorch tensors until zero-dim + // tensors are fully supported + if(t.ndimension() == 1 && t.size(0) == 1) { + t = t[0]; + } + return at::Scalar(t).to(); + } +}; + +template +struct tensor_as_impl> { + std::array operator()(at::Tensor&& t) { + throw std::runtime_error("tensor_as>: NYI"); + } +}; + +template<> +struct tensor_as_impl { + at::IntList operator()(at::Tensor&& t) { + if (t.type().scalarType() != at::ScalarType::Long) + throw std::runtime_error("Expected a LongTensor"); + if (t.dim() != 1) + throw std::runtime_error("Expected a 1D LongTensor"); + if (!t.is_contiguous()) + throw std::runtime_error("Expected a contiguous LongTensor"); + return at::IntList{t.data(), static_cast(t.numel())}; + } +}; template<> -inline int64_t tensor_as(at::Tensor&& t) { - // workaround for 1-dim 1-element pytorch tensors until zero-dim - // tensors are fully supported - if(t.ndimension() == 1 && t.size(0) == 1) { - t = t[0]; +struct tensor_as_impl { + at::Scalar operator()(at::Tensor&& t) { + return at::Scalar(t.view({})); } - return at::Scalar(t).to(); +}; + +} + +template +inline T tensor_as(at::Tensor&& t) { + return detail::tensor_as_impl()(std::move(t)); +} + +////////////////////////////////////////////////////////////////////////////////// +// T -> Tensor conversion +////////////////////////////////////////////////////////////////////////////////// + +inline at::Tensor as_tensor(int64_t v) { + return at::Scalar(v).toTensor(); +} + +inline at::Tensor as_tensor(double v) { + return at::Scalar(v).toTensor(); +} + +inline at::Tensor as_tensor(bool v) { + return at::Scalar(v).toTensor(); +} + +inline at::Tensor as_tensor(at::IntList l) { + return at::CPU(at::kLong).tensorFromBlob(const_cast(reinterpret_cast(l.data())), + {static_cast(l.size())}).clone(); } + + +inline at::Tensor as_tensor(at::Scalar&& s) { + return s.toTensor(); +} + +}} // namespace torch::jit From a0118533ef1bebddb8df9f0261ab9ee27f3b157f Mon Sep 17 00:00:00 2001 From: Adam Paszke Date: Sat, 24 Feb 2018 11:15:55 +0100 Subject: [PATCH 2/8] Add a print() function to the JIT script (#5274) Additionally: - add support for calling functions that are not methods in the Python frontend - add an end-to-end test for the Python frontend - add a capture_stdout helper for checking that `print` actually works --- .../TestJit.test_python_frontend.expect | 10 ++- ...Jit.test_python_frontend_run-stdout.expect | 5 ++ test/test_jit.py | 72 ++++++++++++++++++- torch/csrc/jit/init.cpp | 2 +- torch/csrc/jit/interned_strings.h | 1 + torch/csrc/jit/interpreter.cpp | 21 ++++++ torch/csrc/jit/script/compiler.cpp | 10 +++ torch/csrc/jit/script/compiler.h | 4 +- torch/csrc/jit/script/init.cpp | 8 +-- torch/csrc/jit/script/python_tree_views.cpp | 2 +- torch/csrc/jit/script/tree_views.h | 3 + torch/jit/frontend.py | 31 ++++++-- 12 files changed, 153 insertions(+), 16 deletions(-) create mode 100644 test/expect/TestJit.test_python_frontend_run-stdout.expect diff --git a/test/expect/TestJit.test_python_frontend.expect b/test/expect/TestJit.test_python_frontend.expect index 88baca9d5872fb..1e7b39f4a9eb32 100644 --- a/test/expect/TestJit.test_python_frontend.expect +++ b/test/expect/TestJit.test_python_frontend.expect @@ -16,7 +16,15 @@ (+ (variable (ident x)) (variable (ident y))) - (variable (ident z)))) + (apply + (ident sigmoid) + (list (variable (ident z))) + (list)))) + (expression statement + (apply + (ident print) + (list (variable (ident q))) + (list))) (assign (list (ident w)) (=) diff --git a/test/expect/TestJit.test_python_frontend_run-stdout.expect b/test/expect/TestJit.test_python_frontend_run-stdout.expect new file mode 100644 index 00000000000000..9a4852689eb850 --- /dev/null +++ b/test/expect/TestJit.test_python_frontend_run-stdout.expect @@ -0,0 +1,5 @@ + 0.5000 + 0.9526 + 0.9975 + 0.9999 +[ Variable{4} ] diff --git a/test/test_jit.py b/test/test_jit.py index e0c6ba353b7189..eceeb1efdbeaf1 100644 --- a/test/test_jit.py +++ b/test/test_jit.py @@ -19,6 +19,7 @@ except ImportError: HAS_TORCHVISION = False + skipIfNoTorchVision = unittest.skipIf(not HAS_TORCHVISION, "no torchvision") RUN_CUDA = torch.cuda.is_available() @@ -32,6 +33,47 @@ RUN_CUDA_MULTI_GPU = RUN_CUDA and torch.cuda.device_count() > 1 PY2 = sys.version_info[0] == 2 +WINDOWS = sys.platform == 'win32' + + +@contextmanager +def capture_stdout(): + # No idea how to capture stdout from C++ on Windows + if WINDOWS: + yield [''] + return + import os + import fcntl + import errno + stdout_fd = os.dup(1) + r, w = os.pipe() + try: + # Override stdout with r - dup is guaranteed to return the lowest free fd + os.close(1) + os.dup(w) + + captured_stdout = [''] + yield captured_stdout + sys.stdout.flush() # Make sure that Python hasn't buffered anything + + # Do the ugly dance to read all the data that was written into the pipe + fcntl.fcntl(r, fcntl.F_SETFL, os.O_NONBLOCK) + total_stdout = '' + while True: + try: + total_stdout += os.read(r, 1000).decode('ascii') + except OSError as e: + if e.errno != errno.EAGAIN: + raise + break + captured_stdout[0] = total_stdout + finally: + # Revert the change, and clean up all fds + os.close(1) + os.dup(stdout_fd) + os.close(stdout_fd) + os.close(r) + os.close(w) def LSTMCell(input, hidden, w_ih, w_hh, b_ih=None, b_hh=None): @@ -1309,11 +1351,19 @@ def foo(a, b): self.assertEqual(g2result, g2result2) def checkScript(self, script, inputs, outputs, optimize, name='func'): - cu = torch.jit._jit_script_compile(script) + if isinstance(script, str): + cu = torch.jit._jit_script_compile(script) + else: + ast = torch.jit.frontend.get_jit_ast(script) + cu = torch._C.CompilationUnit() + cu.define_function(ast) graph = cu.get_graph(name) ge = torch._C.GraphExecutor(graph, optimize) - outputs_ge = ge(*inputs) + with capture_stdout() as captured: + outputs_ge = ge(*inputs) self.assertEqual(outputs, outputs_ge) + if captured[0]: + self.assertExpected(captured[0], subname='stdout') def test_script_add(self): script = ''' @@ -1395,7 +1445,8 @@ def to_int(x) -> (y): def test_python_frontend(self): def fn(x, y, z): - q = x + y - z + q = x + y - z.sigmoid() + print(q) w = -z if not x and not y and z: m = x if not z else y @@ -1558,5 +1609,20 @@ def test_ternary(a, b) -> (c): str(cu2.get_graph('test_ternary')), ) + def test_python_frontend_run(self): + def func(x, y): + q = (x + y).sigmoid() + print(q) + w = -q + return w * w + + x = Variable(torch.arange(4), requires_grad=True) + y = Variable(torch.arange(4) * 2, requires_grad=True) + with capture_stdout(): + expected_out = func(x, y) + expected_out = (x + y).sigmoid().pow(2) + self.checkScript(func, [x, y], [expected_out], False) + + if __name__ == '__main__': run_tests() diff --git a/torch/csrc/jit/init.cpp b/torch/csrc/jit/init.cpp index 5f227f81623987..c8600e00b2adb9 100644 --- a/torch/csrc/jit/init.cpp +++ b/torch/csrc/jit/init.cpp @@ -134,8 +134,8 @@ void initJITBindings(PyObject *module) { initPythonIRBindings(module); initPythonTracerBindings(module); python::initCompilerMixin(module); - script::initJitScriptBindings(module); script::initTreeViewBindings(module); + script::initJitScriptBindings(module); } }} diff --git a/torch/csrc/jit/interned_strings.h b/torch/csrc/jit/interned_strings.h index 89d774b9b91c9e..bfd3caba1c2cad 100644 --- a/torch/csrc/jit/interned_strings.h +++ b/torch/csrc/jit/interned_strings.h @@ -144,6 +144,7 @@ _(device) \ _(ReplaceIfUndef) \ _(is_zero) \ _(GraphExecutor) \ +_(Print) \ _(mm) \ _(t) \ _(Loop) \ diff --git a/torch/csrc/jit/interpreter.cpp b/torch/csrc/jit/interpreter.cpp index d438b22207a4e6..73d66128539635 100644 --- a/torch/csrc/jit/interpreter.cpp +++ b/torch/csrc/jit/interpreter.cpp @@ -16,6 +16,8 @@ #include "torch/csrc/jit/tensor_conversions.h" #include "torch/csrc/utils/auto_gil.h" +#include + namespace py = pybind11; namespace torch { namespace jit { @@ -528,6 +530,25 @@ Operation getOperation(jit::Node *node, bool constants_are_variables) { } return 0; }; + IR_ELSEIF(Print) + size_t num_inputs = value->inputs().size(); + return [num_inputs](Stack & stack) { + bool first = true; + for (at::Tensor i : last(stack, num_inputs)) { + if (!first) std::cout << " "; + first = false; + if (auto tensor_impl = dynamic_cast(i.get())) { + std::cout << at::Tensor(tensor_impl, true); + } else if (!i.defined()) { + std::cout << ""; + } else { + std::cout << "<" << typeid(*i.get()).name() << " at " << i << ">"; + } + } + drop(stack, num_inputs); + std::cout << std::endl; + return 0; + }; IR_ELSEIF(GraphExecutor) GraphExecutor executor(value->g(kSubgraph)); auto num_inputs = value->inputs().size(); diff --git a/torch/csrc/jit/script/compiler.cpp b/torch/csrc/jit/script/compiler.cpp index 90a7e500fbaa6b..536ef6775c1f30 100644 --- a/torch/csrc/jit/script/compiler.cpp +++ b/torch/csrc/jit/script/compiler.cpp @@ -479,6 +479,12 @@ struct to_ir { auto apply = Apply(tree); if (function_table.count(apply.name().name()) > 0) { return emitFunctionCall(apply, output_size); + } else if (apply.name().name() == "print") { + expectOutputs(tree, output_size, 0); + if (!apply.attributes().empty()) + throw ErrorReport(tree) << "print doesn't accept any keyword arguments"; + return emitNode(kPrint, getValues(apply.inputs()), 0, + AttributeMap{}, ListAttributeMap{})->outputs(); } else { const auto& inputs = getValues(apply.inputs()); NodeKind kind{apply.name().name()}; @@ -712,6 +718,10 @@ void CompilationUnit::define(const std::string& script) { return pImpl->define(script); } +void CompilationUnit::defineFunction(const Def& def) { + return pImpl->defineFunction(def); +} + std::shared_ptr CompilationUnit::getGraph(const std::string& func_name) { return pImpl->getGraph(func_name); } diff --git a/torch/csrc/jit/script/compiler.h b/torch/csrc/jit/script/compiler.h index b20124756a7679..c43931f5b00344 100644 --- a/torch/csrc/jit/script/compiler.h +++ b/torch/csrc/jit/script/compiler.h @@ -3,6 +3,7 @@ #include #include "torch/csrc/jit/ir.h" +#include "torch/csrc/jit/script/tree_views.h" namespace torch { namespace jit { @@ -11,7 +12,8 @@ namespace script { struct CompilationUnitImpl; struct CompilationUnit { CompilationUnit(); - void define(const std::string& str); + void define(const std::string& source); + void defineFunction(const Def& def); std::shared_ptr getGraph(const std::string& func_name); ~CompilationUnit(); diff --git a/torch/csrc/jit/script/init.cpp b/torch/csrc/jit/script/init.cpp index 00613440d5ef7a..3add566bdb2d62 100644 --- a/torch/csrc/jit/script/init.cpp +++ b/torch/csrc/jit/script/init.cpp @@ -8,10 +8,10 @@ namespace script { void initJitScriptBindings(PyObject* module) { auto m = py::handle(module).cast(); py::class_(m, "CompilationUnit") - .def( - "get_graph", - &CompilationUnit::getGraph, - py::return_value_policy::reference); + .def(py::init<>()) + .def("get_graph", &CompilationUnit::getGraph, + py::return_value_policy::reference) + .def("define_function", &CompilationUnit::defineFunction); m.def("_jit_script_compile", jitScriptCompile); } diff --git a/torch/csrc/jit/script/python_tree_views.cpp b/torch/csrc/jit/script/python_tree_views.cpp index d25aec3fc9ebe9..915026b3ae6072 100644 --- a/torch/csrc/jit/script/python_tree_views.cpp +++ b/torch/csrc/jit/script/python_tree_views.cpp @@ -130,7 +130,7 @@ void initTreeViewBindings(PyObject *module) { .def(py::init([](const Ident& name) { return Var::create(name.range(), name); })) - .def("name", [](const Var& var) { return var.name(); }); + .def_property_readonly("name", [](const Var& var) { return var.name(); }); py::class_(m, "BinOp") .def(py::init([](std::string kind, const Expr& lhs, const Expr& rhs) { return BinOp::create(lhs.range(), stringToKind(kind), lhs, rhs); diff --git a/torch/csrc/jit/script/tree_views.h b/torch/csrc/jit/script/tree_views.h index 95115f0426e840..ae6e0b5d6f1def 100644 --- a/torch/csrc/jit/script/tree_views.h +++ b/torch/csrc/jit/script/tree_views.h @@ -135,6 +135,9 @@ struct List : public TreeView { iterator end() const { return iterator(tree_->trees().end()); } + bool empty() const { + return tree_->trees().begin() == tree_->trees().end(); + } T operator[](size_t i) const { return T(subtree(i)); } diff --git a/torch/jit/frontend.py b/torch/jit/frontend.py index 1f8cf306e2093d..cffcf6986eb074 100644 --- a/torch/jit/frontend.py +++ b/torch/jit/frontend.py @@ -10,9 +10,16 @@ PY2 = sys.version_info[0] == 2 _reserved_prefix = '__jit' +_reserved_names = {'print'} _identifier_chars = set(string.ascii_lowercase + string.ascii_uppercase + string.digits) + +def is_reserved_name(name): + return name.startswith(_reserved_prefix) or name in _reserved_names + + pretty_node_names = { + ast.FunctionDef: "function definitions", ast.For: "for loops", ast.Delete: "del statements", ast.ClassDef: "class definitions", @@ -27,6 +34,7 @@ } node_start_tokens = { + ast.FunctionDef: "def", ast.For: "for", ast.Delete: "del", ast.ClassDef: "class", @@ -56,6 +64,7 @@ }) else: pretty_node_names.update({ + ast.AsyncFunctionDef: "async function definitions", ast.AsyncFor: "async for loops", ast.AsyncWith: "async with statements", ast.Try: "try blocks", @@ -63,6 +72,7 @@ }) node_start_tokens.update({ + ast.AsyncFunctionDef: "async def", ast.AsyncFor: "async for", ast.AsyncWith: "async with", ast.Try: "try", @@ -210,7 +220,7 @@ def get_assign_ident(ctx, expr): if not isinstance(var, Var): raise NotSupportedError("the only expressions allowed on the left hand side of " "assignments are variable names", var.range()) - return var.name() + return var.name @staticmethod def build_Assign(ctx, stmt): @@ -247,6 +257,14 @@ def build_If(ctx, stmt): [build_stmt(ctx, s) for s in stmt.body], [build_stmt(ctx, s) for s in stmt.orelse]) + @staticmethod + def build_Print(ctx, stmt): + r = ctx.make_range(stmt.lineno, stmt.col_offset, stmt.col_offset + len("print")) + if stmt.dest: + raise NotSupportedError(r, "print statements with non-default destinations aren't supported") + args = [build_expr(ctx, val) for val in stmt.values] + return ExprStmt(Apply(Ident(r, "print"), args, [])) + class ExprBuilder(Builder): _MethodRef = namedtuple('MethodRef', ['self', 'name']) @@ -294,15 +312,18 @@ def build_Attribute(ctx, expr): @staticmethod def build_Call(ctx, expr): ref = build_expr(ctx, expr.func, allow_methods=True) - if type(ref) is not ExprBuilder._MethodRef: + args = [build_expr(ctx, py_arg) for py_arg in expr.args] + kwargs = [Attribute(Ident(name), build_expr(ctx, value)) for name, value in expr.keywords] + if type(ref) is ExprBuilder._MethodRef: # Method call + return Apply(ref.name, [ref.self] + args, kwargs) + elif isinstance(ref, Var): # Top-level function call + return Apply(ref.name, args, kwargs) + else: ref_range = ref.range() parenthesis_range = find_after(ctx, ref_range.end, '(') raise FrontendTypeError( ctx.make_raw_range(ref_range.start, parenthesis_range.end), "trying to call a non-function object") - args = [build_expr(ctx, py_arg) for py_arg in expr.args] - kwargs = [Attribute(Ident(name), build_expr(ctx, value)) for name, value in expr.keywords] - return Apply(ref.name, [ref.self] + args, kwargs) @staticmethod def build_Name(ctx, expr): From c06c6046e39f87a8e086709eed891fac10879615 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 24 Feb 2018 12:24:24 -0500 Subject: [PATCH 3/8] Accept GPU perf test regression. (#5395) Signed-off-by: Edward Z. Yang --- .jenkins/perf_test/perf_test_numbers.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.jenkins/perf_test/perf_test_numbers.json b/.jenkins/perf_test/perf_test_numbers.json index c5c91070155e86..c49b1ecfff6522 100644 --- a/.jenkins/perf_test/perf_test_numbers.json +++ b/.jenkins/perf_test/perf_test_numbers.json @@ -22,8 +22,8 @@ }, "test_gpu_speed_word_language_model": { - "mean": "5.65807", - "sigma": "0.1132" + "mean": "5.9411499999999995", + "sigma": "0.02134777505971057" }, "test_gpu_speed_cudnn_lstm": { From 1ff537ca7198ddef7e76d0f75c001d0b3f41d7fd Mon Sep 17 00:00:00 2001 From: Tongzhou Wang Date: Sat, 24 Feb 2018 13:32:13 -0500 Subject: [PATCH 4/8] Ignore FileNotFoundError when shutting down in data_queue.get (#5380) * Ignore FileNotFoundError when shutting down in data_queue.get * Address @apaszke comments --- torch/utils/data/dataloader.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py index 43f4cb50d77e9b..bc2d8b213083bd 100644 --- a/torch/utils/data/dataloader.py +++ b/torch/utils/data/dataloader.py @@ -314,9 +314,15 @@ def _shutdown_workers(self): if not self.shutdown: self.shutdown = True self.done_event.set() - # if worker_manager_thread is waiting to put - while not self.data_queue.empty(): - self.data_queue.get() + # if worker_manager_thread is waiting to put, make place for it + try: + while not self.data_queue.empty(): + self.data_queue.get() + except FileNotFoundError: + # FileNotFoundError can happen when we rebuild the fd + # fetched from the queue but the socket is already closed + # from the worker side (e.g. due to Python shutting down). + pass for _ in self.workers: self.index_queue.put(None) # done_event should be sufficient to exit worker_manager_thread, From 40d79e4447acf218836132eabb9d4b0d7bfede77 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 24 Feb 2018 17:04:25 -0500 Subject: [PATCH 5/8] Turn on ASAN in continuous integration. (#5271) I know this works because I had to squelch a bunch of ASAN errors in multiprocessing. Signed-off-by: Edward Z. Yang --- .jenkins/build.sh | 28 +++++++++++++++++++++++++++- .jenkins/disabled-configs.txt | 4 ---- .jenkins/enabled-configs.txt | 2 ++ .jenkins/test.sh | 10 ++++++++++ test/test_multiprocessing.py | 15 +++++++++++++++ 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/.jenkins/build.sh b/.jenkins/build.sh index 5d2a3b870e9ac7..0ae5c8082cd548 100755 --- a/.jenkins/build.sh +++ b/.jenkins/build.sh @@ -47,7 +47,33 @@ python --version pip install -r requirements.txt || true -time python setup.py install +if [[ "$JOB_NAME" == *asan* ]]; then + export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + # Disable Valgrind tests in run_aten_tests.sh; otherwise + # we'll be valgrind'ing an ASAN'ed binary! ASANity. + export VALGRIND=0 + + sudo apt-get update + sudo apt-get install clang-5.0 + + export PATH="/usr/lib/llvm-5.0/bin:$PATH" + + # TODO: Figure out how to avoid hard-coding these paths + LD_LIBRARY_PATH=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux \ + CC="sccache clang" \ + CXX="sccache clang++" \ + LDSHARED="clang --shared" \ + LDFLAGS="-stdlib=libstdc++" \ + CFLAGS="-fsanitize=address -shared-libasan" \ + NO_CUDA=1 \ + python setup.py install + + export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so + +else + python setup.py install + +fi if [[ "$JOB_NAME" != *cuda* ]]; then echo "Testing ATen" diff --git a/.jenkins/disabled-configs.txt b/.jenkins/disabled-configs.txt index 2d37dbd199a605..cdd51d3fb54a56 100644 --- a/.jenkins/disabled-configs.txt +++ b/.jenkins/disabled-configs.txt @@ -3,7 +3,3 @@ # fail. You can use this to temporarily reserve a test name to # turn on CI side before PyTorch repository supports it. This # file has the same format as .jenkins/enabled-configs.txt - -pytorch-linux-xenial-py3-clang5-asan -pytorch-linux-xenial-py3-clang5-asan-build -pytorch-linux-xenial-py3-clang5-asan-test diff --git a/.jenkins/enabled-configs.txt b/.jenkins/enabled-configs.txt index 3b3539abf7c041..7b43e9fcb15f74 100644 --- a/.jenkins/enabled-configs.txt +++ b/.jenkins/enabled-configs.txt @@ -11,6 +11,8 @@ pytorch-linux-xenial-cuda9-cudnn7-py2-build pytorch-linux-xenial-cuda9-cudnn7-py2-test pytorch-linux-xenial-cuda9-cudnn7-py3-build pytorch-linux-xenial-cuda9-cudnn7-py3-test +pytorch-linux-xenial-py3-clang5-asan-build +pytorch-linux-xenial-py3-clang5-asan-test pytorch-linux-trusty-py2.7.9-build pytorch-linux-trusty-py2.7.9-test pytorch-linux-trusty-py2.7-build diff --git a/.jenkins/test.sh b/.jenkins/test.sh index 9ab4c592428d65..6614bcc8d7f46a 100755 --- a/.jenkins/test.sh +++ b/.jenkins/test.sh @@ -40,6 +40,12 @@ echo "Testing pytorch" export OMP_NUM_THREADS=4 export MKL_NUM_THREADS=4 +if [[ "$JOB_NAME" == *asan* ]]; then + export PATH="/usr/lib/llvm-5.0/bin:$PATH" + export ASAN_OPTIONS=detect_leaks=0:symbolize=1 + export PYTORCH_TEST_WITH_ASAN=1 +fi + # JIT C++ extensions require ninja. git clone https://github.com/ninja-build/ninja --quiet pushd ninja @@ -47,6 +53,10 @@ python ./configure.py --bootstrap export PATH="$PWD:$PATH" popd +if [[ "$JOB_NAME" == *asan* ]]; then + export LD_PRELOAD=/usr/lib/llvm-5.0/lib/clang/5.0.0/lib/linux/libclang_rt.asan-x86_64.so +fi + time test/run_test.sh -- -v rm -rf ninja diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py index a23af037db6c8c..181c881b73e862 100644 --- a/test/test_multiprocessing.py +++ b/test/test_multiprocessing.py @@ -21,6 +21,7 @@ sys.platform != 'darwin' and \ sys.platform != 'win32' TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1 +TEST_WITH_ASAN = os.getenv('PYTORCH_TEST_WITH_ASAN', False) class SubProcess(mp.Process): @@ -246,10 +247,14 @@ def do_test(): do_test() @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS") + @unittest.skipIf(TEST_WITH_ASAN, + "seems to hang with ASAN, see https://github.com/pytorch/pytorch/issues/5326") def test_fd_sharing(self): self._test_sharing(repeat=TEST_REPEATS) @unittest.skipIf(platform == 'darwin', "file descriptor strategy is not supported on macOS") + @unittest.skipIf(TEST_WITH_ASAN, + "test_fd_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311") def test_fd_preserve_sharing(self): self._test_preserve_sharing(repeat=TEST_REPEATS) @@ -257,19 +262,27 @@ def test_fd_preserve_sharing(self): def test_fd_pool(self): self._test_pool(repeat=TEST_REPEATS) + @unittest.skipIf(TEST_WITH_ASAN, + "test_fs_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5325") def test_fs_sharing(self): with fs_sharing(): self._test_sharing(repeat=TEST_REPEATS) + @unittest.skipIf(TEST_WITH_ASAN, + "test_fs_preserve_sharing is known buggy, see https://github.com/pytorch/pytorch/issues/5311") def test_fs_preserve_sharing(self): with fs_sharing(): self._test_preserve_sharing(repeat=TEST_REPEATS) + @unittest.skipIf(TEST_WITH_ASAN, + "test_fs_pool is known buggy, see https://github.com/pytorch/pytorch/issues/5325") def test_fs_pool(self): with fs_sharing(): self._test_pool(repeat=TEST_REPEATS) @unittest.skipIf(not HAS_SHM_FILES, "don't not how to check if shm files exist") + @unittest.skipIf(TEST_WITH_ASAN, + "test_fs is known buggy, see https://github.com/pytorch/pytorch/issues/5325") def test_fs(self): def queue_put(): x = torch.DoubleStorage(4) @@ -409,6 +422,8 @@ def _test_is_shared(self): def test_is_shared(self): self._test_is_shared() + @unittest.skipIf(TEST_WITH_ASAN, + "test_fs_is_shared is known buggy, see https://github.com/pytorch/pytorch/issues/5325") def test_fs_is_shared(self): with fs_sharing(): self._test_is_shared() From d2f71cbdebe48b64888f27b0c97ffb346dacecc2 Mon Sep 17 00:00:00 2001 From: Soumith Chintala Date: Sat, 24 Feb 2018 19:37:00 -0500 Subject: [PATCH 6/8] make CuDNN finders respect library major version (#5399) --- aten/cmake/FindCuDNN.cmake | 12 ++++++++++-- setup.py | 3 +-- tools/setup_helpers/cudnn.py | 25 +++++++++++++++++-------- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/aten/cmake/FindCuDNN.cmake b/aten/cmake/FindCuDNN.cmake index 78a6d0eede1735..f7ceed91da96b8 100644 --- a/aten/cmake/FindCuDNN.cmake +++ b/aten/cmake/FindCuDNN.cmake @@ -15,13 +15,21 @@ include(FindPackageHandleStandardArgs) set(CUDNN_ROOT_DIR "" CACHE PATH "Folder contains NVIDIA cuDNN") -find_path(CUDNN_INCLUDE_DIR cudnn.h +if($ENV{CUDNN_INCLUDE_DIR}) + SET(CUDNN_INCLUDE_DIR $ENV{CUDNN_INCLUDE_DIR}) +else($ENV{CUDNN_INCLUDE_DIR}) + find_path(CUDNN_INCLUDE_DIR cudnn.h HINTS ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES cuda/include include) +endif($ENV{CUDNN_INCLUDE_DIR}) -find_library(CUDNN_LIBRARY cudnn +if($ENV{CUDNN_LIBRARY}) + SET(CUDNN_LIBRARY $ENV{CUDNN_LIBRARY}) +else($ENV{CUDNN_LIBRARY}) + find_library(CUDNN_LIBRARY cudnn HINTS ${CUDNN_LIB_DIR} ${CUDNN_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64) +endif($ENV{CUDNN_LIBRARY}) find_package_handle_standard_args( CUDNN DEFAULT_MSG CUDNN_INCLUDE_DIR CUDNN_LIBRARY) diff --git a/setup.py b/setup.py index 0afe1895c56e8c..b9da0241e28251 100644 --- a/setup.py +++ b/setup.py @@ -709,8 +709,7 @@ def run(self): "torch/csrc/cuda/python_nccl.cpp", ] if WITH_CUDNN: - main_libraries += ['cudnn'] - library_dirs.insert(0, CUDNN_LIB_DIR) + main_libraries += [CUDNN_LIBRARY] # NOTE: these are at the front, in case there's another cuDNN in CUDA path include_dirs.insert(0, CUDNN_INCLUDE_DIR) if not IS_WINDOWS: diff --git a/tools/setup_helpers/cudnn.py b/tools/setup_helpers/cudnn.py index 1cc48a7c11179d..566635681171b0 100644 --- a/tools/setup_helpers/cudnn.py +++ b/tools/setup_helpers/cudnn.py @@ -35,6 +35,22 @@ if IS_CONDA: lib_paths.append(os.path.join(CONDA_DIR, 'lib')) include_paths.append(os.path.join(CONDA_DIR, 'include')) + for path in include_paths: + if path is None or not os.path.exists(path): + continue + include_file_path = os.path.join(path, 'cudnn.h') + if os.path.exists(include_file_path): + CUDNN_INCLUDE_DIR = path + CUDNN_INCLUDE_VERSION = -1 + with open(include_file_path) as f: + for line in f: + if "#define CUDNN_MAJOR" in line: + CUDNN_INCLUDE_VERSION = int(line.split()[-1]) + break + if CUDNN_INCLUDE_VERSION == -1: + raise AssertionError("Could not find #define CUDNN_MAJOR in " + include_file_path) + break + for path in lib_paths: if path is None or not os.path.exists(path): continue @@ -45,18 +61,11 @@ CUDNN_LIB_DIR = path break else: - libraries = sorted(glob.glob(os.path.join(path, 'libcudnn*'))) + libraries = sorted(glob.glob(os.path.join(path, 'libcudnn*' + str(CUDNN_INCLUDE_VERSION) + "*"))) if libraries: CUDNN_LIBRARY = libraries[0] CUDNN_LIB_DIR = path break - for path in include_paths: - if path is None or not os.path.exists(path): - continue - if os.path.exists((os.path.join(path, 'cudnn.h'))): - CUDNN_INCLUDE_DIR = path - break - # Specifying the library directly will overwrite the lib directory library = os.getenv('CUDNN_LIBRARY') if library is not None and os.path.exists(library): From d7488f4b59406e47994da587b54fb019a22a0130 Mon Sep 17 00:00:00 2001 From: Rachit Singh Date: Tue, 16 Jan 2018 19:17:09 +0000 Subject: [PATCH 7/8] Refactor standard_gamma and implement CUDA gamma sampling --- aten/src/ATen/Declarations.cwrap | 20 ------ aten/src/ATen/native/Distributions.cpp | 27 ++++++++ aten/src/ATen/native/Distributions.cuh | 63 +++++++++++++++++++ aten/src/ATen/native/cuda/Distributions.cu | 31 +++++++++ aten/src/ATen/native/native_functions.yaml | 6 ++ aten/src/TH/THRandom.cpp | 29 --------- aten/src/TH/THRandom.h | 6 -- aten/src/TH/generic/THTensorRandom.cpp | 10 --- aten/src/TH/generic/THTensorRandom.h | 1 - test/test_distributions.py | 13 +++- tools/autograd/derivatives.yaml | 4 +- torch/csrc/Module.cpp | 2 - torch/csrc/generic/methods/TensorRandom.cwrap | 20 ------ torch/distributions/dirichlet.py | 3 +- torch/distributions/gamma.py | 4 +- 15 files changed, 145 insertions(+), 94 deletions(-) create mode 100644 aten/src/ATen/native/Distributions.cuh diff --git a/aten/src/ATen/Declarations.cwrap b/aten/src/ATen/Declarations.cwrap index a59e1c1a0b6438..d426adaad541d5 100644 --- a/aten/src/ATen/Declarations.cwrap +++ b/aten/src/ATen/Declarations.cwrap @@ -3966,26 +3966,6 @@ kwarg_only: True - THTensor* self ]] -[[ - name: _standard_gamma - types: - - floating_point - backends: - - CPU - return: argument 0 - variants: - - method - - function - options: - - cname: standard_gamma - arguments: - - arg: THTensor* output - output: True - - arg: THGenerator* generator - default: nullptr - kwarg_only: True - - THTensor* self -]] [[ name: _dirichlet_grad types: diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index b1d7a086c6b448..67fc5f607450ab 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -8,6 +8,8 @@ #include "ATen/CheckGenerator.h" #include "ATen/Generator.h" +#include + #include "TH/THRandom.h" namespace at { @@ -155,6 +157,24 @@ namespace dist { return gen_->generator; } + template + struct GammaOp { + static void apply(Tensor& ret, const Tensor& alpha, THGenerator *generator) { + CPU_tensor_apply2(ret, alpha, + [generator](scalar& ret_val, const double& alpha){ + dist::baseSampler standard_uniform([generator] () { + return THRandom_standard_uniform(generator); + }); + dist::baseSampler standard_normal([generator] () { + return THRandom_normal(generator, 0.0, 1.0); + }); + auto sample = dist::sample_gamma(alpha, standard_uniform, standard_normal); + ret_val = std::max(std::numeric_limits::min(), (scalar) sample); + } + ); + } + }; + template struct PoissonOp { static int64_t sample_poisson(double lambda, THGenerator *generator) { @@ -227,5 +247,12 @@ Tensor _s_poisson_cpu(const Tensor& lambda, Generator *gen) { return ret; } +Tensor _s_gamma_cpu(const Tensor& alpha, Generator *gen) { + Tensor ret = alpha.type().zeros(alpha.sizes()); + auto alpha_ = alpha.toType(ScalarType::Double); + dispatch_floating_types(ret.type(), "gamma", ret, alpha_, dist::get_generator(gen)); + return ret; +} + } // at::native } // at diff --git a/aten/src/ATen/native/Distributions.cuh b/aten/src/ATen/native/Distributions.cuh new file mode 100644 index 00000000000000..7d6da545eea228 --- /dev/null +++ b/aten/src/ATen/native/Distributions.cuh @@ -0,0 +1,63 @@ +#include "ATen/Config.h" +#include +#if AT_CUDA_ENABLED() +#include +#endif + +namespace at { +namespace native { +namespace dist { + + // this wraps sampling primitives to expose a common interface + template + struct baseSampler { +#if AT_CUDA_ENABLED() + nvstd::function sampler; + __device__ baseSampler(nvstd::function sampler): sampler(sampler) {} + __device__ precision_t sample() { + return sampler(); + } +#else + std::function sampler; + baseSampler(std::function sampler): sampler(sampler) {} + precision_t sample() { + return sampler(); + } +#endif + }; + + template +#if AT_CUDA_ENABLED() + __host__ __device__ +#endif + precision_t sample_gamma(precision_t alpha, baseSampler& standard_uniform, baseSampler& standard_normal) { + precision_t scale = 1.0; + + // Boost alpha for higher acceptance probability. + if (alpha < 1.0) { + scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha); + alpha += 1.0; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const precision_t d = alpha - 1.0 / 3.0; + const precision_t c = 1.0 / ::sqrt(9.0 * d); + for (;;) { + precision_t x, y; + do { + x = standard_normal.sample(); + y = 1.0 + c * x; + } while (y <= 0); + const precision_t v = y * y * y; + const precision_t u = 1 - standard_uniform.sample(); + const precision_t xx = x * x; + if (u < 1.0 - 0.0331 * xx * xx) + return scale * d * v; + if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v))) + return scale * d * v; + } + } +} // dist +} // native +} // at diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 859e97e0e4503a..81f26715dfee95 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -5,6 +5,10 @@ #include #include #include +#include +#include + +#include "ATen/native/Distributions.cuh" #include @@ -26,6 +30,26 @@ namespace dist { return std::make_pair(gen_->initial_seed, offset); } + template + struct GammaOpCUDA { + static void apply(Tensor& ret, const Tensor& alpha, std::pair seeds) { + at::cuda::CUDA_tensor_apply2(ret, alpha, + [seeds] __device__ (scalar& ret_val, const float& alpha, bool early_exit) { + curandStatePhilox4_32_10_t state; + curand_init(seeds.first, blockIdx.x * blockDim.x + threadIdx.x, seeds.second, &state); + baseSampler standard_uniform([&state] __device__ () { + return curand_uniform(&state); + }); + baseSampler standard_normal([&state] __device__ () { + return curand_normal(&state); + }); + auto sample = scalar_cast(sample_gamma(alpha, standard_uniform, standard_normal)); + ret_val = ::max(THCNumerics::min(), (scalar) sample); + } + ); + } + }; + template struct PoissonOpCUDA { static void apply(Tensor& ret, const Tensor& lambda, std::pair seeds) { @@ -48,5 +72,12 @@ Tensor _s_poisson_cuda(const Tensor& lambda, Generator* gen) { return ret; } +Tensor _s_gamma_cuda(const Tensor& alpha, Generator* gen) { + Tensor ret = alpha.type().tensor(alpha.sizes()); + auto alpha_ = alpha.toType(ScalarType::Float); + dispatch_floating_types(ret.type(), "gamma", ret, alpha_, dist::next_philox_seed(gen)); + return ret; +} + } // at::native } // at diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1460ca4bbb0c5e..f3a81814c64895 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -392,6 +392,12 @@ CPU: _s_poisson_cpu CUDA: _s_poisson_cuda +- func: standard_gamma(Tensor self, Generator* generator=nullptr) -> Tensor + variants: function + dispatch: + CPU: _s_gamma_cpu + CUDA: _s_gamma_cuda + - func: _cudnn_rnn_flatten_weight(TensorList weight_arr, int64_t weight_stride0, int64_t input_size, int64_t mode, int64_t hidden_size, int64_t num_layers, bool batch_first, bool bidirectional) -> Tensor variants: function diff --git a/aten/src/TH/THRandom.cpp b/aten/src/TH/THRandom.cpp index 99cf4ff58ba67d..a5c35fdea75ffa 100644 --- a/aten/src/TH/THRandom.cpp +++ b/aten/src/TH/THRandom.cpp @@ -290,35 +290,6 @@ double THRandom_exponential(THGenerator *_generator, double lambda) return(-1. / lambda * log(1-uniform_double(_generator))); } -double THRandom_standard_gamma(THGenerator *_generator, double alpha) { - double scale = 1.0; - - // Boost alpha for higher acceptance probability. - if(alpha < 1.0) { - scale *= pow(1 - uniform_double(_generator), 1.0 / alpha); - alpha += 1.0; - } - - // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) - // doi:10.1145/358407.358414 - const double d = alpha - 1.0 / 3.0; - const double c = 1.0 / sqrt(9.0 * d); - for(;;) { - double x, y; - do { - x = THRandom_normal(_generator, 0.0, 1.0); - y = 1.0 + c * x; - } while(y <= 0); - const double v = y * y * y; - const double u = 1 - uniform_double(_generator); - const double xx = x * x; - if(u < 1.0 - 0.0331 * xx * xx) - return scale * d * v; - if(log(u) < 0.5 * xx + d * (1.0 - v + log(v))) - return scale * d * v; - } -} - double THRandom_cauchy(THGenerator *_generator, double median, double sigma) { return(median + sigma * tan(M_PI*(uniform_double(_generator)-0.5))); diff --git a/aten/src/TH/THRandom.h b/aten/src/TH/THRandom.h index 33720af4ea95df..b7a827f49e0dba 100644 --- a/aten/src/TH/THRandom.h +++ b/aten/src/TH/THRandom.h @@ -63,12 +63,6 @@ TH_API double THRandom_normal(THGenerator *_generator, double mean, double stdv) */ TH_API double THRandom_exponential(THGenerator *_generator, double lambda); -/** Generates a random number from a standard Gamma distribution. - The Gamma density is proportional to $x^{alpha-1} exp(-x)$ - The shape parameter alpha (a.k.a. k) is a positive real number. -*/ -TH_API double THRandom_standard_gamma(THGenerator *_generator, double alpha); - /** Returns a random number from a Cauchy distribution. The Cauchy density is $p(x) = sigma/(pi*(sigma^2 + (x-median)^2))$ */ diff --git a/aten/src/TH/generic/THTensorRandom.cpp b/aten/src/TH/generic/THTensorRandom.cpp index 746f11073a2de4..01ba6a81e97173 100644 --- a/aten/src/TH/generic/THTensorRandom.cpp +++ b/aten/src/TH/generic/THTensorRandom.cpp @@ -138,16 +138,6 @@ void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lamb TH_TENSOR_APPLY(real, self, *self_data = (real)THRandom_exponential(_generator, lambda);); } -void THTensor_(standard_gamma)(THTensor *self, THGenerator *_generator, THTensor *alpha) -{ - std::lock_guard lock(_generator->mutex); - THTensor_(resizeAs)(self, alpha); - TH_TENSOR_APPLY2(real, self, real, alpha, { - const real sample = THRandom_standard_gamma(_generator, *alpha_data); - *self_data = sample > 0 ? sample : TH_REAL_MIN; - }); -} - #undef TH_REAL_MIN void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma) diff --git a/aten/src/TH/generic/THTensorRandom.h b/aten/src/TH/generic/THTensorRandom.h index 646e497f325f81..dc6bdafaae703c 100644 --- a/aten/src/TH/generic/THTensorRandom.h +++ b/aten/src/TH/generic/THTensorRandom.h @@ -18,7 +18,6 @@ TH_API void THTensor_(normal_means)(THTensor *self, THGenerator *gen, THTensor * TH_API void THTensor_(normal_stddevs)(THTensor *self, THGenerator *gen, double mean, THTensor *stddevs); TH_API void THTensor_(normal_means_stddevs)(THTensor *self, THGenerator *gen, THTensor *means, THTensor *stddevs); TH_API void THTensor_(exponential)(THTensor *self, THGenerator *_generator, double lambda); -TH_API void THTensor_(standard_gamma)(THTensor *self, THGenerator *_generator, THTensor *alpha); TH_API void THTensor_(cauchy)(THTensor *self, THGenerator *_generator, double median, double sigma); TH_API void THTensor_(logNormal)(THTensor *self, THGenerator *_generator, double mean, double stdv); TH_API void THTensor_(multinomial)(THLongTensor *self, THGenerator *_generator, THTensor *prob_dist, int n_sample, int with_replacement); diff --git a/test/test_distributions.py b/test/test_distributions.py index 718986dda45bb4..26a3108ab5cf7b 100644 --- a/test/test_distributions.py +++ b/test/test_distributions.py @@ -712,7 +712,7 @@ def test_poisson_sample(self): @unittest.skipIf(not TEST_CUDA, "CUDA not found") @unittest.skipIf(not TEST_NUMPY, "Numpy not found") def test_poisson_gpu_sample(self): - set_rng_seed(0) + set_rng_seed(1) for rate in [0.12, 0.9, 4.0]: self._check_sampler_discrete(Poisson(torch.Tensor([rate]).cuda()), scipy.stats.poisson(rate), @@ -1089,6 +1089,17 @@ def test_gamma_sample(self): scipy.stats.gamma(alpha, scale=1.0 / beta), 'Gamma(concentration={}, rate={})'.format(alpha, beta)) + @unittest.skipIf(not TEST_CUDA, "CUDA not found") + @unittest.skipIf(not TEST_NUMPY, "Numpy not found") + def test_gamma_gpu_sample(self): + set_rng_seed(0) + for alpha, beta in product([0.1, 1.0, 5.0], [0.1, 1.0, 10.0]): + a, b = torch.Tensor([alpha]).cuda(), torch.Tensor([beta]).cuda() + self._check_sampler_sampler(Gamma(a, b), + scipy.stats.gamma(alpha, scale=1.0 / beta), + 'Gamma(alpha={}, beta={})'.format(alpha, beta), + failure_rate=1e-4) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") def test_pareto(self): scale = Variable(torch.randn(2, 3).abs(), requires_grad=True) diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index fbd4efdb08c795..7f867d3b7a2992 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -663,8 +663,8 @@ self: not_implemented("_sparse_mask") mask: not_implemented("_sparse_mask") -- name: _standard_gamma(Tensor self, Generator generator) - self: grad * self._standard_gamma_grad(output) +- name: standard_gamma(Tensor self, Generator generator) + self: grad * self._standard_gamma_grad(result) - name: _standard_gamma_grad(Tensor self, Tensor output) self: not_implemented("_standard_gamma_grad") diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index aa957711ac54af..249416edd290f0 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -302,7 +302,6 @@ IMPLEMENT_STATELESS(bmm) // TODO: this doesn't implement options that return numbers! IMPLEMENT_STATELESS(multinomial) IMPLEMENT_STATELESS(normal) -IMPLEMENT_STATELESS(_standard_gamma) IMPLEMENT_STATELESS(_dirichlet_grad) IMPLEMENT_STATELESS(bernoulli) IMPLEMENT_STATELESS(range) @@ -752,7 +751,6 @@ static PyMethodDef TorchMethods[] = { {"bmm", (PyCFunction)THPModule_bmm, METH_VARARGS | METH_KEYWORDS, NULL}, {"multinomial", (PyCFunction)THPModule_multinomial, METH_VARARGS | METH_KEYWORDS, NULL}, {"normal", (PyCFunction)THPModule_normal, METH_VARARGS | METH_KEYWORDS, NULL}, - {"_standard_gamma", (PyCFunction)THPModule__standard_gamma, METH_VARARGS | METH_KEYWORDS, NULL}, {"_dirichlet_grad", (PyCFunction)THPModule__dirichlet_grad, METH_VARARGS | METH_KEYWORDS, NULL}, {"bernoulli", (PyCFunction)THPModule_bernoulli, METH_VARARGS | METH_KEYWORDS, NULL}, {"rand", (PyCFunction)THPModule_rand, METH_VARARGS | METH_KEYWORDS, NULL}, diff --git a/torch/csrc/generic/methods/TensorRandom.cwrap b/torch/csrc/generic/methods/TensorRandom.cwrap index a0ee66ec033481..62dc18dd273671 100644 --- a/torch/csrc/generic/methods/TensorRandom.cwrap +++ b/torch/csrc/generic/methods/TensorRandom.cwrap @@ -210,26 +210,6 @@ default: 1 ]] -[[ - name: _standard_gamma - types: - - floating_point - backends: - - CPU - return: argument 0 - variants: - - function - options: - - cname: standard_gamma - arguments: - - arg: THTensor* output - output: True - - arg: THGenerator* generator - default: THPGenerator_TH_CData(THPDefaultGenerator) - kwarg_only: True - - THTensor* alpha -]] - [[ name: _dirichlet_grad types: diff --git a/torch/distributions/dirichlet.py b/torch/distributions/dirichlet.py index 87b7bb9204fc2d..0c945f37d4c10d 100644 --- a/torch/distributions/dirichlet.py +++ b/torch/distributions/dirichlet.py @@ -5,11 +5,12 @@ from torch.autograd.function import once_differentiable from torch.distributions import constraints from torch.distributions.exp_family import ExponentialFamily +from torch.distributions.gamma import _standard_gamma from torch.distributions.utils import _finfo, broadcast_all def _dirichlet_sample_nograd(concentration): - probs = torch._C._standard_gamma(concentration) + probs = _standard_gamma(concentration) probs /= probs.sum(-1, True) eps = _finfo(probs).eps return probs.clamp_(min=eps, max=1 - eps) diff --git a/torch/distributions/gamma.py b/torch/distributions/gamma.py index 60ec109ef1a359..8123e4bce46ae6 100644 --- a/torch/distributions/gamma.py +++ b/torch/distributions/gamma.py @@ -10,8 +10,8 @@ def _standard_gamma(concentration): if not isinstance(concentration, Variable): - return torch._C._standard_gamma(concentration) - return concentration._standard_gamma() + return torch._C._VariableFunctions.standard_gamma(Variable(concentration)).data + return torch._C._VariableFunctions.standard_gamma(concentration) class Gamma(ExponentialFamily): From 6ff5d33ed7e00df70b8914c9e23d49d71077eda5 Mon Sep 17 00:00:00 2001 From: Rachit Singh Date: Wed, 31 Jan 2018 13:21:25 +0000 Subject: [PATCH 8/8] Attempt fixes for AT_CUDA_ENABLED changes --- aten/src/ATen/SharedDist.cu | 24 +++++++++ aten/src/ATen/native/Distributions.cpp | 55 +++++++++++++++++-- aten/src/ATen/native/Distributions.cuh | 63 ---------------------- aten/src/ATen/native/cuda/Distributions.cu | 6 ++- 4 files changed, 80 insertions(+), 68 deletions(-) create mode 100644 aten/src/ATen/SharedDist.cu delete mode 100644 aten/src/ATen/native/Distributions.cuh diff --git a/aten/src/ATen/SharedDist.cu b/aten/src/ATen/SharedDist.cu new file mode 100644 index 00000000000000..3bba673e5b073c --- /dev/null +++ b/aten/src/ATen/SharedDist.cu @@ -0,0 +1,24 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" +#include "ATen/NativeFunctions.h" +#include "ATen/Dispatch.h" +#include "ATen/Config.h" + +#include + +namespace at { + namespace native { + namespace dist { + template + struct baseSampler { + nvstd::function sampler; + baseSampler(nvstd::function sampler): sampler(sampler) {} + precision_t sample() { + return sampler(); + } + }; + } + } +} + +// this version is only linked if CUDA is enabled, so we can safely just use CUDA features here diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index 67fc5f607450ab..7ca705e267ceca 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -1,6 +1,7 @@ #include "ATen/ATen.h" #include "ATen/CPUApplyUtils.h" #include "ATen/Dispatch.h" +#include "ATen/Config.h" #include "ATen/ExpandUtils.h" #include "ATen/NativeFunctions.h" @@ -8,7 +9,7 @@ #include "ATen/CheckGenerator.h" #include "ATen/Generator.h" -#include +#include #include "TH/THRandom.h" @@ -121,12 +122,23 @@ Tensor _standard_gamma_grad_cuda(const Tensor& self, const Tensor& output) { /* * This section is a counterpart to Distributions.cu - * */ namespace dist { - // The function `sample_poisson` - // is adapted from Numpy's distributions.c implementation. + +#if !AT_CUDA_ENABLED() + template + struct baseSampler { + std::function sampler; + baseSampler(std::function sampler): sampler(sampler) {} + precision_t sample() { + return sampler(); + } + }; +#endif + + // The functions `sample_poisson`, `sample_gamma` + // are adapted from Numpy's distributions.c implementation. // It is MIT licensed, so here is the copyright: /* Copyright 2005 Robert Kern (robert.kern@gmail.com) @@ -151,6 +163,41 @@ namespace dist { * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ + + template +#if AT_CUDA_ENABLED() + __host__ __device__ +#endif + precision_t sample_gamma(precision_t alpha, baseSampler& standard_uniform, baseSampler& standard_normal) { + + precision_t scale = 1.0; + + // Boost alpha for higher acceptance probability. + if (alpha < 1.0) { + scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha); + alpha += 1.0; + } + + // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) + // doi:10.1145/358407.358414 + const precision_t d = alpha - 1.0 / 3.0; + const precision_t c = 1.0 / ::sqrt(9.0 * d); + for (;;) { + precision_t x, y; + do { + x = standard_normal.sample(); + y = 1.0 + c * x; + } while (y <= 0); + const precision_t v = y * y * y; + const precision_t u = 1 - standard_uniform.sample(); + const precision_t xx = x * x; + if (u < 1.0 - 0.0331 * xx * xx) + return scale * d * v; + if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v))) + return scale * d * v; + } + } + THGenerator * get_generator(Generator *gen) { auto default_gen = &at::globalContext().defaultGenerator(Backend::CPU); auto gen_ = check_generator(gen, default_gen); diff --git a/aten/src/ATen/native/Distributions.cuh b/aten/src/ATen/native/Distributions.cuh deleted file mode 100644 index 7d6da545eea228..00000000000000 --- a/aten/src/ATen/native/Distributions.cuh +++ /dev/null @@ -1,63 +0,0 @@ -#include "ATen/Config.h" -#include -#if AT_CUDA_ENABLED() -#include -#endif - -namespace at { -namespace native { -namespace dist { - - // this wraps sampling primitives to expose a common interface - template - struct baseSampler { -#if AT_CUDA_ENABLED() - nvstd::function sampler; - __device__ baseSampler(nvstd::function sampler): sampler(sampler) {} - __device__ precision_t sample() { - return sampler(); - } -#else - std::function sampler; - baseSampler(std::function sampler): sampler(sampler) {} - precision_t sample() { - return sampler(); - } -#endif - }; - - template -#if AT_CUDA_ENABLED() - __host__ __device__ -#endif - precision_t sample_gamma(precision_t alpha, baseSampler& standard_uniform, baseSampler& standard_normal) { - precision_t scale = 1.0; - - // Boost alpha for higher acceptance probability. - if (alpha < 1.0) { - scale *= ::pow(1 - standard_uniform.sample(), 1.0 / alpha); - alpha += 1.0; - } - - // This implements the acceptance-rejection method of Marsaglia and Tsang (2000) - // doi:10.1145/358407.358414 - const precision_t d = alpha - 1.0 / 3.0; - const precision_t c = 1.0 / ::sqrt(9.0 * d); - for (;;) { - precision_t x, y; - do { - x = standard_normal.sample(); - y = 1.0 + c * x; - } while (y <= 0); - const precision_t v = y * y * y; - const precision_t u = 1 - standard_uniform.sample(); - const precision_t xx = x * x; - if (u < 1.0 - 0.0331 * xx * xx) - return scale * d * v; - if (::log(u) < 0.5 * xx + d * (1.0 - v + ::log(v))) - return scale * d * v; - } - } -} // dist -} // native -} // at diff --git a/aten/src/ATen/native/cuda/Distributions.cu b/aten/src/ATen/native/cuda/Distributions.cu index 81f26715dfee95..b6e62c7b6d70a3 100644 --- a/aten/src/ATen/native/cuda/Distributions.cu +++ b/aten/src/ATen/native/cuda/Distributions.cu @@ -1,5 +1,8 @@ +#include "ATen/ATen.h" +#include "ATen/TensorUtils.h" #include "ATen/NativeFunctions.h" #include "ATen/Dispatch.h" +#include "ATen/Config.h" #include "ATen/cuda/CUDAApplyUtils.cuh" #include #include @@ -8,7 +11,8 @@ #include #include -#include "ATen/native/Distributions.cuh" +#include "ATen/SharedDist.cu" +#include "ATen/native/Distributions.cpp" #include