Commit c63d435

Use user-defined device choice during prepare model (#726)

1. Added sys_config.device and set it to the user-defined device choice ("CUDA" or "CPU") in prepare and run_node.
2. Updated conv_mixin.py to use sys_config.device and eliminate an unnecessary transpose.
3. Updated pool_mixin.py to eliminate the mandatory conversion of the input x to channel-last (NHWC) format. Added a function that converts NHWC indices to NCHW indices for MaxPool_with_Argmax, fixing issue #719.
4. Updated unpool_mixin.py to eliminate the mandatory conversion of the input x to channel-last (NHWC) format.
5. Updated dilated_pooling.py to process NCHW-format input instead of NHWC for all pooling operators except MaxPool_with_Argmax and MaxPool_with_dilation_not_equal_to_1_and_spatial_size_equal_to_2 (tf.nn.max_pool_with_argmax and tf.nn.dilation2d only support NHWC format).
6. Added a dynamic_shape test for MaxPool_with_Argmax.
7. Set the device in run_node for operators that behave differently in NCHW/NHWC format in test_node.py.

Signed-off-by: Winnie Tsang <[email protected]>
Co-authored-by: Chin Huang <[email protected]>
1 parent 651c90f commit c63d435

12 files changed: +706 −552 lines changed

doc/API.md (+1 −1)

```diff
@@ -20,7 +20,7 @@ _params_:
 `model` : The ONNX model to be converted.
 
 
-`device` : The device to execute this model on.
+`device` : The device to execute this model on. It can be either CPU (default) or CUDA.
 
 
 `strict` : Whether to enforce semantic equivalence between the original model
```
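
The documented parameter maps directly onto `prepare`; a minimal usage sketch (the model path is illustrative):

```python
import onnx
from onnx_tf.backend import prepare

model = onnx.load("model.onnx")         # illustrative path
tf_rep = prepare(model, device="CUDA")  # "CPU" is the default
```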

doc/CLI.md (+2 −2)

```diff
@@ -40,8 +40,8 @@ optional arguments:
                         Output directory.
 
 backend arguments (onnx -> tf):
-  --device DEVICE       The device to execute this model on. (from
-                        onnx_tf.backend.prepare)
+  --device DEVICE       The device to execute this model on. It can be either
+                        CPU (default) or CUDA. (from onnx_tf.backend.prepare)
   --strict STRICT       Whether to enforce semantic equivalence between the
                         original model and the converted tensorflow model,
                         defaults to True (yes, enforce semantic equivalence).
```
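
On the command line the same choice is passed as a flag; an illustrative invocation (the input/output paths and the `-i`/`-o` short flags are assumptions, not shown in this diff):

```sh
onnx-tf convert -i model.onnx -o output_dir --device CUDA
```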

onnx_tf/backend.py (+3 −1)

```diff
@@ -49,7 +49,7 @@ def prepare(cls,
     the converted representation.
 
     :param model: The ONNX model to be converted.
-    :param device: The device to execute this model on.
+    :param device: The device to execute this model on. It can be either CPU (default) or CUDA.
     :param strict: Whether to enforce semantic equivalence between the original model
       and the converted tensorflow model, defaults to True (yes, enforce semantic equivalence).
       Changing to False is strongly discouraged.
@@ -65,6 +65,7 @@ def prepare(cls,
     common.logger.setLevel(logging_level)
     common.logger.handlers[0].setLevel(logging_level)
     common.sys_config.auto_cast = auto_cast
+    common.sys_config.device = device
 
     return cls.onnx_model_to_tensorflow_rep(model, strict, **kwargs)
 
@@ -184,6 +185,7 @@ def __call__(self, **input_dict):
         return cls._onnx_node_to_tensorflow_op(self.node, input_dict)
 
     super(TensorflowBackend, cls).run_node(node, inputs, device)
+    common.sys_config.device = device
 
     node = OnnxNode(node)
     input_tensors = []
```
onnx_tf/common/__init__.py (+3 −2)

```diff
@@ -28,6 +28,8 @@ class SysConfig:
 
   def __init__(self):
     self.auto_cast = False
+    self.device = 'CPU'
+
 
 
 sys_config = SysConfig()
@@ -160,7 +162,7 @@ def get_data_format(x_rank):
   sp_dim_string = "".join(reversed(sp_dim_lst))
   storage_format = "NC" + sp_dim_string
 
-  if supports_device("CUDA"):
+  if sys_config.device == "CUDA":
     compute_format = "NC" + sp_dim_string
   else:
     compute_format = "N" + sp_dim_string + "C"
@@ -169,7 +171,6 @@ def get_data_format(x_rank):
 
 def supports_device(device):
   """ Check if support target device.
-
   :param device: CUDA or CPU.
   :return: If supports.
   """
```

onnx_tf/common/pooling_helper.py (+7 −1)

```diff
@@ -158,6 +158,9 @@ def py_pool(input, kernel_shape, strides=None, dilations=None,
 
   def _loop_over_output(batch, channel):
     dims = [range(output_sp_shape[d]) for d in range(spatial_size)]
+    image_size = 1
+    for d in input_shape[2:]:
+      image_size *= d
     for counters in itertools.product(*dims):
       input_ranges = []
       for dim in range(spatial_size):
@@ -189,7 +192,10 @@ def _loop_over_output(batch, channel):
         else:
           if val > maxval:
             maxval = val
-            ind = 0
+            # batch_offset = batch * C * image_size
+            # channel_offset = channel * image_size
+            # ind = batch_offset + channel_offset
+            ind = image_size * (batch * input_shape[1] + channel)
             for i in range(spatial_size):
               coef = 1
               for j in range(i+1, spatial_size):
```
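
Written out, the new argmax index is the row-major NCHW flat index: the batch/channel offset computed above plus the spatial offset accumulated by the loop that follows it. A worked example, using a hypothetical standalone helper (not repo code) that mirrors the same math:

```python
# Hypothetical helper mirroring the flat-index math in py_pool above.
def nchw_flat_index(batch, channel, coords, input_shape):
    C, spatial = input_shape[1], input_shape[2:]
    image_size = 1
    for d in spatial:                          # product of spatial dims
        image_size *= d
    ind = image_size * (batch * C + channel)   # batch + channel offsets
    coef = image_size
    for i, c in enumerate(coords):             # row-major spatial offset
        coef //= spatial[i]
        ind += c * coef
    return ind

# Shape (N=2, C=3, H=4, W=4): element (n=1, c=2, h=0, w=1) -> 16*5 + 1 = 81
assert nchw_flat_index(1, 2, (0, 1), (2, 3, 4, 4)) == 81
```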

onnx_tf/handlers/backend/conv_mixin.py (+5 −6)

```diff
@@ -1,10 +1,10 @@
 import tensorflow as tf
 
+from onnx_tf.common import exception
 from onnx_tf.common import get_data_format
 from onnx_tf.common import get_perm_from_formats
-from onnx_tf.common import supports_device
-from onnx_tf.common import exception
 from onnx_tf.common.tf_helper import tf_shape
+from onnx_tf.common import sys_config
 from .broadcast_mixin import BroadcastMixin
 from .pad_mixin import PadMixin
 
@@ -31,7 +31,6 @@ def conv(cls, node, input_dict, transpose=False):
     x_shape = tf_shape(x, tf.int32)
     spatial_size = x_rank - 2
 
-    support_cuda = supports_device("CUDA")
     storage_format, compute_format = get_data_format(x_rank)
     compute_c_idx = compute_format.find("C")
     spatial_format = "".join([d for d in compute_format if d not in ["N", "C"]])
@@ -94,7 +93,7 @@ def conv(cls, node, input_dict, transpose=False):
 
     weight_groups = tf.split(weights, num_or_size_splits=group, axis=-1)
 
-    if support_cuda:
+    if sys_config.device == 'CUDA':
       xs = tf.split(x, num_or_size_splits=group, axis=1)
     else:
       x = tf.transpose(x,
@@ -236,7 +235,7 @@ def conv(cls, node, input_dict, transpose=False):
     ]
 
    if len(node.inputs) == 2:
-      if support_cuda:
+      if sys_config.device == 'CUDA':
         output = tf.concat(convolved, axis=1)
       else:
         output = tf.concat(convolved, axis=-1)
@@ -247,7 +246,7 @@ def conv(cls, node, input_dict, transpose=False):
       bias = input_dict[node.inputs[2]]
      bias = cls.explicit_broadcast([x, bias], compute_c_idx)
 
-      if support_cuda:
+      if sys_config.device == 'CUDA':
         output = tf.concat(convolved, axis=1)
         output = tf.add(output, bias)
       else:
```
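
The device check replaces the old probe at each of the three sites above. A condensed sketch of the pattern, with illustrative tensor shapes and `group=1` (names borrowed from conv_mixin.py):

```python
import tensorflow as tf
from onnx_tf.common import get_data_format, get_perm_from_formats, sys_config

x = tf.zeros([1, 3, 8, 8])                # ONNX storage layout is NCHW
storage_format, compute_format = get_data_format(4)

if sys_config.device == "CUDA":
    # Already channel-first: split groups on the channel axis, no transpose.
    xs = tf.split(x, num_or_size_splits=1, axis=1)
else:
    # Transpose once to the NHWC compute format, then split on the last axis.
    x = tf.transpose(x, perm=get_perm_from_formats(storage_format, compute_format))
    xs = tf.split(x, num_or_size_splits=1, axis=-1)
```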
