
Commit 610feae

Merge branch 'master' into fix_instance_norm
2 parents 24877e8 + c63d435 commit 610feae

23 files changed: +808 -586 lines

doc/API.md (+1 -1)
@@ -20,7 +20,7 @@ _params_:
 `model` : The ONNX model to be converted.
 
 
-`device` : The device to execute this model on.
+`device` : The device to execute this model on. It can be either CPU (default) or CUDA.
 
 
 `strict` : Whether to enforce semantic equivalence between the original model
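
For context, a minimal usage sketch of the documented parameter (the file name "model.onnx" is a placeholder, not part of this commit):

    import onnx
    from onnx_tf.backend import prepare

    onnx_model = onnx.load("model.onnx")        # placeholder path
    tf_rep = prepare(onnx_model, device="CPU")  # or device="CUDA" to target the GPU
    # then tf_rep.run(...) or tf_rep.export_graph(...) as usual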

doc/CLI.md (+2 -2)
@@ -40,8 +40,8 @@ optional arguments:
                         Output directory.
 
 backend arguments (onnx -> tf):
-  --device DEVICE       The device to execute this model on. (from
-                        onnx_tf.backend.prepare)
+  --device DEVICE       The device to execute this model on. It can be either
+                        CPU (default) or CUDA. (from onnx_tf.backend.prepare)
   --strict STRICT       Whether to enforce semantic equivalence between the
                         original model and the converted tensorflow model,
                         defaults to True (yes, enforce semantic equivalence).
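
A rough command-line equivalent (a sketch; "model.onnx" and "out_dir" are placeholders):

    onnx-tf convert --infile model.onnx --outdir out_dir --device CUDA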

doc/support_status.md (+12 -12)
@@ -1,9 +1,9 @@
 # ONNX-Tensorflow Support Status
 |||
 |-:|:-|
-|ONNX-Tensorflow Version|Master ( commit id: 6bfd631e0daedbc773b76636a5ea19e77a4b63ed )|
-|ONNX Version|Master ( commit id: b2ed660d0a065b8346816f2c3a95d79ca79b88c9 )|
-|Tensorflow Version|v2.3.0|
+|ONNX-Tensorflow Version|Master ( commit id: f64afb48034af7121341f4ba5d6f56e275c5aedb )|
+|ONNX Version|Master ( commit id: a7a0fec7f25cae567429af62b7eaaee1c3f0e247 )|
+|Tensorflow Version|v2.3.1|
 
 Notes:
 * Values that are new or updated from a previous opset version are in bold.
@@ -51,7 +51,7 @@ Notes:
 |Div|**1**|1|1|1|1|**6**|**7**|7|7|7|7|7|**13**:small_red_triangle:|Div|
 |Dropout|**1**|1|1|1|1|**6**|**7**|7|7|**10**|10|**12**|**13**|Dropout|
 |DynamicQuantizeLinear|-|-|-|-|-|-|-|-|-|-|**11**|11|11|DynamicQuantizeLinear|
-|Einsum|-|-|-|-|-|-|-|-|-|-|-|**12**:small_red_triangle:|12:small_red_triangle:|Einsum|
+|Einsum|-|-|-|-|-|-|-|-|-|-|-|**12**|12|Einsum|
 |Elu|**1**|1|1|1|1|**6**|6|6|6|6|6|6|6|Elu|
 |Equal|**1**|1|1|1|1|1|**7**|7|7|7|**11**|11|**13**|Equal|
 |Erf|-|-|-|-|-|-|-|-|**9**|9|9|9|**13**|Erf|
@@ -72,11 +72,11 @@ Notes:
 |GreaterOrEqual|-|-|-|-|-|-|-|-|-|-|-|**12**|12|GreaterOrEqual|
 |HardSigmoid|**1**|1|1|1|1|**6**|6|6|6|6|6|6|6|HardSigmoid|
 |Hardmax|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|Hardmax|
-|Identity|**1**|1|1|1|1|1|1|1|1|1|1|1|**13**:small_red_triangle:|Identity|
+|Identity|**1**|1|1|1|1|1|1|1|1|1|1|1|**13**|Identity|
 |If|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**|If|
 |InstanceNormalization|**1**|1|1|1|1|**6**|6|6|6|6|6|6|6|InstanceNormalization|
 |IsInf|-|-|-|-|-|-|-|-|-|**10**|10|10|10|IsInf|
-|IsNaN|-|-|-|-|-|-|-|-|**9**|9|9|9|**13**:small_red_triangle:|IsNaN|
+|IsNaN|-|-|-|-|-|-|-|-|**9**|9|9|9|**13**|IsNaN|
 |LRN|**1**|1|1|1|1|1|1|1|1|1|1|1|**13**|LRN|
 |LSTM|**1**:small_orange_diamond:|1:small_orange_diamond:|1:small_orange_diamond:|1:small_orange_diamond:|1:small_orange_diamond:|1:small_orange_diamond:|**7**:small_orange_diamond:|7:small_orange_diamond:|7:small_orange_diamond:|7:small_orange_diamond:|7:small_orange_diamond:|7:small_orange_diamond:|7:small_orange_diamond:|LSTM|
 |LeakyRelu|**1**|1|1|1|1|**6**|6|6|6|6|6|6|6|LeakyRelu|
@@ -100,7 +100,7 @@ Notes:
 |Mul|**1**|1|1|1|1|**6**|**7**|7|7|7|7|7|**13**|Mul|
 |Multinomial|-|-|-|-|-|-|**7**:small_red_triangle:|7:small_red_triangle:|7:small_red_triangle:|7:small_red_triangle:|7:small_red_triangle:|7:small_red_triangle:|7:small_red_triangle:|Multinomial|
 |Neg|**1**|1|1|1|1|**6**|6|6|6|6|6|6|**13**|Neg|
-|NegativeLogLikelihoodLoss|-|-|-|-|-|-|-|-|-|-|-|**12**:small_red_triangle:|12:small_red_triangle:|NegativeLogLikelihoodLoss|
+|NegativeLogLikelihoodLoss|-|-|-|-|-|-|-|-|-|-|-|**12**:small_red_triangle:|**13**:small_red_triangle:|NegativeLogLikelihoodLoss|
 |NonMaxSuppression|-|-|-|-|-|-|-|-|-|**10**|**11**|11|11|NonMaxSuppression|
 |NonZero|-|-|-|-|-|-|-|-|**9**|9|9|9|**13**:small_red_triangle:|NonZero|
 |Not|**1**|1|1|1|1|1|1|1|1|1|1|1|1|Not|
@@ -123,10 +123,10 @@ Notes:
 |ReduceL2|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceL2|
 |ReduceLogSum|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceLogSum|
 |ReduceLogSumExp|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceLogSumExp|
-|ReduceMax|**1**|1|1|1|1|1|1|1|1|1|**11**|**12**|**13**:small_red_triangle:|ReduceMax|
-|ReduceMean|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceMean|
-|ReduceMin|**1**|1|1|1|1|1|1|1|1|1|**11**|**12**|**13**:small_red_triangle:|ReduceMin|
-|ReduceProd|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceProd|
+|ReduceMax|**1**|1|1|1|1|1|1|1|1|1|**11**|**12**|**13**|ReduceMax|
+|ReduceMean|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**|ReduceMean|
+|ReduceMin|**1**|1|1|1|1|1|1|1|1|1|**11**|**12**|**13**|ReduceMin|
+|ReduceProd|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**|ReduceProd|
 |ReduceSum|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceSum|
 |ReduceSumSquare|**1**|1|1|1|1|1|1|1|1|1|**11**|11|**13**:small_red_triangle:|ReduceSumSquare|
 |Relu|**1**|1|1|1|1|**6**|6|6|6|6|6|6|**13**:small_red_triangle:|Relu|
@@ -179,7 +179,7 @@ Notes:
 |Where|-|-|-|-|-|-|-|-|**9**|9|9|9|9|Where|
 |Xor|**1**|1|1|1|1|1|**7**|7|7|7|7|7|7|Xor|
 
-ONNX-TF Supported Operators / ONNX Operators: 105 / 162
+ONNX-TF Supported Operators / ONNX Operators: 118 / 162
 
 Notes:
 1. Cast: Cast string to data types other than float32/float64/int32/int64 is not supported in Tensorflow

onnx_tf/backend.py (+3 -1)
@@ -49,7 +49,7 @@ def prepare(cls,
       the converted representation.
 
     :param model: The ONNX model to be converted.
-    :param device: The device to execute this model on.
+    :param device: The device to execute this model on. It can be either CPU (default) or CUDA.
     :param strict: Whether to enforce semantic equivalence between the original model
       and the converted tensorflow model, defaults to True (yes, enforce semantic equivalence).
       Changing to False is strongly discouraged.
@@ -65,6 +65,7 @@ def prepare(cls,
     common.logger.setLevel(logging_level)
     common.logger.handlers[0].setLevel(logging_level)
     common.sys_config.auto_cast = auto_cast
+    common.sys_config.device = device
 
     return cls.onnx_model_to_tensorflow_rep(model, strict, **kwargs)
 
@@ -184,6 +185,7 @@ def __call__(self, **input_dict):
         return cls._onnx_node_to_tensorflow_op(self.node, input_dict)
 
     super(TensorflowBackend, cls).run_node(node, inputs, device)
+    common.sys_config.device = device
 
     node = OnnxNode(node)
     input_tensors = []
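
The two new assignments record the caller's device choice in the shared config instead of probing the local machine; a small sketch of the intended effect (the printed value is what the code implies, not verified here, and "model.onnx" is a placeholder):

    import onnx
    from onnx_tf import common
    from onnx_tf.backend import prepare

    tf_rep = prepare(onnx.load("model.onnx"), device="CUDA")
    print(common.sys_config.device)  # expected "CUDA"; later read by get_data_format() and the handlers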

onnx_tf/backend_tf_module.py (+6 -4)
@@ -12,6 +12,8 @@ def __init__(self, handlers, opset, strict, graph_def, backend):
     self.graph_def = graph_def
     self.backend = backend
     self.outputs = []
+    self.initializer_dict = self._get_initializer_from_graph_and_subgraphs(
+        self.graph_def, dict())
 
   # get initializer from the main graph and all subgraphs in loop or if or scan
   # into tensor_dict
@@ -37,8 +39,8 @@ def _get_initializer_from_graph_and_subgraphs(self, graph, graph_tensor_dict):
 
   @tf.function
   def gen_tensor_dict(self, input_dict):
-    tensor_dict = self._get_initializer_from_graph_and_subgraphs(
-        self.graph_def, dict(input_dict))
+    tensor_dict = dict(input_dict)
+    tensor_dict.update(self.initializer_dict)
 
     for node in self.graph_def.node:
       onnx_node = OnnxNode(node)
@@ -54,8 +56,8 @@ def gen_tensor_dict(self, input_dict):
 
   @tf.function
   def __call__(self, **kwargs):
-    tensor_dict = self._get_initializer_from_graph_and_subgraphs(
-        self.graph_def, kwargs)
+    tensor_dict = kwargs
+    tensor_dict.update(self.initializer_dict)
 
     for node in self.graph_def.node:
       onnx_node = OnnxNode(node)
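
The change caches the initializer tensors once at construction time and merges them with the runtime inputs on every traced call, instead of re-walking the graph per call; an illustrative sketch of the pattern (simplified names, not the real class):

    class CachedGraph:
      """Collects constant initializers once, then reuses them on each call."""

      def __init__(self, graph_def, collect_initializers):
        # collect_initializers stands in for _get_initializer_from_graph_and_subgraphs
        self.graph_def = graph_def
        self.initializer_dict = collect_initializers(graph_def, dict())

      def __call__(self, **inputs):
        tensor_dict = dict(inputs)                 # runtime feeds
        tensor_dict.update(self.initializer_dict)  # cached constants
        return tensor_dict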

onnx_tf/common/__init__.py (+3 -2)
@@ -28,6 +28,8 @@ class SysConfig:
 
   def __init__(self):
     self.auto_cast = False
+    self.device = 'CPU'
+
 
 
 sys_config = SysConfig()
@@ -160,7 +162,7 @@ def get_data_format(x_rank):
   sp_dim_string = "".join(reversed(sp_dim_lst))
   storage_format = "NC" + sp_dim_string
 
-  if supports_device("CUDA"):
+  if sys_config.device == "CUDA":
     compute_format = "NC" + sp_dim_string
   else:
     compute_format = "N" + sp_dim_string + "C"
@@ -169,7 +171,6 @@ def get_data_format(x_rank):
 
 def supports_device(device):
   """ Check if support target device.
-
   :param device: CUDA or CPU.
   :return: If supports.
   """

onnx_tf/common/pooling_helper.py (+7 -1)
@@ -158,6 +158,9 @@ def py_pool(input, kernel_shape, strides=None, dilations=None,
 
   def _loop_over_output(batch, channel):
     dims = [range(output_sp_shape[d]) for d in range(spatial_size)]
+    image_size = 1
+    for d in input_shape[2:]:
+      image_size *= d
     for counters in itertools.product(*dims):
       input_ranges = []
       for dim in range(spatial_size):
@@ -189,7 +192,10 @@ def _loop_over_output(batch, channel):
           else:
             if val > maxval:
               maxval = val
-              ind = 0
+              # batch_offset = batch * C * image_size
+              # channel_offset = channel * image_size
+              # ind = batch_offset + channel_offset
+              ind = image_size * (batch * input_shape[1] + channel)
               for i in range(spatial_size):
                 coef = 1
                 for j in range(i+1, spatial_size):
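
The fix replaces the hard-coded `ind = 0` with the flattened offset of the current (batch, channel) image; a worked example of the arithmetic (shapes invented for illustration):

    # For an NCHW input of shape (N, C, H, W) = (2, 3, 4, 5), the row-major index
    # of element (batch=1, channel=2, h=3, w=4) is:
    N, C, H, W = 2, 3, 4, 5
    image_size = H * W                         # spatial size of one channel image
    batch, channel, h, w = 1, 2, 3, 4
    ind = image_size * (batch * C + channel)   # offset of this (batch, channel) image
    ind += h * W + w                           # spatial offset, added by the loop that follows
    assert ind == 119                          # same as ((1*3 + 2)*4 + 3)*5 + 4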

onnx_tf/handlers/backend/conv_mixin.py (+5 -6)
@@ -1,10 +1,10 @@
 import tensorflow as tf
 
+from onnx_tf.common import exception
 from onnx_tf.common import get_data_format
 from onnx_tf.common import get_perm_from_formats
-from onnx_tf.common import supports_device
-from onnx_tf.common import exception
 from onnx_tf.common.tf_helper import tf_shape
+from onnx_tf.common import sys_config
 from .broadcast_mixin import BroadcastMixin
 from .pad_mixin import PadMixin
 
@@ -31,7 +31,6 @@ def conv(cls, node, input_dict, transpose=False):
     x_shape = tf_shape(x, tf.int32)
     spatial_size = x_rank - 2
 
-    support_cuda = supports_device("CUDA")
     storage_format, compute_format = get_data_format(x_rank)
     compute_c_idx = compute_format.find("C")
     spatial_format = "".join([d for d in compute_format if d not in ["N", "C"]])
@@ -94,7 +93,7 @@ def conv(cls, node, input_dict, transpose=False):
 
     weight_groups = tf.split(weights, num_or_size_splits=group, axis=-1)
 
-    if support_cuda:
+    if sys_config.device == 'CUDA':
       xs = tf.split(x, num_or_size_splits=group, axis=1)
     else:
       x = tf.transpose(x,
@@ -236,7 +235,7 @@ def conv(cls, node, input_dict, transpose=False):
     ]
 
     if len(node.inputs) == 2:
-      if support_cuda:
+      if sys_config.device == 'CUDA':
        output = tf.concat(convolved, axis=1)
      else:
        output = tf.concat(convolved, axis=-1)
@@ -247,7 +246,7 @@ def conv(cls, node, input_dict, transpose=False):
      bias = input_dict[node.inputs[2]]
      bias = cls.explicit_broadcast([x, bias], compute_c_idx)
 
-      if support_cuda:
+      if sys_config.device == 'CUDA':
        output = tf.concat(convolved, axis=1)
        output = tf.add(output, bias)
      else:
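
Every device check in the handler now reads the shared config instead of calling `supports_device`; a minimal sketch of the resulting pattern (the helper name is invented for illustration):

    import tensorflow as tf
    from onnx_tf.common import sys_config

    def split_groups(x, group):
      # Channels sit on axis 1 (NCHW) when computing on CUDA, and on the last
      # axis (NHWC) otherwise - the same branch taken in conv_mixin above.
      axis = 1 if sys_config.device == 'CUDA' else -1
      return tf.split(x, num_or_size_splits=group, axis=axis)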
