InfiniTensor
diff --git a/‎src/02hardware/CMakeLists.txt
+4-6 b/‎src/02hardware/CMakeLists.txt
+4-6
diff --git a/‎src/02hardware/include/hardware/device.h
+1-1 b/‎src/02hardware/include/hardware/device.h
+1-1
diff --git a/‎src/02hardware/include/hardware/devices/nvidia.h
+1-1 b/‎src/02hardware/include/hardware/devices/nvidia.h
+1-1
diff --git a/‎src/02hardware/src/device.cc
+1-1 b/‎src/02hardware/src/device.cc
+1-1
diff --git a/‎src/02hardware/src/devices/mlu/device.cc
+20-4 b/‎src/02hardware/src/devices/mlu/device.cc
+20-4
diff --git a/‎src/02hardware/src/devices/mlu/functions.cc
-21 b/‎src/02hardware/src/devices/mlu/functions.cc
-21
diff --git a/‎src/02hardware/src/devices/mlu/functions.hh
-28 b/‎src/02hardware/src/devices/mlu/functions.hh
-28
diff --git a/‎src/02hardware/src/devices/mlu/memory.cc
+12-3 b/‎src/02hardware/src/devices/mlu/memory.cc
+12-3
diff --git a/‎src/02hardware/src/devices/nvidia/device.cc
+28-9 b/‎src/02hardware/src/devices/nvidia/device.cc
+28-9
diff --git a/‎src/02hardware/src/devices/nvidia/functions.cu
-19 b/‎src/02hardware/src/devices/nvidia/functions.cu
-19
diff --git a/‎src/02hardware/src/devices/nvidia/functions.cuh
-24 b/‎src/02hardware/src/devices/nvidia/functions.cuh
-24
diff --git a/‎src/02hardware/src/devices/nvidia/memory.cu renamed to ‎src/02hardware/src/devices/nvidia/memory.cc
+13-2 b/‎src/02hardware/src/devices/nvidia/memory.cu renamed to ‎src/02hardware/src/devices/nvidia/memory.cc
+13-2
diff --git a/‎src/02hardware/src/devices/nvidia/memory.cuh renamed to ‎src/02hardware/src/devices/nvidia/memory.hh b/‎src/02hardware/src/devices/nvidia/memory.cuh renamed to ‎src/02hardware/src/devices/nvidia/memory.hh
diff --git a/‎src/03runtime/include/runtime/stream.h
+4-2 b/‎src/03runtime/include/runtime/stream.h
+4-2
diff --git a/‎src/03runtime/src/stream.cc
+9-3 b/‎src/03runtime/src/stream.cc
+9-3
diff --git a/‎src/04kernel/include/kernel/collectors/simple_binary.h
+2 b/‎src/04kernel/include/kernel/collectors/simple_binary.h
+2
diff --git a/‎src/04kernel/src/collectors/simple_binary.cc
+2 b/‎src/04kernel/src/collectors/simple_binary.cc
+2
@@ -2,17 +2,15 @@ cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
 project(hardware VERSION 0.0.0 LANGUAGES CXX)
 message(STATUS "Project " ${PROJECT_NAME} " version " ${PROJECT_VERSION})
 
-# Source files
 file(GLOB_RECURSE HARDWARE_SRC src/*.cc src/*.cpp)
+add_library(hardware STATIC ${HARDWARE_SRC})
+target_link_libraries(hardware PUBLIC common)
+target_include_directories(hardware PUBLIC include)
 
 if(USE_CUDA)
-    file(GLOB_RECURSE HARDWARE_CUDA_SRC src/devices/nvidia/*.cu)
+    target_include_directories(hardware PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 endif()
 
-add_library(hardware STATIC ${HARDWARE_SRC} ${HARDWARE_CUDA_SRC} ${HARDWARE_BANG_SRC})
-target_link_libraries(hardware PUBLIC common)
-target_include_directories(hardware PUBLIC include)
-
 file(GLOB_RECURSE HARDWARE_TEST test/*.cpp)
 if(HARDWARE_TEST)
     add_executable(hardware_test ${HARDWARE_TEST})
 
@@ -52,7 +52,7 @@ namespace refactor::hardware {
 
         virtual ~Device() = default;
         virtual Type type() const noexcept = 0;
-        virtual void setContext() const noexcept;
+        virtual void setContext() const;
 
         Arc<Blob> malloc(size_t);
         Arc<Blob> absorb(Arc<Blob> &&);
 
@@ -8,7 +8,7 @@ namespace refactor::hardware {
     class Nvidia final : public Device {
     public:
         explicit Nvidia(int32_t card);
-        void setContext() const noexcept final;
+        void setContext() const final;
         Type type() const noexcept final {
             return Type::Nvidia;
         }
 
@@ -56,7 +56,7 @@ namespace refactor::hardware {
     Device::Device(decltype(_card) card, decltype(_mem) mem)
         : _card(card), _mem(std::move(mem)) {}
 
-    void Device::setContext() const noexcept {}
+    void Device::setContext() const {}
     auto Device::malloc(size_t size) -> Arc<Blob> {
         return Arc<Blob>(new Blob(this, size));
     }
 
@@ -1,15 +1,29 @@
 #include "functions.hh"
 #include "hardware/devices/mlu.h"
 #include "hardware/mem_pool.h"
+
+#ifdef USE_BANG
+#include "cnrt.h"
 #include "memory.hh"
 
+// Evaluates a CNRT call once and raises RUNTIME_ERROR, quoting the failing
+// expression, the CNRT error string and the numeric code, when the result is
+// not CNRT_RET_SUCCESS. Wrapped in do { } while (0) so the macro expands to a
+// single statement and `BANG_ASSERT(x);` is safe inside an unbraced if/else.
+#define BANG_ASSERT(STATUS)                                                              \
+    do {                                                                                 \
+        if (auto status = (STATUS); status != CNRT_RET_SUCCESS) {                        \
+            RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                      cnrtGetErrorStr(status), (int) status));           \
+        }                                                                                \
+    } while (0)
+
+#endif
 namespace refactor::hardware {
 
     static Arc<Memory> bangMemory(int32_t card) {
 #ifdef USE_BANG
-        ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
-        setDevice(card);
-        auto [free, total] = getMemInfo();
+        unsigned deviceCount;
+        BANG_ASSERT(cnrtGetDeviceCount(&deviceCount));
+        ASSERT(0 <= card && card < deviceCount, "Invalid card id: {}", card);
+        BANG_ASSERT(cnrtSetDevice(card));
+
+        size_t free, total;
+        BANG_ASSERT(cnrtMemGetInfo(&free, &total));
         auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
         fmt::println("initializing Cambricon MLU {}, memory {} / {}, alloc {}",
                      card, free, total, size);
@@ -25,7 +39,9 @@ namespace refactor::hardware {
     Mlu::Mlu(int32_t card) : Device(card, bangMemory(card)) {}
 
     void Mlu::setContext() const noexcept {
-        setDevice(_card);
+#ifdef USE_BANG
+        // Makes this device's card the current CNRT device for the calling thread.
+        // NOTE(review): this definition is still `noexcept`, while the Nvidia
+        // counterpart dropped it in this change. If RUNTIME_ERROR throws, a
+        // failing cnrtSetDevice here terminates the process instead of
+        // propagating; confirm whether mlu.h should drop `noexcept` as well.
+        BANG_ASSERT(cnrtSetDevice(_card));
+#endif
     }
 
 }// namespace refactor::hardware
@@ -1,8 +1,17 @@
+#ifdef USE_BANG
+
 #include "memory.hh"
-#include "functions.hh"
+#include "cnrt.h"
+#include "common.h"
+
+// Evaluates a CNRT call once and raises RUNTIME_ERROR, quoting the failing
+// expression, the CNRT error string and the numeric code, when the result is
+// not CNRT_RET_SUCCESS. Wrapped in do { } while (0) so the macro expands to a
+// single statement and `BANG_ASSERT(x);` is safe inside an unbraced if/else.
+#define BANG_ASSERT(STATUS)                                                              \
+    do {                                                                                 \
+        if (auto status = (STATUS); status != CNRT_RET_SUCCESS) {                        \
+            RUNTIME_ERROR(fmt::format("bang failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                      cnrtGetErrorStr(status), (int) status));           \
+        }                                                                                \
+    } while (0)
 
 namespace refactor::hardware {
-#ifdef USE_BANG
+
     using M = MluMemory;
 
     void *M::malloc(size_t size) {
@@ -28,6 +37,6 @@ namespace refactor::hardware {
                                CNRT_MEM_TRANS_DIR_PEER2PEER));
         return dst;
     }
-#endif
 
 }// namespace refactor::hardware
+#endif
@@ -1,31 +1,50 @@
 #include "functions.cuh"
 #include "hardware/devices/nvidia.h"
 #include "hardware/mem_pool.h"
-#include "memory.cuh"
+
+#ifdef USE_CUDA
+#include "memory.hh"
+#include <cuda_runtime.h>
+
+// Evaluates a CUDA runtime call once and raises RUNTIME_ERROR, quoting the
+// failing expression, the CUDA error string and the numeric code, when the
+// result is not cudaSuccess. Wrapped in do { } while (0) so the macro expands
+// to a single statement and `CUDA_ASSERT(x);` is safe inside an unbraced if/else.
+#define CUDA_ASSERT(STATUS)                                                              \
+    do {                                                                                 \
+        if (auto status = (STATUS); status != cudaSuccess) {                             \
+            RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                      cudaGetErrorString(status), (int) status));        \
+        }                                                                                \
+    } while (0)
+#endif
 
 namespace refactor::hardware {
 
     static Arc<Memory> cudaMemory(int32_t card) {
 #ifdef USE_CUDA
-        ASSERT(0 <= card && card < getDeviceCount(), "Invalid card id: {}", card);
-        setDevice(card);
-        auto [free, total] = getMemInfo();
+        // Builds the memory pool backing the Nvidia device `card`.
+        // Without USE_CUDA this function returns nullptr.
+        //
+        // Check the requested card against the number of visible devices
+        // before making it the current device.
+        int deviceCount = 0;
+        CUDA_ASSERT(cudaGetDeviceCount(&deviceCount));
+        ASSERT(0 <= card && card < deviceCount, "Invalid card id: {}", card);
+        CUDA_ASSERT(cudaSetDevice(card));
+
+        size_t free = 0, total = 0;
+        CUDA_ASSERT(cudaMemGetInfo(&free, &total));
+        // Pool size: the larger of 5 GiB and 4/5 of total memory, capped by
+        // what is currently free.
         auto size = std::min(free, std::max(5ul << 30, total * 4 / 5));
-        fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}",
-                     card, free, total, size);
+        // Query the properties of the selected card, not device 0, so that on
+        // a multi-GPU host the pool alignment comes from the device that
+        // actually owns the pool.
+        cudaDeviceProp prop;
+        CUDA_ASSERT(cudaGetDeviceProperties(&prop, card));
+        size_t alignment = prop.textureAlignment;
+        fmt::println("initializing Nvidia GPU {}, memory {} / {}, alloc {}, alignment {}",
+                     card, free, total, size, alignment);
         return std::make_shared<MemPool>(
             std::make_shared<NvidiaMemory>(),
             size,
-            256ul);
+            alignment);
 #else
         return nullptr;
 #endif
     }
 
     Nvidia::Nvidia(int32_t card) : Device(card, cudaMemory(card)) {}
 
-    void Nvidia::setContext() const noexcept {
-        setDevice(_card);
+    void Nvidia::setContext() const {
+#ifdef USE_CUDA
+        CUDA_ASSERT(cudaSetDevice(_card));
+#endif
     }
 
 }// namespace refactor::hardware
@@ -1,5 +1,14 @@
-#include "functions.cuh"
-#include "memory.cuh"
+#ifdef USE_CUDA
+
+#include "memory.hh"
+#include "common.h"
+#include <cuda_runtime.h>
+
+// Evaluates a CUDA runtime call once and raises RUNTIME_ERROR, quoting the
+// failing expression, the CUDA error string and the numeric code, when the
+// result is not cudaSuccess. Wrapped in do { } while (0) so the macro expands
+// to a single statement and `CUDA_ASSERT(x);` is safe inside an unbraced if/else.
+#define CUDA_ASSERT(STATUS)                                                              \
+    do {                                                                                 \
+        if (auto status = (STATUS); status != cudaSuccess) {                             \
+            RUNTIME_ERROR(fmt::format("cuda failed on \"" #STATUS "\" with \"{}\" ({})", \
+                                      cudaGetErrorString(status), (int) status));        \
+        }                                                                                \
+    } while (0)
 
 namespace refactor::hardware {
     using M = NvidiaMemory;
@@ -29,3 +38,5 @@ namespace refactor::hardware {
     }
 
 }// namespace refactor::hardware
+
+#endif
@@ -42,9 +42,11 @@ namespace refactor::runtime {
                decltype(_device));
 
         decltype(_graph) const &graph() const noexcept { return _graph; }
-        void setData(count_t, void const *, size_t);
+        auto setData(count_t, size_t) -> Arc<hardware::Device::Blob>;
         void setData(count_t, Arc<hardware::Device::Blob>);
-        bool getData(count_t, void *, size_t) const;
+        auto getData(count_t) const -> Arc<hardware::Device::Blob>;
+        void setData(count_t, void const *, size_t);
+        bool copyData(count_t, void *, size_t) const;
         void run();
         auto bench(void (*sync)()) -> std::vector<std::chrono::nanoseconds>;
         void trace(std::function<void(count_t, void const *const *, void const *const *)>);
 
@@ -18,15 +18,21 @@ namespace refactor::runtime {
               std::move(edges),
           } {}
 
+    auto Stream::setData(count_t i, size_t size) -> Arc<hardware::Device::Blob> {
+        return _graph.edges[i].blob = _device->malloc(size);
+    }
+    void Stream::setData(count_t i, Arc<hardware::Device::Blob> blob) {
+        _graph.edges[i].blob = std::move(blob);
+    }
     void Stream::setData(count_t i, void const *data, size_t size) {
         auto blob = _device->malloc(size);
         blob->copyFromHost(data, size);
         _graph.edges[i].blob = std::move(blob);
     }
-    void Stream::setData(count_t i, Arc<hardware::Device::Blob> blob) {
-        _graph.edges[i].blob = std::move(blob);
+    auto Stream::getData(count_t i) const -> Arc<hardware::Device::Blob> {
+        return _graph.edges[i].blob;
     }
-    bool Stream::getData(count_t i, void *data, size_t size) const {
+    bool Stream::copyData(count_t i, void *data, size_t size) const {
         if (!_graph.edges[i].blob) { return false; }
         _graph.edges[i].blob->copyToHost(data, size);
         return true;
 
@@ -14,6 +14,8 @@ namespace refactor::kernel {
         And,
         Or,
         Xor,
+        Mod,
+        Fmod,
     };
 
     std::string_view opName(SimpleBinaryType type);
 
@@ -20,6 +20,8 @@ namespace refactor::kernel {
             CASE(And);
             CASE(Or);
             CASE(Xor);
+            CASE(Mod);
+            CASE(Fmod);
             default:
                 UNREACHABLE();
         }
Original file line number	Diff line number	Diff line change
`@@ -8,7 +8,7 @@ namespace refactor::hardware {`
`8`	`8`	`class Nvidia final : public Device {`
`9`	`9`	`public:`
`10`	`10`	`explicit Nvidia(int32_t card);`
`11`		`- void setContext() const noexcept final;`
	`11`	`+ void setContext() const final;`
`12`	`12`	`Type type() const noexcept final {`
`13`	`13`	`return Type::Nvidia;`
`14`	`14`	`}`
Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ namespace refactor::hardware {`
`56`	`56`	`Device::Device(decltype(_card) card, decltype(_mem) mem)`
`57`	`57`	`: _card(card), _mem(std::move(mem)) {}`
`58`	`58`
`59`		`- void Device::setContext() const noexcept {}`
	`59`	`+ void Device::setContext() const {}`
`60`	`60`	`auto Device::malloc(size_t size) -> Arc<Blob> {`
`61`	`61`	`return Arc<Blob>(new Blob(this, size));`
`62`	`62`	`}`
-Original file line number
+Diff line change
         And,
         Or,
         Xor,
 +        Mod,
 +        Fmod,
     };
     std::string_view opName(SimpleBinaryType type);
Original file line number	Diff line number	Diff line change
`@@ -20,6 +20,8 @@ namespace refactor::kernel {`
`20`	`20`	`CASE(And);`
`21`	`21`	`CASE(Or);`
`22`	`22`	`CASE(Xor);`
	`23`	`+ CASE(Mod);`
	`24`	`+ CASE(Fmod);`
`23`	`25`	`default:`
`24`	`26`	`UNREACHABLE();`
`25`	`27`	`}`