diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7882514 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.x +*.out diff --git a/Makefile b/Makefile deleted file mode 100644 index df453b4..0000000 --- a/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: MIT - -CC = cc -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - - -all: allgather.x - -allgather.x: allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu - -clean: - rm -f allgather.x diff --git a/README b/README deleted file mode 100644 index eba2046..0000000 --- a/README +++ /dev/null @@ -1,9 +0,0 @@ -Before compiling do these: - -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 -export CRAY_ACCEL_TARGET=nvidia80 - -When running do these: - -module load cudatoolkit -export MPICH_GPU_SUPPORT_ENABLED=1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..526fb95 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Before compiling do these: + +### Perlmutter +```sh +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl +export CRAY_ACCEL_TARGET=nvidia80 +export MPICH_GPU_SUPPORT_ENABLED=1 +``` +### Frontier +```sh +module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" +``` + diff --git a/allgather.cu b/allgather.cu index 5953041..8c357bb 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,24 +8,31 @@ #include #include #include +#include #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -40,6 +47,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +66,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -62,8 +84,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || 
min_msg_size > max_msg_size || iterations <= 0) { @@ -86,33 +108,49 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size * num_gpus; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size * num_gpus; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); MPI_Type_commit(&mpi_type_bfloat16); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -125,13 +163,8 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather @@ -142,13 +175,13 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -156,12 +189,14 @@ int main(int argc, char *argv[]) { d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, 
(void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif } @@ -172,34 +207,39 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif } MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #if defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..111b254 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,262 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details.
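+ *
+ * Each rank contributes msg_count bfloat16 values per iteration; the reduction is done
+ * with MPI_Iallreduce over a 2-byte contiguous datatype and a user-defined sum, or with
+ * ncclAllReduce/ncclSum when built against NCCL or RCCL.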
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + 
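+ // Allocate the device buffers once at the largest message size and copy the
+ // initialized host data over; every message size in the sweep reuses them. The MPI
+ // path passes these device pointers directly to the collective, which relies on
+ // GPU-aware MPI (MPICH_GPU_SUPPORT_ENABLED=1, as set in README.md).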
CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallreduce, NCCL allreduce, or RCCL allreduce + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #if defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} diff
--git a/mpi/Makefile b/mpi/Makefile new file mode 100644 index 0000000..12ed3bf --- /dev/null +++ b/mpi/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh new file mode 100644 index 0000000..4e8c955 --- /dev/null +++ b/mpi/all-gather/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh new file mode 100644 index 0000000..bb2429f --- /dev/null +++ b/mpi/all-gather/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh new file mode 100644 index 0000000..e630b97 --- /dev/null +++ b/mpi/all-gather/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## 
calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh new file mode 100644 index 0000000..e7c707f --- /dev/null +++ b/mpi/all-gather/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh new file mode 100644 index 0000000..563f933 --- /dev/null +++ b/mpi/all-gather/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..824b380 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10 + 0: Local data size: 16 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 262144 - 16777216 + 0: Number of iterations: 10 + 0: 262144 0.003748 seconds + 0: 524288 0.005048 seconds + 0: 1048576 0.008068 seconds + 0: 2097152 0.014084 seconds + 0: 4194304 0.026981 seconds + 0: 8388608 0.051879 seconds + 0: 16777216 0.255600 seconds diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..35a9e26 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10 + 0: Local data size: 128 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: 
Message size range: 2097152 - 134217728 + 0: Number of iterations: 10 + 0: 2097152 0.002249 seconds + 0: 4194304 0.003148 seconds + 0: 8388608 0.006062 seconds + 0: 16777216 0.011871 seconds + 0: 33554432 0.023485 seconds + 0: 67108864 0.046822 seconds + 0: 134217728 0.139763 seconds diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..f758360 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10 + 0: Local data size: 64 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 262144 - 67108864 + 0: Number of iterations: 10 + 0: 262144 0.000783 seconds + 0: 524288 0.001513 seconds + 0: 1048576 0.002953 seconds + 0: 2097152 0.003404 seconds + 0: 4194304 0.006485 seconds + 0: 8388608 0.012489 seconds + 0: 16777216 0.024484 seconds + 0: 33554432 0.048460 seconds + 0: 67108864 0.185884 seconds diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..3eed822 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10 + 0: Local data size: 32 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 262144 - 33554432 + 0: Number of iterations: 10 + 0: 262144 0.001685 seconds + 0: 524288 0.003350 seconds + 0: 1048576 0.003938 seconds + 0: 2097152 0.006864 seconds + 0: 4194304 0.013037 seconds + 0: 8388608 0.025167 seconds + 0: 16777216 0.049414 seconds + 0: 33554432 0.211224 seconds diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..7856a16 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10 +0: Local data size: 256 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 2097152 - 268435456 +0: Number of iterations: 10 +0: 2097152 0.000505 seconds +0: 4194304 0.000856 seconds +0: 8388608 0.001645 seconds +0: 16777216 0.003223 seconds +0: 33554432 0.006379 seconds +0: 67108864 0.012691 seconds +0: 134217728 0.025316 seconds +0: 268435456 0.053944 seconds diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..30fd2fc --- /dev/null +++ b/mpi/all-gather/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 
1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..e68834a --- /dev/null +++ b/mpi/all-gather/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..aad7f68 --- /dev/null +++ b/mpi/all-gather/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..4897de4 --- /dev/null +++ b/mpi/all-gather/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 
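+# NCCL_NET selects the AWS Libfabric (aws-ofi-nccl) plugin and the FI_CXI_* variables
+# tune the libfabric CXI provider's rendezvous and overflow buffering on the Slingshot
+# fabric; the same environment block is repeated across all of the Perlmutter run scripts.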
+export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..3a454cf --- /dev/null +++ b/mpi/all-gather/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..295c6c0 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003072 seconds +524288 0.005233 seconds +1048576 0.008462 seconds +2097152 0.015449 seconds +4194304 0.030325 seconds +8388608 0.060131 seconds +16777216 0.190401 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..740a003 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 128 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 2097152 - 134217728 +Number of iterations: 10 +2097152 0.002476 seconds +4194304 0.003571 seconds +8388608 0.007188 seconds +16777216 0.014909 seconds +33554432 0.030427 seconds +67108864 0.061974 seconds +134217728 0.150229 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..fca9dfb --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000814 seconds +524288 0.001392 seconds +1048576 0.002735 seconds +2097152 0.003736 seconds +4194304 0.007699 seconds +8388608 
0.014426 seconds +16777216 0.030468 seconds +33554432 0.063086 seconds +67108864 0.172433 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..fd082e7 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001616 seconds +524288 0.003051 seconds +1048576 0.004224 seconds +2097152 0.008058 seconds +4194304 0.015085 seconds +8388608 0.029593 seconds +16777216 0.063129 seconds +33554432 0.185107 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..d027526 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000804 seconds +4194304 0.001514 seconds +8388608 0.003268 seconds +16777216 0.006800 seconds +33554432 0.013764 seconds +67108864 0.027832 seconds +134217728 0.055076 seconds +268435456 0.103476 seconds diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh new file mode 100644 index 0000000..5c6baf5 --- /dev/null +++ b/mpi/all-reduce/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh new file mode 100644 index 0000000..e1ad604 --- /dev/null +++ b/mpi/all-reduce/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh new file mode 100644 index 0000000..be7bdd9 --- /dev/null +++ b/mpi/all-reduce/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of 
nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a8e13d2 --- /dev/null +++ b/mpi/all-reduce/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh new file mode 100644 index 0000000..81ffbc4 --- /dev/null +++ b/mpi/all-reduce/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..56c18aa --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.240206 seconds + 0: 67108864 0.476990 seconds + 0: 134217728 1.041500 seconds + 0: 268435456 2.951969 seconds + 0: 536870912 5.990606 seconds + 0: 1073741824 12.004613 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..609afbd --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 16 + 0: Message size range: 
33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.133082 seconds + 0: 67108864 0.267616 seconds + 0: 134217728 0.634895 seconds + 0: 268435456 1.928400 seconds + 0: 536870912 3.973167 seconds + 0: 1073741824 7.913018 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..b92c437 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 1073741824 + 0: Number of iterations: 10 + 0: 8388608 0.043066 seconds + 0: 16777216 0.084259 seconds + 0: 33554432 0.167705 seconds + 0: 67108864 0.336696 seconds + 0: 134217728 0.773389 seconds + 0: 268435456 2.284815 seconds + 0: 536870912 4.693147 seconds + 0: 1073741824 9.356859 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..122c83e --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 1073741824 + 0: Number of iterations: 10 + 0: 16777216 0.101777 seconds + 0: 33554432 0.203258 seconds + 0: 67108864 0.406569 seconds + 0: 134217728 0.913391 seconds + 0: 268435456 2.633732 seconds + 0: 536870912 5.375804 seconds + 0: 1073741824 10.708706 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..a9b69c1 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10 +0: Local data size: 1024 +0: Global data size: 1024 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 1073741824 +0: Number of iterations: 10 +0: 16777216 0.049728 seconds +0: 33554432 0.099497 seconds +0: 67108864 0.202129 seconds +0: 134217728 0.500335 seconds +0: 268435456 1.560791 seconds +0: 536870912 3.265382 seconds +0: 1073741824 6.500534 seconds diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..3438061 --- /dev/null +++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + 
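+# Message sizes are passed to allreduce.x in bytes: the benchmark sweeps powers of two
+# from MIN_MSG_SIZE (32 MiB) up to MAX_MSG_SIZE (1 GiB), and the iteration count given
+# on the command line (10) is raised to 20 inside allreduce.cu for sizes of 8 MiB and above.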
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..33962b7 --- /dev/null +++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..fcad983 --- /dev/null +++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..cd5b8fa --- /dev/null +++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export 
NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..ddf1050 --- /dev/null +++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..a4485f5 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 128 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.260096 seconds +67108864 0.535750 seconds +134217728 1.089220 seconds +268435456 3.236966 seconds +536870912 6.499632 seconds +1073741824 12.975189 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..7536923 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 16 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.142862 seconds +67108864 0.282599 seconds +134217728 0.635635 seconds +268435456 1.893851 seconds +536870912 3.800098 seconds +1073741824 7.591759 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..f210edf --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 32 +Message size range: 8388608 - 1073741824 +Number of iterations: 10 +8388608 0.050115 seconds +16777216 0.093747 seconds +33554432 0.182627 seconds +67108864 0.363477 seconds +134217728 0.777837 seconds +268435456 2.348574 seconds +536870912 4.726795 seconds 
+1073741824 9.478696 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..0052be4 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 64 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.120696 seconds +33554432 0.238777 seconds +67108864 0.470335 seconds +134217728 0.963299 seconds +268435456 2.857795 seconds +536870912 5.742566 seconds +1073741824 11.495248 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..def3166 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 8 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.056844 seconds +33554432 0.108090 seconds +67108864 0.215626 seconds +134217728 0.502310 seconds +268435456 1.519484 seconds +536870912 3.075941 seconds +1073741824 6.121168 seconds diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh new file mode 100644 index 0000000..b6505f8 --- /dev/null +++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh new file mode 100644 index 0000000..eb6b2ba --- /dev/null +++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh new file mode 100644 index 0000000..4ed3437 --- /dev/null +++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export 
NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a5a9957 --- /dev/null +++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh new file mode 100644 index 0000000..9d4191c --- /dev/null +++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..af5e98a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.046207 seconds + 0: 67108864 5.031027 seconds + 0: 134217728 5.063647 seconds + 0: 268435456 5.054240 seconds + 0: 536870912 5.047598 seconds + 0: 1073741824 5.051536 seconds + 0: 2147483648 5.057082 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..fa9c67a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 
10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.091016 seconds + 0: 67108864 5.092117 seconds + 0: 134217728 5.082377 seconds + 0: 268435456 5.103443 seconds + 0: 536870912 5.102289 seconds + 0: 1073741824 5.116191 seconds + 0: 2147483648 5.115768 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..23a0ace --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 2147483648 + 0: Number of iterations: 10 + 0: 8388608 5.006776 seconds + 0: 16777216 4.981770 seconds + 0: 33554432 5.014587 seconds + 0: 67108864 4.994224 seconds + 0: 134217728 4.977063 seconds + 0: 268435456 4.980235 seconds + 0: 536870912 5.007770 seconds + 0: 1073741824 5.013561 seconds + 0: 2147483648 5.015718 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..560c383 --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,17 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 2147483648 + 0: Number of iterations: 10 + 0: 16777216 5.006610 seconds + 0: 33554432 4.998351 seconds + 0: 67108864 5.003749 seconds + 0: 134217728 5.066133 seconds + 0: 268435456 4.980950 seconds + 0: 536870912 4.982830 seconds + 0: 1073741824 5.023178 seconds + 0: 2147483648 4.988750 seconds + 0: + 0: MPICH Slingshot Network Summary: 4 network timeouts + 0: diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..493d5ee --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10 +0: Local data size: 2048 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 2147483648 +0: Number of iterations: 10 +0: 16777216 5.130130 seconds +0: 33554432 5.120491 seconds +0: 67108864 5.115654 seconds +0: 134217728 5.128319 seconds +0: 268435456 5.111989 seconds +0: 536870912 5.115996 seconds +0: 1073741824 5.127237 seconds +0: 2147483648 5.116940 seconds diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..28c8479 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export 
CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..c3b9e32 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..1681d65 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..f932006 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash 
+ +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..977ba91 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..7306758 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.410163 seconds +67108864 0.429161 seconds +134217728 0.544002 seconds +268435456 0.679339 seconds +536870912 0.981913 seconds +1073741824 1.583797 seconds +2147483648 3.678590 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..190422f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.056117 seconds +67108864 0.092396 seconds +134217728 0.169070 seconds +268435456 0.331578 seconds +536870912 0.641127 seconds +1073741824 1.270086 seconds +2147483648 3.735213 seconds diff --git 
a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..7b9f084 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.053765 seconds +16777216 0.064537 seconds +33554432 0.084740 seconds +67108864 0.133787 seconds +134217728 0.220573 seconds +268435456 0.377243 seconds +536870912 0.683938 seconds +1073741824 1.321649 seconds +2147483648 3.716915 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..675dc8f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.157345 seconds +33554432 0.205494 seconds +67108864 0.216133 seconds +134217728 0.316748 seconds +268435456 0.476547 seconds +536870912 0.776507 seconds +1073741824 1.387122 seconds +2147483648 3.688627 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..c7ca325 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.024237 seconds +33554432 0.043589 seconds +67108864 0.083173 seconds +134217728 0.153300 seconds +268435456 0.300631 seconds +536870912 0.598284 seconds +1073741824 1.190578 seconds +2147483648 3.832743 seconds diff --git a/nccl/Makefile b/nccl/Makefile new file mode 100644 index 0000000..d4423b4 --- /dev/null +++ b/nccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..82998f7 --- /dev/null +++ b/nccl/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..47b5f7c --- /dev/null +++ b/nccl/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..5459a34 --- /dev/null +++ b/nccl/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 
+#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..2ad7e3a --- /dev/null +++ b/nccl/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..55e05f8 --- /dev/null +++ b/nccl/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt new file mode 
100644 index 0000000..3ac04bb --- /dev/null +++ b/nccl/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 4096 +Number of GPUs: 128 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.002077 seconds +524288 0.002368 seconds +1048576 0.002832 seconds +2097152 0.004504 seconds +4194304 0.007551 seconds +8388608 0.014982 seconds +16777216 0.028604 seconds +33554432 0.056227 seconds diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..1afafc0 --- /dev/null +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 4096 +Number of GPUs: 16 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000643 seconds +4194304 0.000944 seconds +8388608 0.001838 seconds +16777216 0.003452 seconds +33554432 0.007084 seconds +67108864 0.013794 seconds +134217728 0.026821 seconds +268435456 0.052760 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..03e6ee9 --- /dev/null +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000528 seconds +524288 0.000604 seconds +1048576 0.000701 seconds +2097152 0.001044 seconds +4194304 0.002055 seconds +8388608 0.004240 seconds +16777216 0.006949 seconds +33554432 0.014221 seconds +67108864 0.027622 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..c0872ab --- /dev/null +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001230 seconds +524288 0.001226 seconds +1048576 0.001381 seconds +2097152 0.002098 seconds +4194304 0.003764 seconds +8388608 0.007649 seconds +16777216 0.014257 seconds +33554432 0.027941 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..8fc4917 --- /dev/null +++ b/nccl/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000325 seconds +4194304 0.000482 seconds +8388608 0.000881 seconds +16777216 0.001679 seconds +33554432 0.003206 seconds +67108864 0.006338 seconds +134217728 0.012452 seconds +268435456 0.024147 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..591cdf3 --- /dev/null +++ b/nccl/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + 
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..9232407 --- /dev/null +++ b/nccl/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..7130fa8 --- /dev/null +++ b/nccl/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..057637f --- /dev/null +++ b/nccl/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export 
NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh new file mode 100644 index 0000000..be7f5f1 --- /dev/null +++ b/nccl/all-reduce/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt new file mode 100644 index 0000000..30388e3 --- /dev/null +++ b/nccl/all-reduce/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002305 seconds +67108864 0.003309 seconds +134217728 0.005263 seconds +268435456 0.008851 seconds +536870912 0.017150 seconds +1073741824 0.037149 seconds +2147483648 0.075655 seconds diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt new file mode 100644 index 0000000..26fc256 --- /dev/null +++ b/nccl/all-reduce/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000969 seconds +67108864 0.001819 seconds +134217728 0.003596 seconds +268435456 0.006813 seconds +536870912 0.013459 seconds +1073741824 0.026683 seconds +2147483648 0.052290 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt new file mode 100644 index 0000000..90fc0f0 --- /dev/null +++ b/nccl/all-reduce/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.001510 seconds +16777216 0.001222 seconds +33554432 0.001317 seconds +67108864 0.002024 seconds +134217728 0.003762 seconds +268435456 0.007554 seconds +536870912 0.014173 seconds +1073741824 0.027756 seconds +2147483648 0.054544 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt 
b/nccl/all-reduce/benchmarks/64_gpu.txt new file mode 100644 index 0000000..ebd310e --- /dev/null +++ b/nccl/all-reduce/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001551 seconds +33554432 0.001949 seconds +67108864 0.002918 seconds +134217728 0.004132 seconds +268435456 0.007447 seconds +536870912 0.014747 seconds +1073741824 0.028172 seconds +2147483648 0.055372 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt new file mode 100644 index 0000000..e5a5769 --- /dev/null +++ b/nccl/all-reduce/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000635 seconds +33554432 0.000887 seconds +67108864 0.001639 seconds +134217728 0.003232 seconds +268435456 0.006303 seconds +536870912 0.011998 seconds +1073741824 0.024143 seconds +2147483648 0.047652 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..e37f70b --- /dev/null +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh new file mode 100644 index 0000000..0ea1f3b --- /dev/null +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..0bccbb2 --- /dev/null +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..79dd4cb --- /dev/null +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh new file mode 100644 index 0000000..6fba196 --- /dev/null +++ b/nccl/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 
1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..846d583 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002055 seconds +67108864 0.002314 seconds +134217728 0.003003 seconds +268435456 0.004164 seconds +536870912 0.007515 seconds +1073741824 0.014791 seconds +2147483648 0.027948 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..0bae9e9 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000552 seconds +67108864 0.000933 seconds +134217728 0.001772 seconds +268435456 0.003462 seconds +536870912 0.007059 seconds +1073741824 0.013749 seconds +2147483648 0.026539 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..307b0ce --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.000586 seconds +16777216 0.000629 seconds +33554432 0.000712 seconds +67108864 0.001141 seconds +134217728 0.002012 seconds +268435456 0.003715 seconds +536870912 0.007022 seconds +1073741824 0.014078 seconds +2147483648 0.027699 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..45bd514 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001059 seconds +33554432 0.001147 seconds +67108864 0.001410 seconds +134217728 0.002090 seconds +268435456 0.004116 seconds +536870912 0.007125 seconds +1073741824 0.014305 seconds +2147483648 0.028156 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..5cee721 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000363 seconds +33554432 0.000450 seconds +67108864 0.000876 seconds +134217728 0.001650 seconds +268435456 0.003169 seconds +536870912 0.006491 seconds +1073741824 0.012103 seconds +2147483648 0.024166 seconds diff --git a/rccl/Makefile b/rccl/Makefile new file mode 100644 index 0000000..aa0a7b9 --- /dev/null +++ b/rccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +CC = cc + +# frontier flags +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..b2dc99e --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,271 @@ +/* \file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <mpi.h> + +#ifdef USE_CUDA + #include <cuda_bf16.h> + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include <hip/hip_runtime.h> + #include <hip/hip_fp16.h> + #include <hip/hip_bfloat16.h> + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include <rccl.h> +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, 
&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + int portion; + + // Perform MPI_Ireduce_scatter, NCCL reduce-scatter, or RCCL reduce-scatter + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + + portion = msg_count / num_pes; + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + free(recvcounts); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #if defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +}
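The benchmark logs in this patch report only raw per-iteration times. Below is a minimal, hypothetical post-processing sketch (not part of the diff) for turning one row of those logs into bandwidth figures; it assumes the (p-1)/p bus-bandwidth convention that nccl-tests applies to all-gather and reduce-scatter, and the sample numbers are taken from nccl/reduce-scatter/benchmarks/8_gpu.txt:

```c
/* Hypothetical helper, not part of the patch above. Converts one benchmark row
 * (message size in bytes, seconds per iteration) into algorithmic bandwidth
 * and an approximate bus bandwidth using the (p-1)/p scaling convention. */
#include <stdio.h>

int main(void) {
    long long msg_size = 2147483648LL;  /* bytes, last row of 8_gpu.txt */
    int num_gpus = 8;                   /* ranks in the job */
    double time_s = 0.024166;           /* seconds per iteration from the log */

    double alg_bw = (double)msg_size / time_s / 1e9;             /* GB/s */
    double bus_bw = alg_bw * (num_gpus - 1) / (double)num_gpus;  /* GB/s */

    printf("algbw = %.2f GB/s, busbw = %.2f GB/s\n", alg_bw, bus_bw);
    return 0;
}
```

Comparing busbw computed this way across the 8- to 128-GPU logs is one way to see how close each collective stays to the per-GPU link bandwidth as the job scales.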