diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7882514 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.x +*.out diff --git a/Makefile b/Makefile deleted file mode 100644 index df453b4..0000000 --- a/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright 2024 Parallel Software and Systems Group, University of Maryland. -# See the top-level LICENSE file for details. -# -# SPDX-License-Identifier: MIT - -CC = cc -INC = -I/global/common/software/nersc9/nccl/2.19.4/include -CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL -LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl - - -all: allgather.x - -allgather.x: allgather.cu - ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o allgather.x allgather.cu - -clean: - rm -f allgather.x diff --git a/README b/README deleted file mode 100644 index eba2046..0000000 --- a/README +++ /dev/null @@ -1,9 +0,0 @@ -Before compiling do these: - -module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 -export CRAY_ACCEL_TARGET=nvidia80 - -When running do these: - -module load cudatoolkit -export MPICH_GPU_SUPPORT_ENABLED=1 diff --git a/README.md b/README.md new file mode 100644 index 0000000..526fb95 --- /dev/null +++ b/README.md @@ -0,0 +1,15 @@ +Before compiling do these: + +### Perlmutter +```sh +module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl +export CRAY_ACCEL_TARGET=nvidia80 +export MPICH_GPU_SUPPORT_ENABLED=1 +``` +### Frontier +```sh +module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05 +export MPICH_GPU_SUPPORT_ENABLED=1 +export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}" +``` + diff --git a/allgather.cu b/allgather.cu index 5953041..8c357bb 100644 --- a/allgather.cu +++ b/allgather.cu @@ -8,24 +8,31 @@ #include #include #include +#include #ifdef USE_CUDA - #include #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 #endif #ifdef USE_NCCL #include "nccl.h" -#elif defined(USE_RCCL) - #include "rccl.h" +#elif USE_RCCL + #include #endif #define NUM_WARMUP_ITERATIONS 5 #define MPI_CHECK(cmd) do { \ - int e = cmd; \ + int64_t e = cmd; \ if( e != MPI_SUCCESS ) { \ - printf("Failed: MPI error %s:%d '%d'\n", \ + printf("Failed: MPI error %s:%d '%ld'\n", \ __FILE__,__LINE__, e); \ exit(EXIT_FAILURE); \ } \ @@ -40,6 +47,16 @@ } \ } while(0) +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well #define NCCL_CHECK(cmd) do { \ ncclResult_t e = cmd; \ if (e != ncclSuccess) { \ @@ -49,9 +66,14 @@ } \ } while(0) -void initializeData(nv_bfloat16 *data, int size) { - for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) { +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif } } @@ -62,8 +84,8 @@ int main(int argc, char *argv[]) { } int num_gpus = atoi(argv[1]); - int min_msg_size = atoi(argv[2]); - int max_msg_size = atoi(argv[3]); + int64_t min_msg_size = atoi(argv[2]); + int64_t max_msg_size = atoi(argv[3]); int iterations = atoi(argv[4]); if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || 
min_msg_size > max_msg_size || iterations <= 0) { @@ -86,33 +108,49 @@ int main(int argc, char *argv[]) { } // Initialize GPU context + #if USE_CUDA cudaGetDeviceCount(&num_gpus_per_node); cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif - int local_data_size = max_msg_size; // Size of local data - int global_data_size = local_data_size * num_gpus; // Size of global data + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size * num_gpus; // Size of global data - nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size); - nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size); + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); // Initialize local data initializeData(local_data, local_data_size); // Allocate memory on GPU - nv_bfloat16 *d_local_data, *d_global_data; + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); - // Copy local data to GPU CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + #ifdef USE_MPI // create 2-byte datatype (send raw, un-interpreted bytes) MPI_Datatype mpi_type_bfloat16; MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); MPI_Type_commit(&mpi_type_bfloat16); - #elif USE_NCCL + #elif defined(USE_NCCL) || defined(USE_RCCL) ncclUniqueId nccl_comm_id; ncclComm_t nccl_comm; @@ -125,13 +163,8 @@ int main(int argc, char *argv[]) { MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - /* Create a new NCCL communicator */ + /* Create a new NCCL/RCCL communicator */ NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); - - #elif defined(USE_RCCL) - // TODO: fix later - rcclComm_t rccl_comm; - rcclCommInitRank(&comm, num_gpus, 0, rccl_root); #endif // Perform MPI_Iallgather, NCCL allgather, or RCCL allgather @@ -142,13 +175,13 @@ int main(int argc, char *argv[]) { // Print benchmark results if (my_rank == 0) { printf("Number of GPUs: %d\n", num_gpus); - printf("Message size range: %d - %d\n", min_msg_size, max_msg_size); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); printf("Number of iterations: %d\n", iterations); } fflush(NULL); - for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { - msg_count = msg_size / sizeof(nv_bfloat16); + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); // warmup iterations for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { #ifdef USE_MPI @@ -156,12 +189,14 @@ int main(int argc, char *argv[]) { d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, 
(void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif } @@ -172,34 +207,39 @@ int main(int argc, char *argv[]) { start_time = MPI_Wtime(); for (int i = 0; i < iterations; ++i) { #ifdef USE_MPI - MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, - d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); - + MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16, + d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request)); + MPI_CHECK(MPI_Wait(&request, &status)); - #elif defined(USE_NCCL) + #elif defined(USE_NCCL) || defined(USE_RCCL) NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL)); - cudaDeviceSynchronize(); - #elif defined(USE_RCCL) - // TODO: fix later - rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif } MPI_Barrier(MPI_COMM_WORLD); total_time = MPI_Wtime() - start_time; if (my_rank == 0) - printf("%d %.6f seconds\n", msg_size, (total_time / iterations)); + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); } // Cleanup free(local_data); free(global_data); + #ifdef USE_CUDA CUDA_CHECK(cudaFree(d_local_data)); CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif - #ifdef USE_NCCL + #if defined(USE_NCCL) || defined(USE_RCCL) ncclCommDestroy(nccl_comm); - #elif defined(USE_RCCL) - rcclCommDestroy(rccl_comm); #endif MPI_Finalize(); diff --git a/allreduce.cu b/allreduce.cu new file mode 100644 index 0000000..111b254 --- /dev/null +++ b/allreduce.cu @@ -0,0 +1,262 @@ +/* \file allreduce.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details.
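+ *
+ * Each rank contributes msg_count bfloat16 values per iteration; the reduction is done
+ * with MPI_Iallreduce over a 2-byte contiguous datatype and a user-defined sum, or with
+ * ncclAllReduce/ncclSum when built against NCCL or RCCL.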
+ * + * SPDX-License-Identifier: MIT + */ + +#include +#include +#include +#include + +#ifdef USE_CUDA + #include + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include + #include + #include + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + 
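+ // Allocate the device buffers once at the largest message size and copy the
+ // initialized host data over; every message size in the sweep reuses them. The MPI
+ // path passes these device pointers directly to the collective, which relies on
+ // GPU-aware MPI (MPICH_GPU_SUPPORT_ENABLED=1, as set in README.md).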
CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. */ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // Perform MPI_Iallreduce, NCCL allreduce, or RCCL allreduce + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Iallreduce(d_local_data, d_global_data, msg_count, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclAllReduce((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #if defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +} diff
--git a/mpi/Makefile b/mpi/Makefile new file mode 100644 index 0000000..12ed3bf --- /dev/null +++ b/mpi/Makefile @@ -0,0 +1,30 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. +# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_MPI +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +# frontier flags +# INC = -I${ROCM_PATH}/include +# CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_MPI +# LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/mpi/all-gather/frontier/128_gcd_run.sh b/mpi/all-gather/frontier/128_gcd_run.sh new file mode 100644 index 0000000..4e8c955 --- /dev/null +++ b/mpi/all-gather/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/16_gcd_run.sh b/mpi/all-gather/frontier/16_gcd_run.sh new file mode 100644 index 0000000..bb2429f --- /dev/null +++ b/mpi/all-gather/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/32_gcd_run.sh b/mpi/all-gather/frontier/32_gcd_run.sh new file mode 100644 index 0000000..e630b97 --- /dev/null +++ b/mpi/all-gather/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## 
calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/64_gcd_run.sh b/mpi/all-gather/frontier/64_gcd_run.sh new file mode 100644 index 0000000..e7c707f --- /dev/null +++ b/mpi/all-gather/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 15:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/8_gcd_run.sh b/mpi/all-gather/frontier/8_gcd_run.sh new file mode 100644 index 0000000..563f933 --- /dev/null +++ b/mpi/all-gather/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 10:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/frontier/benchmarks/128_gcd.txt b/mpi/all-gather/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..824b380 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 128 262144 16777216 10 + 0: Local data size: 16 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 262144 - 16777216 + 0: Number of iterations: 10 + 0: 262144 0.003748 seconds + 0: 524288 0.005048 seconds + 0: 1048576 0.008068 seconds + 0: 2097152 0.014084 seconds + 0: 4194304 0.026981 seconds + 0: 8388608 0.051879 seconds + 0: 16777216 0.255600 seconds diff --git a/mpi/all-gather/frontier/benchmarks/16_gcd.txt b/mpi/all-gather/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..35a9e26 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 16 2097152 134217728 10 + 0: Local data size: 128 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: 
Message size range: 2097152 - 134217728 + 0: Number of iterations: 10 + 0: 2097152 0.002249 seconds + 0: 4194304 0.003148 seconds + 0: 8388608 0.006062 seconds + 0: 16777216 0.011871 seconds + 0: 33554432 0.023485 seconds + 0: 67108864 0.046822 seconds + 0: 134217728 0.139763 seconds diff --git a/mpi/all-gather/frontier/benchmarks/32_gcd.txt b/mpi/all-gather/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..f758360 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 32 262144 67108864 10 + 0: Local data size: 64 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 262144 - 67108864 + 0: Number of iterations: 10 + 0: 262144 0.000783 seconds + 0: 524288 0.001513 seconds + 0: 1048576 0.002953 seconds + 0: 2097152 0.003404 seconds + 0: 4194304 0.006485 seconds + 0: 8388608 0.012489 seconds + 0: 16777216 0.024484 seconds + 0: 33554432 0.048460 seconds + 0: 67108864 0.185884 seconds diff --git a/mpi/all-gather/frontier/benchmarks/64_gcd.txt b/mpi/all-gather/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..3eed822 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 64 262144 33554432 10 + 0: Local data size: 32 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 262144 - 33554432 + 0: Number of iterations: 10 + 0: 262144 0.001685 seconds + 0: 524288 0.003350 seconds + 0: 1048576 0.003938 seconds + 0: 2097152 0.006864 seconds + 0: 4194304 0.013037 seconds + 0: 8388608 0.025167 seconds + 0: 16777216 0.049414 seconds + 0: 33554432 0.211224 seconds diff --git a/mpi/all-gather/frontier/benchmarks/8_gcd.txt b/mpi/all-gather/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..7856a16 --- /dev/null +++ b/mpi/all-gather/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-gather/allgather.x 8 2097152 268435456 10 +0: Local data size: 256 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 2097152 - 268435456 +0: Number of iterations: 10 +0: 2097152 0.000505 seconds +0: 4194304 0.000856 seconds +0: 8388608 0.001645 seconds +0: 16777216 0.003223 seconds +0: 33554432 0.006379 seconds +0: 67108864 0.012691 seconds +0: 134217728 0.025316 seconds +0: 268435456 0.053944 seconds diff --git a/mpi/all-gather/perlmutter/128_gpu_run.sh b/mpi/all-gather/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..30fd2fc --- /dev/null +++ b/mpi/all-gather/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 
1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 16)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/16_gpu_run.sh b/mpi/all-gather/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..e68834a --- /dev/null +++ b/mpi/all-gather/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 128)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/32_gpu_run.sh b/mpi/all-gather/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..aad7f68 --- /dev/null +++ b/mpi/all-gather/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/64_gpu_run.sh b/mpi/all-gather/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..4897de4 --- /dev/null +++ b/mpi/all-gather/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 
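+# NCCL_NET selects the AWS Libfabric (aws-ofi-nccl) plugin and the FI_CXI_* variables
+# tune the libfabric CXI provider's rendezvous and overflow buffering on the Slingshot
+# fabric; the same environment block is repeated across all of the Perlmutter run scripts.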
+export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/8_gpu_run.sh b/mpi/all-gather/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..3a454cf --- /dev/null +++ b/mpi/all-gather/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..295c6c0 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 16 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 262144 - 16777216 +Number of iterations: 10 +262144 0.003072 seconds +524288 0.005233 seconds +1048576 0.008462 seconds +2097152 0.015449 seconds +4194304 0.030325 seconds +8388608 0.060131 seconds +16777216 0.190401 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..740a003 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 128 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 2097152 - 134217728 +Number of iterations: 10 +2097152 0.002476 seconds +4194304 0.003571 seconds +8388608 0.007188 seconds +16777216 0.014909 seconds +33554432 0.030427 seconds +67108864 0.061974 seconds +134217728 0.150229 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..fca9dfb --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000814 seconds +524288 0.001392 seconds +1048576 0.002735 seconds +2097152 0.003736 seconds +4194304 0.007699 seconds +8388608 
0.014426 seconds +16777216 0.030468 seconds +33554432 0.063086 seconds +67108864 0.172433 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..fd082e7 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001616 seconds +524288 0.003051 seconds +1048576 0.004224 seconds +2097152 0.008058 seconds +4194304 0.015085 seconds +8388608 0.029593 seconds +16777216 0.063129 seconds +33554432 0.185107 seconds diff --git a/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..d027526 --- /dev/null +++ b/mpi/all-gather/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000804 seconds +4194304 0.001514 seconds +8388608 0.003268 seconds +16777216 0.006800 seconds +33554432 0.013764 seconds +67108864 0.027832 seconds +134217728 0.055076 seconds +268435456 0.103476 seconds diff --git a/mpi/all-reduce/frontier/128_gcd_run.sh b/mpi/all-reduce/frontier/128_gcd_run.sh new file mode 100644 index 0000000..5c6baf5 --- /dev/null +++ b/mpi/all-reduce/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/16_gcd_run.sh b/mpi/all-reduce/frontier/16_gcd_run.sh new file mode 100644 index 0000000..e1ad604 --- /dev/null +++ b/mpi/all-reduce/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/32_gcd_run.sh b/mpi/all-reduce/frontier/32_gcd_run.sh new file mode 100644 index 0000000..be7bdd9 --- /dev/null +++ b/mpi/all-reduce/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of 
nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/64_gcd_run.sh b/mpi/all-reduce/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a8e13d2 --- /dev/null +++ b/mpi/all-reduce/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/8_gcd_run.sh b/mpi/all-reduce/frontier/8_gcd_run.sh new file mode 100644 index 0000000..81ffbc4 --- /dev/null +++ b/mpi/all-reduce/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/frontier/benchmarks/128_gcd.txt b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..56c18aa --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 128 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.240206 seconds + 0: 67108864 0.476990 seconds + 0: 134217728 1.041500 seconds + 0: 268435456 2.951969 seconds + 0: 536870912 5.990606 seconds + 0: 1073741824 12.004613 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/16_gcd.txt b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..609afbd --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,12 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 16 33554432 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 16 + 0: Message size range: 
33554432 - 1073741824 + 0: Number of iterations: 10 + 0: 33554432 0.133082 seconds + 0: 67108864 0.267616 seconds + 0: 134217728 0.634895 seconds + 0: 268435456 1.928400 seconds + 0: 536870912 3.973167 seconds + 0: 1073741824 7.913018 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/32_gcd.txt b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..b92c437 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 32 8388608 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 1073741824 + 0: Number of iterations: 10 + 0: 8388608 0.043066 seconds + 0: 16777216 0.084259 seconds + 0: 33554432 0.167705 seconds + 0: 67108864 0.336696 seconds + 0: 134217728 0.773389 seconds + 0: 268435456 2.284815 seconds + 0: 536870912 4.693147 seconds + 0: 1073741824 9.356859 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/64_gcd.txt b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..122c83e --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 64 16777216 1073741824 10 + 0: Local data size: 1024 + 0: Global data size: 1024 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 1073741824 + 0: Number of iterations: 10 + 0: 16777216 0.101777 seconds + 0: 33554432 0.203258 seconds + 0: 67108864 0.406569 seconds + 0: 134217728 0.913391 seconds + 0: 268435456 2.633732 seconds + 0: 536870912 5.375804 seconds + 0: 1073741824 10.708706 seconds diff --git a/mpi/all-reduce/frontier/benchmarks/8_gcd.txt b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..a9b69c1 --- /dev/null +++ b/mpi/all-reduce/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/all-reduce/allreduce.x 8 16777216 1073741824 10 +0: Local data size: 1024 +0: Global data size: 1024 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 1073741824 +0: Number of iterations: 10 +0: 16777216 0.049728 seconds +0: 33554432 0.099497 seconds +0: 67108864 0.202129 seconds +0: 134217728 0.500335 seconds +0: 268435456 1.560791 seconds +0: 536870912 3.265382 seconds +0: 1073741824 6.500534 seconds diff --git a/mpi/all-reduce/perlmutter/128_gpu_run.sh b/mpi/all-reduce/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..3438061 --- /dev/null +++ b/mpi/all-reduce/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + 
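+# Message sizes are passed to allreduce.x in bytes: the benchmark sweeps powers of two
+# from MIN_MSG_SIZE (32 MiB) up to MAX_MSG_SIZE (1 GiB), and the iteration count given
+# on the command line (10) is raised to 20 inside allreduce.cu for sizes of 8 MiB and above.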
+SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/16_gpu_run.sh b/mpi/all-reduce/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..33962b7 --- /dev/null +++ b/mpi/all-reduce/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/32_gpu_run.sh b/mpi/all-reduce/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..fcad983 --- /dev/null +++ b/mpi/all-reduce/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/64_gpu_run.sh b/mpi/all-reduce/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..cd5b8fa --- /dev/null +++ b/mpi/all-reduce/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export 
NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/8_gpu_run.sh b/mpi/all-reduce/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..ddf1050 --- /dev/null +++ b/mpi/all-reduce/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 15:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 1024)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..a4485f5 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 128 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.260096 seconds +67108864 0.535750 seconds +134217728 1.089220 seconds +268435456 3.236966 seconds +536870912 6.499632 seconds +1073741824 12.975189 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..7536923 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,11 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 16 +Message size range: 33554432 - 1073741824 +Number of iterations: 10 +33554432 0.142862 seconds +67108864 0.282599 seconds +134217728 0.635635 seconds +268435456 1.893851 seconds +536870912 3.800098 seconds +1073741824 7.591759 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..f210edf --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 32 +Message size range: 8388608 - 1073741824 +Number of iterations: 10 +8388608 0.050115 seconds +16777216 0.093747 seconds +33554432 0.182627 seconds +67108864 0.363477 seconds +134217728 0.777837 seconds +268435456 2.348574 seconds +536870912 4.726795 seconds 
+1073741824 9.478696 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..0052be4 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 64 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.120696 seconds +33554432 0.238777 seconds +67108864 0.470335 seconds +134217728 0.963299 seconds +268435456 2.857795 seconds +536870912 5.742566 seconds +1073741824 11.495248 seconds diff --git a/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..def3166 --- /dev/null +++ b/mpi/all-reduce/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 1024 +Global data size: 1024 +Number of GPUs: 8 +Message size range: 16777216 - 1073741824 +Number of iterations: 10 +16777216 0.056844 seconds +33554432 0.108090 seconds +67108864 0.215626 seconds +134217728 0.502310 seconds +268435456 1.519484 seconds +536870912 3.075941 seconds +1073741824 6.121168 seconds diff --git a/mpi/reduce-scatter/frontier/128_gcd_run.sh b/mpi/reduce-scatter/frontier/128_gcd_run.sh new file mode 100644 index 0000000..b6505f8 --- /dev/null +++ b/mpi/reduce-scatter/frontier/128_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 16 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/16_gcd_run.sh b/mpi/reduce-scatter/frontier/16_gcd_run.sh new file mode 100644 index 0000000..eb6b2ba --- /dev/null +++ b/mpi/reduce-scatter/frontier/16_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/32_gcd_run.sh b/mpi/reduce-scatter/frontier/32_gcd_run.sh new file mode 100644 index 0000000..4ed3437 --- /dev/null +++ b/mpi/reduce-scatter/frontier/32_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 4 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export 
NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/64_gcd_run.sh b/mpi/reduce-scatter/frontier/64_gcd_run.sh new file mode 100644 index 0000000..a5a9957 --- /dev/null +++ b/mpi/reduce-scatter/frontier/64_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 8 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/8_gcd_run.sh b/mpi/reduce-scatter/frontier/8_gcd_run.sh new file mode 100644 index 0000000..9d4191c --- /dev/null +++ b/mpi/reduce-scatter/frontier/8_gcd_run.sh @@ -0,0 +1,21 @@ +#!/bin/bash +#SBATCH -p batch +#SBATCH -A CSC569 +#SBATCH -t 20:00 +#SBATCH -N 1 +#SBATCH --output=/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt +#SBATCH -C nvme + +## calculating the number of nodes and GPUs +export NNODES=$SLURM_JOB_NUM_NODES +export GPUS_PER_NODE=8 ## change as per your machine +export GPUS=$(( NNODES * GPUS_PER_NODE )) + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) +SCRIPT="/ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -l -N $NNODES -n $GPUS -c7 --ntasks-per-node=8 --gpus-per-node=8 $SCRIPT" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt new file mode 100644 index 0000000..af5e98a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/128_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 16 -n 128 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 128 33554432 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 128 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.046207 seconds + 0: 67108864 5.031027 seconds + 0: 134217728 5.063647 seconds + 0: 268435456 5.054240 seconds + 0: 536870912 5.047598 seconds + 0: 1073741824 5.051536 seconds + 0: 2147483648 5.057082 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt new file mode 100644 index 0000000..fa9c67a --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/16_gcd.txt @@ -0,0 +1,13 @@ +srun -l -N 2 -n 16 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 16 33554432 2147483648 
10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 16 + 0: Message size range: 33554432 - 2147483648 + 0: Number of iterations: 10 + 0: 33554432 5.091016 seconds + 0: 67108864 5.092117 seconds + 0: 134217728 5.082377 seconds + 0: 268435456 5.103443 seconds + 0: 536870912 5.102289 seconds + 0: 1073741824 5.116191 seconds + 0: 2147483648 5.115768 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt new file mode 100644 index 0000000..23a0ace --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/32_gcd.txt @@ -0,0 +1,15 @@ +srun -l -N 4 -n 32 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 32 8388608 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 32 + 0: Message size range: 8388608 - 2147483648 + 0: Number of iterations: 10 + 0: 8388608 5.006776 seconds + 0: 16777216 4.981770 seconds + 0: 33554432 5.014587 seconds + 0: 67108864 4.994224 seconds + 0: 134217728 4.977063 seconds + 0: 268435456 4.980235 seconds + 0: 536870912 5.007770 seconds + 0: 1073741824 5.013561 seconds + 0: 2147483648 5.015718 seconds diff --git a/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt new file mode 100644 index 0000000..560c383 --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/64_gcd.txt @@ -0,0 +1,17 @@ +srun -l -N 8 -n 64 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 64 16777216 2147483648 10 + 0: Local data size: 2048 + 0: Global data size: 2048 + 0: Number of GPUs: 64 + 0: Message size range: 16777216 - 2147483648 + 0: Number of iterations: 10 + 0: 16777216 5.006610 seconds + 0: 33554432 4.998351 seconds + 0: 67108864 5.003749 seconds + 0: 134217728 5.066133 seconds + 0: 268435456 4.980950 seconds + 0: 536870912 4.982830 seconds + 0: 1073741824 5.023178 seconds + 0: 2147483648 4.988750 seconds + 0: + 0: MPICH Slingshot Network Summary: 4 network timeouts + 0: diff --git a/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt new file mode 100644 index 0000000..493d5ee --- /dev/null +++ b/mpi/reduce-scatter/frontier/benchmarks/8_gcd.txt @@ -0,0 +1,14 @@ +srun -l -N 1 -n 8 -c7 --ntasks-per-node=8 --gpus-per-node=8 /ccs/home/adityatomar/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x 8 16777216 2147483648 10 +0: Local data size: 2048 +0: Global data size: 2048 +0: Number of GPUs: 8 +0: Message size range: 16777216 - 2147483648 +0: Number of iterations: 10 +0: 16777216 5.130130 seconds +0: 33554432 5.120491 seconds +0: 67108864 5.115654 seconds +0: 134217728 5.128319 seconds +0: 268435456 5.111989 seconds +0: 536870912 5.115996 seconds +0: 1073741824 5.127237 seconds +0: 2147483648 5.116940 seconds diff --git a/mpi/reduce-scatter/perlmutter/128_gpu_run.sh b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh new file mode 100644 index 0000000..28c8479 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export 
CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/16_gpu_run.sh b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh new file mode 100644 index 0000000..c3b9e32 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/32_gpu_run.sh b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh new file mode 100644 index 0000000..1681d65 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 30:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/64_gpu_run.sh b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh new file mode 100644 index 0000000..f932006 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash 
+ +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/8_gpu_run.sh b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh new file mode 100644 index 0000000..977ba91 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 20:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/mpi/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..7306758 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.410163 seconds +67108864 0.429161 seconds +134217728 0.544002 seconds +268435456 0.679339 seconds +536870912 0.981913 seconds +1073741824 1.583797 seconds +2147483648 3.678590 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..190422f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.056117 seconds +67108864 0.092396 seconds +134217728 0.169070 seconds +268435456 0.331578 seconds +536870912 0.641127 seconds +1073741824 1.270086 seconds +2147483648 3.735213 seconds diff --git 
a/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..7b9f084 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.053765 seconds +16777216 0.064537 seconds +33554432 0.084740 seconds +67108864 0.133787 seconds +134217728 0.220573 seconds +268435456 0.377243 seconds +536870912 0.683938 seconds +1073741824 1.321649 seconds +2147483648 3.716915 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..675dc8f --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.157345 seconds +33554432 0.205494 seconds +67108864 0.216133 seconds +134217728 0.316748 seconds +268435456 0.476547 seconds +536870912 0.776507 seconds +1073741824 1.387122 seconds +2147483648 3.688627 seconds diff --git a/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..c7ca325 --- /dev/null +++ b/mpi/reduce-scatter/perlmutter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.024237 seconds +33554432 0.043589 seconds +67108864 0.083173 seconds +134217728 0.153300 seconds +268435456 0.300631 seconds +536870912 0.598284 seconds +1073741824 1.190578 seconds +2147483648 3.832743 seconds diff --git a/nccl/Makefile b/nccl/Makefile new file mode 100644 index 0000000..d4423b4 --- /dev/null +++ b/nccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +CC = cc + +# perlmutter flags +INC = -I/global/common/software/nersc9/nccl/2.19.4/include +CFLAGS = -std=c++11 -O2 -target-accel=nvidia80 --cuda-gpu-arch=sm_80 -DUSE_CUDA -DUSE_NCCL +LDFLAGS = -L/global/common/software/nersc9/nccl/2.19.4/lib -lnccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/nccl/all-gather/128_gpu_run.sh b/nccl/all-gather/128_gpu_run.sh new file mode 100644 index 0000000..82998f7 --- /dev/null +++ b/nccl/all-gather/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/16_gpu_run.sh b/nccl/all-gather/16_gpu_run.sh new file mode 100644 index 0000000..47b5f7c --- /dev/null +++ b/nccl/all-gather/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/32_gpu_run.sh b/nccl/all-gather/32_gpu_run.sh new file mode 100644 index 0000000..5459a34 --- /dev/null +++ b/nccl/all-gather/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 
+#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 64)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/64_gpu_run.sh b/nccl/all-gather/64_gpu_run.sh new file mode 100644 index 0000000..2ad7e3a --- /dev/null +++ b/nccl/all-gather/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 / 4)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 32)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/8_gpu_run.sh b/nccl/all-gather/8_gpu_run.sh new file mode 100644 index 0000000..55e05f8 --- /dev/null +++ b/nccl/all-gather/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 2)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 256)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-gather/allgather.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-gather/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-gather/benchmarks/128_gpu.txt b/nccl/all-gather/benchmarks/128_gpu.txt new file mode 
100644 index 0000000..3ac04bb --- /dev/null +++ b/nccl/all-gather/benchmarks/128_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 4096 +Number of GPUs: 128 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.002077 seconds +524288 0.002368 seconds +1048576 0.002832 seconds +2097152 0.004504 seconds +4194304 0.007551 seconds +8388608 0.014982 seconds +16777216 0.028604 seconds +33554432 0.056227 seconds diff --git a/nccl/all-gather/benchmarks/16_gpu.txt b/nccl/all-gather/benchmarks/16_gpu.txt new file mode 100644 index 0000000..1afafc0 --- /dev/null +++ b/nccl/all-gather/benchmarks/16_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 4096 +Number of GPUs: 16 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000643 seconds +4194304 0.000944 seconds +8388608 0.001838 seconds +16777216 0.003452 seconds +33554432 0.007084 seconds +67108864 0.013794 seconds +134217728 0.026821 seconds +268435456 0.052760 seconds diff --git a/nccl/all-gather/benchmarks/32_gpu.txt b/nccl/all-gather/benchmarks/32_gpu.txt new file mode 100644 index 0000000..03e6ee9 --- /dev/null +++ b/nccl/all-gather/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 64 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 262144 - 67108864 +Number of iterations: 10 +262144 0.000528 seconds +524288 0.000604 seconds +1048576 0.000701 seconds +2097152 0.001044 seconds +4194304 0.002055 seconds +8388608 0.004240 seconds +16777216 0.006949 seconds +33554432 0.014221 seconds +67108864 0.027622 seconds diff --git a/nccl/all-gather/benchmarks/64_gpu.txt b/nccl/all-gather/benchmarks/64_gpu.txt new file mode 100644 index 0000000..c0872ab --- /dev/null +++ b/nccl/all-gather/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 32 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 262144 - 33554432 +Number of iterations: 10 +262144 0.001230 seconds +524288 0.001226 seconds +1048576 0.001381 seconds +2097152 0.002098 seconds +4194304 0.003764 seconds +8388608 0.007649 seconds +16777216 0.014257 seconds +33554432 0.027941 seconds diff --git a/nccl/all-gather/benchmarks/8_gpu.txt b/nccl/all-gather/benchmarks/8_gpu.txt new file mode 100644 index 0000000..8fc4917 --- /dev/null +++ b/nccl/all-gather/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 256 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 2097152 - 268435456 +Number of iterations: 10 +2097152 0.000325 seconds +4194304 0.000482 seconds +8388608 0.000881 seconds +16777216 0.001679 seconds +33554432 0.003206 seconds +67108864 0.006338 seconds +134217728 0.012452 seconds +268435456 0.024147 seconds diff --git a/nccl/all-reduce/128_gpu_run.sh b/nccl/all-reduce/128_gpu_run.sh new file mode 100644 index 0000000..591cdf3 --- /dev/null +++ b/nccl/all-reduce/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + 
+MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/16_gpu_run.sh b/nccl/all-reduce/16_gpu_run.sh new file mode 100644 index 0000000..9232407 --- /dev/null +++ b/nccl/all-reduce/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/32_gpu_run.sh b/nccl/all-reduce/32_gpu_run.sh new file mode 100644 index 0000000..7130fa8 --- /dev/null +++ b/nccl/all-reduce/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/64_gpu_run.sh b/nccl/all-reduce/64_gpu_run.sh new file mode 100644 index 0000000..057637f --- /dev/null +++ b/nccl/all-reduce/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export 
NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/8_gpu_run.sh b/nccl/all-reduce/8_gpu_run.sh new file mode 100644 index 0000000..be7f5f1 --- /dev/null +++ b/nccl/all-reduce/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/all-reduce/allreduce.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/all-reduce/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/all-reduce/benchmarks/128_gpu.txt b/nccl/all-reduce/benchmarks/128_gpu.txt new file mode 100644 index 0000000..30388e3 --- /dev/null +++ b/nccl/all-reduce/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002305 seconds +67108864 0.003309 seconds +134217728 0.005263 seconds +268435456 0.008851 seconds +536870912 0.017150 seconds +1073741824 0.037149 seconds +2147483648 0.075655 seconds diff --git a/nccl/all-reduce/benchmarks/16_gpu.txt b/nccl/all-reduce/benchmarks/16_gpu.txt new file mode 100644 index 0000000..26fc256 --- /dev/null +++ b/nccl/all-reduce/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000969 seconds +67108864 0.001819 seconds +134217728 0.003596 seconds +268435456 0.006813 seconds +536870912 0.013459 seconds +1073741824 0.026683 seconds +2147483648 0.052290 seconds diff --git a/nccl/all-reduce/benchmarks/32_gpu.txt b/nccl/all-reduce/benchmarks/32_gpu.txt new file mode 100644 index 0000000..90fc0f0 --- /dev/null +++ b/nccl/all-reduce/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.001510 seconds +16777216 0.001222 seconds +33554432 0.001317 seconds +67108864 0.002024 seconds +134217728 0.003762 seconds +268435456 0.007554 seconds +536870912 0.014173 seconds +1073741824 0.027756 seconds +2147483648 0.054544 seconds diff --git a/nccl/all-reduce/benchmarks/64_gpu.txt 
b/nccl/all-reduce/benchmarks/64_gpu.txt new file mode 100644 index 0000000..ebd310e --- /dev/null +++ b/nccl/all-reduce/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001551 seconds +33554432 0.001949 seconds +67108864 0.002918 seconds +134217728 0.004132 seconds +268435456 0.007447 seconds +536870912 0.014747 seconds +1073741824 0.028172 seconds +2147483648 0.055372 seconds diff --git a/nccl/all-reduce/benchmarks/8_gpu.txt b/nccl/all-reduce/benchmarks/8_gpu.txt new file mode 100644 index 0000000..e5a5769 --- /dev/null +++ b/nccl/all-reduce/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000635 seconds +33554432 0.000887 seconds +67108864 0.001639 seconds +134217728 0.003232 seconds +268435456 0.006303 seconds +536870912 0.011998 seconds +1073741824 0.024143 seconds +2147483648 0.047652 seconds diff --git a/nccl/reduce-scatter/128_gpu_run.sh b/nccl/reduce-scatter/128_gpu_run.sh new file mode 100644 index 0000000..e37f70b --- /dev/null +++ b/nccl/reduce-scatter/128_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 32 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/128_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/16_gpu_run.sh b/nccl/reduce-scatter/16_gpu_run.sh new file mode 100644 index 0000000..0ea1f3b --- /dev/null +++ b/nccl/reduce-scatter/16_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 4 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 32)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& 
$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/16_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/32_gpu_run.sh b/nccl/reduce-scatter/32_gpu_run.sh new file mode 100644 index 0000000..0bccbb2 --- /dev/null +++ b/nccl/reduce-scatter/32_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 8 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 8)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/32_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/64_gpu_run.sh b/nccl/reduce-scatter/64_gpu_run.sh new file mode 100644 index 0000000..79dd4cb --- /dev/null +++ b/nccl/reduce-scatter/64_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 16 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/64_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/8_gpu_run.sh b/nccl/reduce-scatter/8_gpu_run.sh new file mode 100644 index 0000000..6fba196 --- /dev/null +++ b/nccl/reduce-scatter/8_gpu_run.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +#SBATCH -A m4641_g +#SBATCH -C gpu +#SBATCH -q regular +#SBATCH -t 10:00 +#SBATCH -N 2 +#SBATCH --ntasks-per-node=4 +#SBATCH -c 32 +#SBATCH --gpus-per-task=1 +#SBATCH --gpu-bind=none + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +NNODES=$SLURM_JOB_NUM_NODES +GPUS=$(( NNODES * 4 )) +export WORLD_SIZE=$GPUS +export MASTER_ADDR=$(hostname) +export MASTER_PORT=29500 +export CUDA_VISIBLE_DEVICES=3,2,1,0 +export NCCL_NET_GDR_LEVEL=PHB +export NCCL_CROSS_NIC=1 +export NCCL_SOCKET_IFNAME=hsn +export NCCL_NET="AWS Libfabric" +export FI_CXI_RDZV_THRESHOLD=0 +export FI_CXI_RDZV_GET_MIN=0 +export FI_CXI_OFLOW_BUF_SIZE=1073741824 +export FI_CXI_OFLOW_BUF_COUNT=1 + +MIN_MSG_SIZE=$((1048576 * 16)) # 1048576 = 
1024 * 1024 +MAX_MSG_SIZE=$((1048576 * 2048)) + +SCRIPT="$SCRATCH/gpu-benchmarks/nccl/reduce-scatter/reduce_scatter.x $GPUS $MIN_MSG_SIZE $MAX_MSG_SIZE 10" +run_cmd="srun -C gpu -N $NNODES -n $GPUS -c 32 --cpu-bind=cores --gpus-per-node=4 $SCRIPT >& $SCRATCH/gpu-benchmarks/nccl/reduce-scatter/benchmarks/8_gpu.txt" + +echo $run_cmd +eval $run_cmd +set +x diff --git a/nccl/reduce-scatter/benchmarks/128_gpu.txt b/nccl/reduce-scatter/benchmarks/128_gpu.txt new file mode 100644 index 0000000..846d583 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/128_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 128 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.002055 seconds +67108864 0.002314 seconds +134217728 0.003003 seconds +268435456 0.004164 seconds +536870912 0.007515 seconds +1073741824 0.014791 seconds +2147483648 0.027948 seconds diff --git a/nccl/reduce-scatter/benchmarks/16_gpu.txt b/nccl/reduce-scatter/benchmarks/16_gpu.txt new file mode 100644 index 0000000..0bae9e9 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/16_gpu.txt @@ -0,0 +1,12 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 16 +Message size range: 33554432 - 2147483648 +Number of iterations: 10 +33554432 0.000552 seconds +67108864 0.000933 seconds +134217728 0.001772 seconds +268435456 0.003462 seconds +536870912 0.007059 seconds +1073741824 0.013749 seconds +2147483648 0.026539 seconds diff --git a/nccl/reduce-scatter/benchmarks/32_gpu.txt b/nccl/reduce-scatter/benchmarks/32_gpu.txt new file mode 100644 index 0000000..307b0ce --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/32_gpu.txt @@ -0,0 +1,14 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 32 +Message size range: 8388608 - 2147483648 +Number of iterations: 10 +8388608 0.000586 seconds +16777216 0.000629 seconds +33554432 0.000712 seconds +67108864 0.001141 seconds +134217728 0.002012 seconds +268435456 0.003715 seconds +536870912 0.007022 seconds +1073741824 0.014078 seconds +2147483648 0.027699 seconds diff --git a/nccl/reduce-scatter/benchmarks/64_gpu.txt b/nccl/reduce-scatter/benchmarks/64_gpu.txt new file mode 100644 index 0000000..45bd514 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/64_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 64 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.001059 seconds +33554432 0.001147 seconds +67108864 0.001410 seconds +134217728 0.002090 seconds +268435456 0.004116 seconds +536870912 0.007125 seconds +1073741824 0.014305 seconds +2147483648 0.028156 seconds diff --git a/nccl/reduce-scatter/benchmarks/8_gpu.txt b/nccl/reduce-scatter/benchmarks/8_gpu.txt new file mode 100644 index 0000000..5cee721 --- /dev/null +++ b/nccl/reduce-scatter/benchmarks/8_gpu.txt @@ -0,0 +1,13 @@ +Local data size: 2048 +Global data size: 2048 +Number of GPUs: 8 +Message size range: 16777216 - 2147483648 +Number of iterations: 10 +16777216 0.000363 seconds +33554432 0.000450 seconds +67108864 0.000876 seconds +134217728 0.001650 seconds +268435456 0.003169 seconds +536870912 0.006491 seconds +1073741824 0.012103 seconds +2147483648 0.024166 seconds diff --git a/rccl/Makefile b/rccl/Makefile new file mode 100644 index 0000000..aa0a7b9 --- /dev/null +++ b/rccl/Makefile @@ -0,0 +1,25 @@ +# Copyright 2024 Parallel Software and Systems Group, University of Maryland. +# See the top-level LICENSE file for details. 
+# +# SPDX-License-Identifier: MIT + +CC = cc + +# frontier flags +INC = -I${ROCM_PATH}/include +CFLAGS = -std=c++11 -O2 -D__HIP_ROCclr__ -D__HIP_ARCH_GFX90A__=1 --rocm-path=${ROCM_PATH} --offload-arch=gfx90a -x hip -DUSE_ROCM -DUSE_RCCL +LDFLAGS = -L${ROCM_PATH}/lib -lamdhip64 -lrccl + +all: allgather.x allreduce.x reduce_scatter.x + +allgather.x: ../allgather.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-gather/allgather.x ../allgather.cu + +allreduce.x: ../allreduce.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o all-reduce/allreduce.x ../allreduce.cu + +reduce_scatter.x: ../reduce_scatter.cu + ${CC} ${CFLAGS} ${INC} ${LDFLAGS} -o reduce-scatter/reduce_scatter.x ../reduce_scatter.cu + +clean: + rm -f all-gather/allgather.x all-reduce/allreduce.x reduce-scatter/reduce_scatter.x diff --git a/reduce_scatter.cu b/reduce_scatter.cu new file mode 100644 index 0000000..b2dc99e --- /dev/null +++ b/reduce_scatter.cu @@ -0,0 +1,271 @@ +/* \file reduce_scatter.cu + * Copyright 2024 Parallel Software and Systems Group, University of Maryland. + * See the top-level LICENSE file for details. + * + * SPDX-License-Identifier: MIT + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <mpi.h> + +#ifdef USE_CUDA + #include <cuda_bf16.h> + #define bfloat16 nv_bfloat16 +#elif USE_ROCM + #define __HIP_PLATFORM_AMD__ + #include <hip/hip_runtime.h> + #include <hip/hip_fp16.h> + #include <hip/hip_bfloat16.h> + #define bfloat16 hip_bfloat16 +#endif + +#ifdef USE_NCCL + #include "nccl.h" +#elif USE_RCCL + #include <rccl.h> +#endif + +#define NUM_WARMUP_ITERATIONS 5 + +#define MPI_CHECK(cmd) do { \ + int64_t e = cmd; \ + if( e != MPI_SUCCESS ) { \ + printf("Failed: MPI error %s:%d '%ld'\n", \ + __FILE__,__LINE__, e); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define CUDA_CHECK(cmd) do { \ + cudaError_t e = cmd; \ + if(e != cudaSuccess) { \ + printf("CUDA error %s:%d: %s\n", \ + __FILE__, __LINE__, cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +#define HIP_CHECK(cmd) do { \ + hipError_t e = cmd; \ + if(e != hipSuccess) { \ + printf("HIP error %s:%d: %s\n", \ + __FILE__, __LINE__, hipGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +// NCCL_CHECK is used to validate RCCL functions as well +#define NCCL_CHECK(cmd) do { \ + ncclResult_t e = cmd; \ + if (e != ncclSuccess) { \ + printf("NCCL error %s:%d %s\n", \ + __FILE__, __LINE__, ncclGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ +} while(0) + +void initializeData(bfloat16 *data, int64_t size) { + for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) { + #ifdef USE_CUDA + data[i] = __float2bfloat16((float)i); + #elif USE_ROCM + // ROCm doesn't have a float2bfloat16 method + data[i] = (bfloat16) ((float) i); + #endif + } +} + +void custom_bf16_sum(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) { + bfloat16* in = (bfloat16*) invec; + bfloat16* inout = (bfloat16*) inoutvec; + for (int i = 0; i < *len; i++) { + #ifdef USE_CUDA + inout[i] = __hadd(in[i], inout[i]); + #elif USE_ROCM + inout[i] = in[i] + inout[i]; + #endif + } +} + +int main(int argc, char *argv[]) { + if (argc != 5) { + fprintf(stderr, "Usage: %s <num_gpus> <min_msg_size> <max_msg_size> <iterations>\n", argv[0]); + return EXIT_FAILURE; + } + + int num_gpus = atoi(argv[1]); + int64_t min_msg_size = strtoll(argv[2], NULL, 10); + int64_t max_msg_size = strtoll(argv[3], NULL, 10); + int iterations = atoi(argv[4]); + + if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) { + fprintf(stderr, "Invalid input parameters.\n"); + return EXIT_FAILURE; + } + + int my_rank, num_pes; + int num_gpus_per_node; + int msg_count; + + MPI_Init(&argc, 
&argv); + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + MPI_Comm_size(MPI_COMM_WORLD, &num_pes); + + if (num_pes != num_gpus) { + fprintf(stderr, "Number of processes must match number of GPUs.\n"); + MPI_Finalize(); + return EXIT_FAILURE; + } + + // Initialize GPU context + #if USE_CUDA + cudaGetDeviceCount(&num_gpus_per_node); + cudaSetDevice((my_rank % num_gpus_per_node)); + #elif USE_ROCM + hipGetDeviceCount(&num_gpus_per_node); + hipSetDevice((my_rank % num_gpus_per_node)); + #endif + + int64_t local_data_size = max_msg_size; // Size of local data + int64_t global_data_size = local_data_size; // Size of global data + + if (my_rank == 0) { + fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024); + fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024); + } + + bfloat16 *local_data = (bfloat16*)malloc(local_data_size); + bfloat16 *global_data = (bfloat16*)malloc(global_data_size); + + // Initialize local data + initializeData(local_data, local_data_size); + + bfloat16 *d_local_data, *d_global_data; + #ifdef USE_CUDA + CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size)); + CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size)); + // Copy local data to GPU + CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice)); + + #elif USE_ROCM + HIP_CHECK(hipMalloc(&d_local_data, local_data_size)); + HIP_CHECK(hipMalloc(&d_global_data, global_data_size)); + HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice)); + #endif + + #ifdef USE_MPI + // create 2-byte datatype (send raw, un-interpreted bytes) + MPI_Datatype mpi_type_bfloat16; + MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16); + MPI_Type_commit(&mpi_type_bfloat16); + + // define custom reduce operation for nv_bfloat16 types + MPI_Op CUSTOM_SUM; + MPI_Op_create(&custom_bf16_sum, 1, &CUSTOM_SUM); + + #elif defined(USE_NCCL) || defined(USE_RCCL) + ncclUniqueId nccl_comm_id; + ncclComm_t nccl_comm; + + if (my_rank == 0) { + /* Generates an Id to be used in ncclCommInitRank. 
*/ + ncclGetUniqueId(&nccl_comm_id); + } + + /* distribute nccl_comm_id to all ranks */ + MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE, + 0, MPI_COMM_WORLD)); + + /* Create a new NCCL/RCCL communicator */ + NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank)); + #endif + + // init recvcounts to send an equal portion of data from the reduce operation + int *recvcounts = (int*) malloc(sizeof(int) * num_pes); + int portion; + + // Perform MPI_Ireduce_scatter, NCCL reduce-scatter, or RCCL reduce-scatter + double total_time, start_time; + MPI_Request request; + MPI_Status status; + + // Print benchmark results + if (my_rank == 0) { + printf("Number of GPUs: %d\n", num_gpus); + printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size); + printf("Number of iterations: %d\n", iterations); + } + fflush(NULL); + + for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) { + msg_count = msg_size / sizeof(bfloat16); + + portion = msg_count / num_pes; + for (int i = 0; i < num_pes; i++) + recvcounts[i] = portion; + + // warmup iterations + for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + + if(msg_size >= 8388608) + iterations = 20; + + MPI_Barrier(MPI_COMM_WORLD); + start_time = MPI_Wtime(); + for (int i = 0; i < iterations; ++i) { + #ifdef USE_MPI + MPI_CHECK(MPI_Ireduce_scatter(d_local_data, d_global_data, recvcounts, mpi_type_bfloat16, + CUSTOM_SUM, MPI_COMM_WORLD, &request)); + + MPI_CHECK(MPI_Wait(&request, &status)); + #elif defined(USE_NCCL) || defined(USE_RCCL) + NCCL_CHECK(ncclReduceScatter((const void*)d_local_data, (void*)d_global_data, portion, ncclBfloat16, ncclSum, nccl_comm, NULL)); + #endif + + #ifdef USE_CUDA + cudaDeviceSynchronize(); + #elif USE_ROCM + hipDeviceSynchronize(); + #endif + } + MPI_Barrier(MPI_COMM_WORLD); + total_time = MPI_Wtime() - start_time; + if (my_rank == 0) + printf("%ld %.6f seconds\n", msg_size, (total_time / iterations)); + } + + // Cleanup + free(local_data); + free(global_data); + free(recvcounts); + #ifdef USE_CUDA + CUDA_CHECK(cudaFree(d_local_data)); + CUDA_CHECK(cudaFree(d_global_data)); + #elif USE_ROCM + HIP_CHECK(hipFree(d_local_data)); + HIP_CHECK(hipFree(d_global_data)); + #endif + + #if defined(USE_NCCL) || defined(USE_RCCL) + ncclCommDestroy(nccl_comm); + #endif + + MPI_Finalize(); + return EXIT_SUCCESS; +}
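The benchmark logs in this patch report only raw per-iteration times. Below is a minimal, hypothetical post-processing sketch (not part of the diff) for turning one row of those logs into bandwidth figures; it assumes the (p-1)/p bus-bandwidth convention that nccl-tests applies to all-gather and reduce-scatter, and the sample numbers are taken from nccl/reduce-scatter/benchmarks/8_gpu.txt:

```c
/* Hypothetical helper, not part of the patch above. Converts one benchmark row
 * (message size in bytes, seconds per iteration) into algorithmic bandwidth
 * and an approximate bus bandwidth using the (p-1)/p scaling convention. */
#include <stdio.h>

int main(void) {
    long long msg_size = 2147483648LL;  /* bytes, last row of 8_gpu.txt */
    int num_gpus = 8;                   /* ranks in the job */
    double time_s = 0.024166;           /* seconds per iteration from the log */

    double alg_bw = (double)msg_size / time_s / 1e9;             /* GB/s */
    double bus_bw = alg_bw * (num_gpus - 1) / (double)num_gpus;  /* GB/s */

    printf("algbw = %.2f GB/s, busbw = %.2f GB/s\n", alg_bw, bus_bw);
    return 0;
}
```

Comparing busbw computed this way across the 8- to 128-GPU logs is one way to see how close each collective stays to the per-GPU link bandwidth as the job scales.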