add all-gather, all-reduce, and reduce-scatter benchmarks #13

Open: wants to merge 59 commits into base: develop
Changes from all commits (59)
f7fb83c
add cudaDeviceSynchronize for NCCL
adistomar Mar 26, 2024
453a397
add allreduce code
adistomar Mar 30, 2024
32c1211
Merge branch 'hpcgroup:develop' into develop
adistomar Mar 30, 2024
f516fa0
add reduce scatter code
adistomar Mar 30, 2024
23e9f5c
remove duplicate commit
adistomar Mar 30, 2024
7ff3fb5
fix Makefile
adistomar Mar 30, 2024
982ccaf
add code for ROCm and RCCL
adistomar Apr 1, 2024
f70e65c
add flags for ROCm and RCCL
adistomar Apr 1, 2024
8ab25d1
revert Makefile to original
adistomar Apr 1, 2024
ef65ccd
remove unneeded import
adistomar Apr 1, 2024
795a6d3
add ROCm and RCCL code for all-reduce
adistomar Apr 1, 2024
b9e8824
add ROCm and RCCL code for reduce-scatter
adistomar Apr 1, 2024
e077503
Update and rename README to README.md
adistomar Apr 1, 2024
686be82
revert Makefile to original
adistomar Apr 1, 2024
ec4fded
revert Makefile to original
adistomar Apr 1, 2024
4a87dfc
revert Makefile to original
adistomar Apr 1, 2024
b6083d1
revert Makefile to original
adistomar Apr 1, 2024
79b2fb9
update custom bf16 sum function
adistomar Apr 1, 2024
60e6911
update custom bf16 sum function
adistomar Apr 1, 2024
ef6fb0d
fix custom bf16 sum function
adistomar Apr 1, 2024
8052ca7
fix indents
adistomar Apr 1, 2024
a67570e
fix indents
adistomar Apr 1, 2024
fdb324a
update Makefile
adistomar Apr 9, 2024
b04e902
Merge pull request #1 from RoastSea8/allreduce
adistomar Apr 11, 2024
73c53dc
Merge pull request #2 from RoastSea8/rccl-all-gather
adistomar Apr 11, 2024
3d26b3f
resolve merge conflict
adistomar Apr 11, 2024
fdce591
Merge pull request #3 from RoastSea8/reduce-scatter
adistomar Apr 11, 2024
8939f6f
Merge pull request #4 from RoastSea8/update-makefile
adistomar Apr 11, 2024
405f090
Create allreduce.cu
adistomar Apr 11, 2024
63bb696
change to int64_t for global/local data size
adistomar Apr 13, 2024
3082c98
change to int64_t for global/local data size
adistomar Apr 13, 2024
3c91d01
change to int64_t for global/local data size
adistomar Apr 13, 2024
0a33166
revert type change for custom sum
adistomar Apr 13, 2024
8be09db
setup benchmarks rig and add results so far
adistomar Apr 13, 2024
c7bb217
add results so far
adistomar Apr 13, 2024
cb99cad
add results so far
adistomar Apr 14, 2024
d2d2bbc
change atoi to strtoll
adistomar Apr 14, 2024
74cfdd8
change atoi to strtoll
adistomar Apr 14, 2024
ccb73ae
add all perlmutter code and benchmark data
adistomar Apr 14, 2024
fd73957
add frontier code and benchmark results so far
adistomar Apr 14, 2024
2077fe4
fix merge conflicts
adistomar Apr 14, 2024
79e7570
fix Makefiles
adistomar Apr 14, 2024
0cd86f0
add benchmark code for all-reduce and reduce-scatter
adistomar Apr 14, 2024
0320117
add results of MPI on Frontier so far
adistomar Apr 15, 2024
7752ced
add 64 gcd data for MPI
adistomar Apr 15, 2024
4d5a827
add 128 gcd numbers for MPI on Frontier
adistomar Apr 16, 2024
dffbac0
use latest nccl
adistomar Jul 12, 2024
7ca3d66
update .gitignore to ignore .x and .out files
adistomar Jul 12, 2024
f84dd26
update .gitignore to ignore .x and .out files
adistomar Jul 12, 2024
559f4bb
fix reduce_scatter bug
adistomar Jul 12, 2024
d70e475
push latest benchmarks
adistomar Jul 12, 2024
4e7ac6f
add benchmarks so far
adistomar Jul 12, 2024
e878a75
add mpi all-gather 128gpu benchmarks
adistomar Jul 12, 2024
75c8208
update benchmarks
adistomar Jul 12, 2024
39d17ff
update results
adistomar Jul 12, 2024
4c71403
push results
adistomar Jul 12, 2024
6e59c91
push results
adistomar Jul 12, 2024
cc1b03f
update results
adistomar Jul 12, 2024
92d5eec
push final results
adistomar Jul 12, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.x
*.out
18 changes: 0 additions & 18 deletions Makefile

This file was deleted.

9 changes: 0 additions & 9 deletions README

This file was deleted.

15 changes: 15 additions & 0 deletions README.md
@@ -0,0 +1,15 @@
Before compiling, run the following:

### Perlmutter
```sh
module load PrgEnv-cray cudatoolkit craype-accel-nvidia80 nccl
export CRAY_ACCEL_TARGET=nvidia80
export MPICH_GPU_SUPPORT_ENABLED=1
```
### Frontier
```sh
module load PrgEnv-cray amd-mixed/5.6.0 craype-accel-amd-gfx90a cray-mpich/8.1.26 cpe/23.05
export MPICH_GPU_SUPPORT_ENABLED=1
export LD_LIBRARY_PATH="${CRAY_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}"
```
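
The PR's Makefile and job scripts are not shown in this view, so the following is only a hedged sketch of how one of the benchmarks might be built and launched after the environment setup above. The make invocation, the Slurm flags, and the `allgather.x` binary name are assumptions; the positional arguments match the usage expected by `allgather.cu` (`<num_gpus> <min_msg_size> <max_msg_size> <iterations>`, sizes in bytes).

```sh
# Hedged sketch only -- substitute the repository's actual Makefile rule and job script.
make   # e.g. built with -DUSE_CUDA -DUSE_NCCL (Perlmutter) or -DUSE_ROCM -DUSE_RCCL (Frontier)

# 2 nodes x 4 GPUs, message sizes swept from 1 MiB to 1 GiB, 10 timed iterations per size
srun -N 2 --ntasks-per-node=4 --gpus-per-node=4 ./allgather.x 8 1048576 1073741824 10
```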

124 changes: 82 additions & 42 deletions allgather.cu
@@ -8,24 +8,31 @@
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <stdint.h>

#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cuda_bf16.h>
#define bfloat16 nv_bfloat16
#elif USE_ROCM
#define __HIP_PLATFORM_AMD__
#include <hip/hip_bfloat16.h>
#include <hip/hip_runtime.h>
#include <hip/hip_runtime_api.h>
#define bfloat16 hip_bfloat16
#endif

#ifdef USE_NCCL
#include "nccl.h"
#elif defined(USE_RCCL)
#include "rccl.h"
#elif USE_RCCL
#include <rccl/rccl.h>
#endif

#define NUM_WARMUP_ITERATIONS 5

#define MPI_CHECK(cmd) do { \
int e = cmd; \
int64_t e = cmd; \
if( e != MPI_SUCCESS ) { \
printf("Failed: MPI error %s:%d '%d'\n", \
printf("Failed: MPI error %s:%d '%ld'\n", \
__FILE__,__LINE__, e); \
exit(EXIT_FAILURE); \
} \
@@ -40,6 +47,16 @@
} \
} while(0)

#define HIP_CHECK(cmd) do { \
hipError_t e = cmd; \
if(e != hipSuccess) { \
printf("HIP error %s:%d: %s\n", \
__FILE__, __LINE__, hipGetErrorString(e)); \
exit(EXIT_FAILURE); \
} \
} while(0)

// NCCL_CHECK is used to validate RCCL functions as well
#define NCCL_CHECK(cmd) do { \
ncclResult_t e = cmd; \
if (e != ncclSuccess) { \
@@ -49,9 +66,14 @@
} \
} while(0)

void initializeData(nv_bfloat16 *data, int size) {
for (int i = 0; i < (size / sizeof(nv_bfloat16)); ++i) {
void initializeData(bfloat16 *data, int64_t size) {
for (int64_t i = 0; i < (size / sizeof(bfloat16)); ++i) {
#ifdef USE_CUDA
data[i] = __float2bfloat16((float)i);
#elif USE_ROCM
// ROCm doesn't have a float2bfloat16 method
data[i] = (bfloat16) ((float) i);
#endif
}
}

@@ -62,8 +84,8 @@ int main(int argc, char *argv[]) {
}

int num_gpus = atoi(argv[1]);
int min_msg_size = atoi(argv[2]);
int max_msg_size = atoi(argv[3]);
int64_t min_msg_size = atoi(argv[2]);
int64_t max_msg_size = atoi(argv[3]);
int iterations = atoi(argv[4]);

if (num_gpus < 2 || min_msg_size <= 0 || max_msg_size <= 0 || min_msg_size > max_msg_size || iterations <= 0) {
@@ -86,33 +108,49 @@
}

// Initialize GPU context
#if USE_CUDA
cudaGetDeviceCount(&num_gpus_per_node);
cudaSetDevice((my_rank % num_gpus_per_node));
#elif USE_ROCM
hipGetDeviceCount(&num_gpus_per_node);
hipSetDevice((my_rank % num_gpus_per_node));
#endif

int local_data_size = max_msg_size; // Size of local data
int global_data_size = local_data_size * num_gpus; // Size of global data
int64_t local_data_size = max_msg_size; // Size of local data
int64_t global_data_size = local_data_size * num_gpus; // Size of global data

nv_bfloat16 *local_data = (nv_bfloat16*)malloc(local_data_size);
nv_bfloat16 *global_data = (nv_bfloat16*)malloc(global_data_size);
if (my_rank == 0) {
fprintf(stdout, "Local data size: %ld\n", (local_data_size / 1024) / 1024);
fprintf(stdout, "Global data size: %ld\n", (global_data_size / 1024) / 1024);
}

bfloat16 *local_data = (bfloat16*)malloc(local_data_size);
bfloat16 *global_data = (bfloat16*)malloc(global_data_size);

// Initialize local data
initializeData(local_data, local_data_size);

// Allocate memory on GPU
nv_bfloat16 *d_local_data, *d_global_data;
bfloat16 *d_local_data, *d_global_data;
#ifdef USE_CUDA
CUDA_CHECK(cudaMalloc(&d_local_data, local_data_size));
CUDA_CHECK(cudaMalloc(&d_global_data, global_data_size));

// Copy local data to GPU
CUDA_CHECK(cudaMemcpy(d_local_data, local_data, local_data_size, cudaMemcpyHostToDevice));

#elif USE_ROCM
HIP_CHECK(hipMalloc(&d_local_data, local_data_size));
HIP_CHECK(hipMalloc(&d_global_data, global_data_size));
HIP_CHECK(hipMemcpy(d_local_data, local_data, local_data_size, hipMemcpyHostToDevice));
#endif

#ifdef USE_MPI
// create 2-byte datatype (send raw, un-interpreted bytes)
MPI_Datatype mpi_type_bfloat16;
MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
MPI_Type_commit(&mpi_type_bfloat16);

#elif USE_NCCL
#elif defined(USE_NCCL) || defined(USE_RCCL)
ncclUniqueId nccl_comm_id;
ncclComm_t nccl_comm;

@@ -125,13 +163,8 @@ int main(int argc, char *argv[]) {
MPI_CHECK(MPI_Bcast((void *)&nccl_comm_id, sizeof(nccl_comm_id), MPI_BYTE,
0, MPI_COMM_WORLD));

/* Create a new NCCL communicator */
/* Create a new NCCL/RCCL communicator */
NCCL_CHECK(ncclCommInitRank(&nccl_comm, num_pes, nccl_comm_id, my_rank));

#elif defined(USE_RCCL)
// TODO: fix later
rcclComm_t rccl_comm;
rcclCommInitRank(&comm, num_gpus, 0, rccl_root);
#endif

// Perform MPI_Iallgather, NCCL allgather, or RCCL allgather
@@ -142,26 +175,28 @@
// Print benchmark results
if (my_rank == 0) {
printf("Number of GPUs: %d\n", num_gpus);
printf("Message size range: %d - %d\n", min_msg_size, max_msg_size);
printf("Message size range: %ld - %ld\n", min_msg_size, max_msg_size);
printf("Number of iterations: %d\n", iterations);
}
fflush(NULL);

for (int msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
msg_count = msg_size / sizeof(nv_bfloat16);
for (int64_t msg_size = min_msg_size; msg_size <= max_msg_size; msg_size *= 2) {
msg_count = msg_size / sizeof(bfloat16);
// warmup iterations
for (int i = 0; i < NUM_WARMUP_ITERATIONS; ++i) {
#ifdef USE_MPI
MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));

MPI_CHECK(MPI_Wait(&request, &status));
#elif defined(USE_NCCL)
#elif defined(USE_NCCL) || defined(USE_RCCL)
NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
cudaDeviceSynchronize();
#elif defined(USE_RCCL)
// TODO: fix later
rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
#endif

#ifdef USE_CUDA
cudaDeviceSynchronize();
#elif USE_ROCM
hipDeviceSynchronize();
#endif
}

@@ -172,34 +207,39 @@
start_time = MPI_Wtime();
for (int i = 0; i < iterations; ++i) {
#ifdef USE_MPI
MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));

MPI_CHECK(MPI_Iallgather(d_local_data, msg_count, mpi_type_bfloat16,
d_global_data, msg_count, mpi_type_bfloat16, MPI_COMM_WORLD, &request));
MPI_CHECK(MPI_Wait(&request, &status));
#elif defined(USE_NCCL)
#elif defined(USE_NCCL) || defined(USE_RCCL)
NCCL_CHECK(ncclAllGather((const void*)d_local_data, (void*)d_global_data, msg_count, ncclBfloat16, nccl_comm, NULL));
cudaDeviceSynchronize();
#elif defined(USE_RCCL)
// TODO: fix later
rcclAllReduce((const void*)d_local_data, (void*)d_global_data, global_data_size, rcclInt, rcclSum, comm, NULL);
#endif

#ifdef USE_CUDA
cudaDeviceSynchronize();
#elif USE_ROCM
hipDeviceSynchronize();
#endif
}
MPI_Barrier(MPI_COMM_WORLD);
total_time = MPI_Wtime() - start_time;
if (my_rank == 0)
printf("%d %.6f seconds\n", msg_size, (total_time / iterations));
printf("%ld %.6f seconds\n", msg_size, (total_time / iterations));
}

// Cleanup
free(local_data);
free(global_data);
#ifdef USE_CUDA
CUDA_CHECK(cudaFree(d_local_data));
CUDA_CHECK(cudaFree(d_global_data));
#elif USE_ROCM
HIP_CHECK(hipFree(d_local_data));
HIP_CHECK(hipFree(d_global_data));
#endif

#ifdef USE_NCCL
#if defined(USE_NCCL) || defined(USE_RCCL)
ncclCommDestroy(nccl_comm);
#elif defined(USE_RCCL)
rcclCommDestroy(rccl_comm);
#endif

MPI_Finalize();
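
The "2-byte datatype" comment in allgather.cu is the key to pushing bfloat16 through MPI, which has no native bfloat16 type: a contiguous pair of MPI_BYTE is committed as a datatype and the payload is shipped as raw, un-interpreted bytes. Below is a small host-only illustration of that idea, not part of the PR; the uint16_t stand-in for bfloat16, the buffer sizes, and the printed message are assumptions made for this sketch.

```c
#include <mpi.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);

    int rank, num_ranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);

    /* MPI has no bfloat16 type, so commit a 2-byte contiguous MPI_BYTE
       datatype and send the payload as raw, un-interpreted bytes. */
    MPI_Datatype mpi_type_bfloat16;
    MPI_Type_contiguous(2, MPI_BYTE, &mpi_type_bfloat16);
    MPI_Type_commit(&mpi_type_bfloat16);

    const int count = 4;  /* elements contributed per rank (arbitrary for this sketch) */
    uint16_t *local  = (uint16_t *)malloc(count * sizeof(uint16_t));
    uint16_t *global = (uint16_t *)malloc((size_t)count * num_ranks * sizeof(uint16_t));
    for (int i = 0; i < count; ++i)
        local[i] = (uint16_t)(rank * 100 + i);  /* stand-in for bfloat16 bit patterns */

    /* Same call shape as the benchmark's MPI path, but blocking for brevity. */
    MPI_Allgather(local, count, mpi_type_bfloat16,
                  global, count, mpi_type_bfloat16, MPI_COMM_WORLD);

    if (rank == 0)
        printf("rank 0 gathered %d elements from %d ranks\n", count * num_ranks, num_ranks);

    MPI_Type_free(&mpi_type_bfloat16);
    free(local);
    free(global);
    MPI_Finalize();
    return 0;
}
```

Because the bytes are never interpreted, this trick suffices for pure data movement such as all-gather; reductions over bfloat16 still need either NCCL/RCCL's ncclBfloat16 or a custom sum, which is what the "custom bf16 sum function" commits in this PR appear to address.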