CUDA and MPI provide two different APIs for parallel programming that target very different parallel architectures. While CUDA allows to utilize parallel graphics hardware for general purpose computing, MPI is usually employed to write parallel programs that run on large SMP systems or on cluster computers. In order to improve a cluster’s overall computational capabilities it is not unusual to equip the nodes of a cluster with graphics cards. This, however, adds also another level of parallelism that a programmer must cope with. Likely one will combine MPI with CUDA when writing programs for such a system. Combining both kinds of parallel programming techniques becomes much easier if the MPI implementation is CUDA-aware. CUDA-aware means that one can send data from device memory of one graphics card directly to the device memory of another card without an intermediate copy to host memory. This magic becomes possible thanks to «unified virtual address space» that puts all CUDA execution, on CPU and on the GPU, into a single address space. (Unified virtual address space requires CUDA 4.0 or later and a GPU with compute capability 2.0 or higher.) CUDA-aware MPI eases multi-GPU programming a lot and improves performance. Among other implementations, Open MPI is CUDA-aware.
The following program below sends a message from one GPU to another via MPI. The program may be compiled with
mpic++ -o ping_pong ping_pong.cc -lcuda -lcudart
It should be run on a node with two or more GPUs and all of them set to compute mode «exclusive process». There is an obstacle when starting the program via mpirun. mpirun may not find the library libcuda.so or try to load the wrong library (e.g. a 32-bit library on a 64-bit system). In this case set the environment variable LD_LIBRARY_PATH to an appropriate value.
out << packet_size <<"\t\t"<< t_av <<"\t"<< t_max <<"\n";
}
packet_size*=2; // double packet size
}
if(rank==0)// close file
out.close();
}
MPI::Finalize(); // finish MPI
return EXIT_SUCCESS;
}
// ping_pong.cc
//
// determine bandwidth as a function of packet size
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <string>
#include "mpi.h"
#include <cuda.h>
#include <cuda_runtime.h>
const int max_packet_size=0x1000000; // maximal message size
const int count=250; // number of messages
char *buff, *buff_2; // buffers
typedef enum { host, device, copy } benchmark_type;
int main(int argc, char *argv[]) {
if (argc!=2){
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
std::string arg_str(argv[1]);
if (arg_str!="--host" and
arg_str!="--device" and
arg_str!="--copy") {
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
benchmark_type benchmark;
if (arg_str=="--host")
benchmark=host;
else if (arg_str=="--device")
benchmark=device;
else if (arg_str=="--copy")
benchmark=copy;
else { // shold never be reached
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
MPI::Init(); // initialize MPI
int rank=MPI::COMM_WORLD.Get_rank(); // get my rank
int size=MPI::COMM_WORLD.Get_size(); // get number of processes
std::string fname;
if (benchmark==host) {
buff=new char[max_packet_size]; // allocate host memory
fname="ping_pong_host.dat";
} else if (benchmark==device) {
cudaMalloc(&buff, max_packet_size); // allocate GPU memory
fname="ping_pong_device.dat";
} else {
buff=new char[max_packet_size]; // allocate host memory
cudaMalloc(&buff_2, max_packet_size); // allocate GPU memory
fname="ping_pong_copy.dat";
}
if (size==2) { // need exactly two processes
int device[2];
if (benchmark!=host) {
cudaGetDevice(device);
if (rank==0)
MPI::COMM_WORLD.Recv(&device[1], 1, MPI::INT, 1, 0);
else
MPI::COMM_WORLD.Send(&device[0], 1, MPI::INT, 0, 0);
}
std::ofstream out;
if (rank==0) { // open output file
out.open(fname.c_str());
if (!out)
MPI::COMM_WORLD.Abort(EXIT_FAILURE);
out << "# bandwidth as a function of packet size\n";
if (benchmark!=host)
out << "# process 0 using GPU " << device[0] << "\n"
<< "# process 1 using GPU " << device[1] << "\n";
out << "# clock resolution " << MPI::Wtick() << "sec.\n"
<< "# packet size\tmean time\tmaximal time\n";
}
cudaEvent_t start, stop;
if (benchmark==copy) {
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
// try messages of different sizes
int packet_size=1;
while (packet_size<=max_packet_size) {
double t_av=0.0;
double t_max=0.0;
// average over several messages
for (int i=0; i<count; ++i) {
MPI::COMM_WORLD.Barrier(); // synchronize processes
if (rank==0) {
double t=MPI::Wtime(); // start time
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Send(buff, packet_size, MPI::CHAR, 1, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Recv(buff, packet_size, MPI::CHAR, 1, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
t=(MPI::Wtime()-t)/2.0; // time difference
t_av+=t;
if (t>t_max)
t_max=t;
} else {
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Recv(buff, packet_size, MPI::CHAR, 0, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Send(buff, packet_size, MPI::CHAR, 0, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
}
}
if (rank==0) { // print results to file
t_av/=count;
out << packet_size << "\t\t" << t_av << "\t" << t_max << "\n";
}
packet_size*=2; // double packet size
}
if (rank==0) // close file
out.close();
}
MPI::Finalize(); // finish MPI
return EXIT_SUCCESS;
}
// ping_pong.cc
//
// determine bandwidth as a function of packet size
#include <cstdlib>
#include <iostream>
#include <fstream>
#include <string>
#include "mpi.h"
#include <cuda.h>
#include <cuda_runtime.h>
const int max_packet_size=0x1000000; // maximal message size
const int count=250; // number of messages
char *buff, *buff_2; // buffers
typedef enum { host, device, copy } benchmark_type;
int main(int argc, char *argv[]) {
if (argc!=2){
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
std::string arg_str(argv[1]);
if (arg_str!="--host" and
arg_str!="--device" and
arg_str!="--copy") {
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
benchmark_type benchmark;
if (arg_str=="--host")
benchmark=host;
else if (arg_str=="--device")
benchmark=device;
else if (arg_str=="--copy")
benchmark=copy;
else { // shold never be reached
std::cerr << "usage: " << argv[0] << " [--host|--device|--copy]\n";
std::exit(EXIT_FAILURE);
}
MPI::Init(); // initialize MPI
int rank=MPI::COMM_WORLD.Get_rank(); // get my rank
int size=MPI::COMM_WORLD.Get_size(); // get number of processes
std::string fname;
if (benchmark==host) {
buff=new char[max_packet_size]; // allocate host memory
fname="ping_pong_host.dat";
} else if (benchmark==device) {
cudaMalloc(&buff, max_packet_size); // allocate GPU memory
fname="ping_pong_device.dat";
} else {
buff=new char[max_packet_size]; // allocate host memory
cudaMalloc(&buff_2, max_packet_size); // allocate GPU memory
fname="ping_pong_copy.dat";
}
if (size==2) { // need exactly two processes
int device[2];
if (benchmark!=host) {
cudaGetDevice(device);
if (rank==0)
MPI::COMM_WORLD.Recv(&device[1], 1, MPI::INT, 1, 0);
else
MPI::COMM_WORLD.Send(&device[0], 1, MPI::INT, 0, 0);
}
std::ofstream out;
if (rank==0) { // open output file
out.open(fname.c_str());
if (!out)
MPI::COMM_WORLD.Abort(EXIT_FAILURE);
out << "# bandwidth as a function of packet size\n";
if (benchmark!=host)
out << "# process 0 using GPU " << device[0] << "\n"
<< "# process 1 using GPU " << device[1] << "\n";
out << "# clock resolution " << MPI::Wtick() << "sec.\n"
<< "# packet size\tmean time\tmaximal time\n";
}
cudaEvent_t start, stop;
if (benchmark==copy) {
cudaEventCreate(&start);
cudaEventCreate(&stop);
}
// try messages of different sizes
int packet_size=1;
while (packet_size<=max_packet_size) {
double t_av=0.0;
double t_max=0.0;
// average over several messages
for (int i=0; i<count; ++i) {
MPI::COMM_WORLD.Barrier(); // synchronize processes
if (rank==0) {
double t=MPI::Wtime(); // start time
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Send(buff, packet_size, MPI::CHAR, 1, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Recv(buff, packet_size, MPI::CHAR, 1, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
t=(MPI::Wtime()-t)/2.0; // time difference
t_av+=t;
if (t>t_max)
t_max=t;
} else {
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Recv(buff, packet_size, MPI::CHAR, 0, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventRecord(start, 0);
cudaMemcpy(buff, buff_2, packet_size, cudaMemcpyDeviceToHost);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
MPI::COMM_WORLD.Send(buff, packet_size, MPI::CHAR, 0, 0);
if (benchmark==copy) {
cudaEventRecord(start, 0);
cudaMemcpy(buff_2, buff, packet_size, cudaMemcpyHostToDevice);
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
}
}
}
if (rank==0) { // print results to file
t_av/=count;
out << packet_size << "\t\t" << t_av << "\t" << t_max << "\n";
}
packet_size*=2; // double packet size
}
if (rank==0) // close file
out.close();
}
MPI::Finalize(); // finish MPI
return EXIT_SUCCESS;
}
Half round trip time as a function of the message size. CUDA-aware MPI reduces round trip times by eliminating a temporary copy to host memory. For small messages communication via host shared memory is faster than inter-GPU communication. Test was done on a system with two Tesla M2090 cards.