Actual source code: ex1k.kokkos.cxx

static const char help[] = "Benchmarking PetscSF Ping-pong latency (similar to osu_latency)\n\n";

/*
  This is a simple test to measure the latency of MPI communication.
  The test is run with two processes.  The first process sends a message
  to the second process, and after receiving the message, the second
  process sends a message back to the first process.  This round trip is
  repeated a number of times.  The latency is defined as half of the round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex1k -mtype <type>
  Other arguments default to the values used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex1k  -mtype kokkos

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex1k -mtype kokkos
*/
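/*
  Output: a header line "##  PetscSF Ping-pong test on Host|Device ##", a column header
  "Message(Bytes)  Latency(us)", and then one line per message size, doubling from
  sizeof(PetscScalar) bytes up to maxn*sizeof(PetscScalar) bytes.  Latencies are
  machine- and interconnect-dependent, so no reference numbers are given here.
*/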
#include <petscsf.h>
#include <Kokkos_Core.hpp>

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL     10000
#define LAT_SKIP_SMALL     100
#define LAT_LOOP_LARGE     1000
#define LAT_SKIP_LARGE     10
#define LARGE_MESSAGE_SIZE 8192
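/* Messages larger than LARGE_MESSAGE_SIZE bytes switch to the LARGE loop/skip counts in the timing loop below. */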

int main(int argc, char **argv)
{
  PetscSF        sf[64];
  PetscLogDouble t_start = 0, t_end = 0, time[64];
  PetscInt       i, j, n, nroots, nleaves, niter = 100, nskip = 10;
  PetscInt       maxn = 512 * 1024; /* max 4MB messages */
  PetscSFNode   *iremote;
  PetscMPIInt    rank, size;
  PetscScalar   *rootdata = NULL, *leafdata = NULL, *pbuf, *ebuf;
  size_t         msgsize;
  PetscMemType   mtype       = PETSC_MEMTYPE_HOST;
  char           mstring[16] = {0};
  PetscBool      set;
  PetscInt       skipSmall = -1, loopSmall = -1;
  MPI_Op         op = MPI_REPLACE;

  PetscFunctionBeginUser;
  Kokkos::initialize(argc, argv); // Test initializing kokkos before petsc
  PetscCall(PetscInitialize(&argc, &argv, NULL, help));
  PetscCall(PetscKokkosInitializeCheck());

  PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
  PetscCallMPI(MPI_Comm_rank(PETSC_COMM_WORLD, &rank));
  PetscCheck(size == 2, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "Must run with 2 processes");

  PetscCall(PetscOptionsGetInt(NULL, NULL, "-maxn", &maxn, NULL)); /* maxn PetscScalars */
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-skipSmall", &skipSmall, NULL));
  PetscCall(PetscOptionsGetInt(NULL, NULL, "-loopSmall", &loopSmall, NULL));

  PetscCall(PetscMalloc1(maxn, &iremote));
  PetscCall(PetscOptionsGetString(NULL, NULL, "-mtype", mstring, 16, &set));
  if (set) {
    PetscBool isHost, isKokkos;
    PetscCall(PetscStrcasecmp(mstring, "host", &isHost));
    PetscCall(PetscStrcasecmp(mstring, "kokkos", &isKokkos));
    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isKokkos) mtype = PETSC_MEMTYPE_KOKKOS;
    else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_ARG_WRONG, "Unknown memory type: %s", mstring);
  }

  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscMalloc2(maxn, &rootdata, maxn, &leafdata));
  } else {
    PetscCallCXX(rootdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
    PetscCallCXX(leafdata = (PetscScalar *)Kokkos::kokkos_malloc(sizeof(PetscScalar) * maxn));
  }
  PetscCall(PetscMalloc2(maxn, &pbuf, maxn, &ebuf));
  for (i = 0; i < maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }

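  /*
    pbuf/ebuf hold host-side reference values that are copied into rootdata/leafdata
    before each message size is timed.  The loop below builds one PetscSF per message
    size n (n doubles up to maxn): rank 0 owns the n roots and rank 1 owns n leaves,
    each leaf j pointing at root j on rank 0.
  */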
  for (n = 1, i = 0; n <= maxn; n *= 2, i++) {
    PetscCall(PetscSFCreate(PETSC_COMM_WORLD, &sf[i]));
    PetscCall(PetscSFSetFromOptions(sf[i]));
    if (rank == 0) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j = 0; j < nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscCall(PetscSFSetGraph(sf[i], nroots, nleaves, NULL, PETSC_COPY_VALUES, iremote, PETSC_COPY_VALUES));
    PetscCall(PetscSFSetUp(sf[i]));
  }

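  /* Start with the small-message iteration counts; if -loopSmall is given, it (together with -skipSmall) replaces the OSU defaults. */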
  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }

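  /*
    For each message size: refill rootdata/leafdata from the host reference buffers
    (PetscArraycpy on the host, Kokkos::deep_copy for device memory), then time
    niter ping-pong iterations after nskip warm-up iterations.
  */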
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    msgsize = sizeof(PetscScalar) * n;
    if (mtype == PETSC_MEMTYPE_HOST) {
      PetscCall(PetscArraycpy(rootdata, pbuf, n));
      PetscCall(PetscArraycpy(leafdata, ebuf, n));
    } else {
      Kokkos::View<PetscScalar *>                          dst1((PetscScalar *)rootdata, n);
      Kokkos::View<PetscScalar *>                          dst2((PetscScalar *)leafdata, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src1((const PetscScalar *)pbuf, n);
      Kokkos::View<const PetscScalar *, Kokkos::HostSpace> src2((const PetscScalar *)ebuf, n);
      PetscCallCXX(Kokkos::deep_copy(dst1, src1));
      PetscCallCXX(Kokkos::deep_copy(dst2, src2));
    }

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    PetscCallMPI(MPI_Barrier(MPI_COMM_WORLD));

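    /*
      Each iteration is one round trip: the Bcast pushes rootdata from rank 0 to rank 1's
      leafdata (the "ping"), and the Reduce with MPI_REPLACE sends it back (the "pong").
      The first nskip iterations are warm-up; the timer starts after a Kokkos fence and
      an MPI barrier.
    */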
    for (i = 0; i < niter + nskip; i++) {
      if (i == nskip) {
        PetscCallCXX(Kokkos::fence());
        PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
        t_start = MPI_Wtime();
      }
      PetscCall(PetscSFBcastWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, rootdata, mtype, leafdata, op));
      PetscCall(PetscSFBcastEnd(sf[j], MPIU_SCALAR, rootdata, leafdata, op));
      PetscCall(PetscSFReduceWithMemTypeBegin(sf[j], MPIU_SCALAR, mtype, leafdata, mtype, rootdata, op));
      PetscCall(PetscSFReduceEnd(sf[j], MPIU_SCALAR, leafdata, rootdata, op));
    }
    PetscCallCXX(Kokkos::fence());
    PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
    t_end   = MPI_Wtime();
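    /* Average one-way latency in microseconds: total elapsed time over niter round trips, halved. */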
    time[j] = (t_end - t_start) * 1e6 / (niter * 2);
  }

  PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n", mtype == PETSC_MEMTYPE_HOST ? "Host" : "Device"));
  for (n = 1, j = 0; n <= maxn; n *= 2, j++) {
    PetscCall(PetscSFDestroy(&sf[j]));
    PetscCall(PetscPrintf(PETSC_COMM_WORLD, "%16" PetscInt_FMT " \t %16.4f\n", ((PetscInt)sizeof(PetscScalar)) * n, time[j]));
  }
  PetscCall(PetscFree2(pbuf, ebuf));
  if (mtype == PETSC_MEMTYPE_HOST) {
    PetscCall(PetscFree2(rootdata, leafdata));
  } else {
    PetscCallCXX(Kokkos::kokkos_free(rootdata));
    PetscCallCXX(Kokkos::kokkos_free(leafdata));
  }
  PetscCall(PetscFree(iremote));
  PetscCall(PetscFinalize());
  Kokkos::finalize();
  return 0;
}

/*TEST
  testset:
    requires: kokkos
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype {{host kokkos}}

    test:
      requires: mpix_stream
      args: -mtype kokkos -sf_use_stream_aware_mpi 1

TEST*/