Actual source code: ex1kok.kokkos.cxx
// Help text for this example; main() passes it to PetscInitialize().
1: static char help[] = "Benchmarking device kernel launch time\n";
2: /*
3: Running example on Summit at OLCF:
4: # run with a total of 1 resource set (RS) (-n1), 1 RS per node (-r1), 1 MPI rank (-a1), 7 cores (-c7) and 1 GPU (-g1) per RS
5: $ jsrun -n1 -a1 -c7 -g1 -r1 ./ex1kok
6: Average asynchronous device kernel launch time = 4.86 microseconds
7: Average synchronous device kernel launch time = 12.83 microseconds
9: Frontier@OLCF
10: $ srun -n1 -c32 --cpu-bind=threads --gpus-per-node=8 --gpu-bind=closest ./ex1kok
11: Average asynchronous device kernel launch time = 1.88 microseconds
12: Average synchronous device kernel launch time = 7.78 microseconds
14: Aurora@ALCF
15: $ mpirun -n 1 ./ex1kok
16: Average asynchronous device kernel launch time = 3.34 microseconds
17: Average synchronous device kernel launch time = 6.24 microseconds
19: Perlmutter@NERSC
20: $ srun -n 1 --gpus-per-task=1 ./ex1kok
21: Average asynchronous device kernel launch time = 2.31 microseconds
22: Average synchronous device kernel launch time = 7.13 microseconds
23: */
25: #include <petscsys.h>
26: #include <petsc_kokkos.hpp>
28: int main(int argc, char **argv)
29: {
30: PetscInt i, n = 100000, N = 256;
31: PetscLogDouble tstart, tend, time;
33: PetscFunctionBeginUser;
34: PetscCall(PetscInitialize(&argc, &argv, nullptr, help));
35: PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));
36: PetscCall(PetscKokkosInitializeCheck());
37: {
38: Kokkos::DefaultExecutionSpace exec = PetscGetKokkosExecutionSpace();
39: Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace> policy(exec, 0, N);
41: PetscCallCXX(exec.fence()); // Initialize device runtime to get more accurate timing below
42: // Launch a sequence of kernels asynchronously. Previous launched kernels do not need to be completed before launching a new one
43: PetscCall(PetscTime(&tstart));
44: for (i = 0; i < n; i++) { PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){})); }
45: PetscCall(PetscTime(&tend));
46: PetscCallCXX(exec.fence());
47: time = (tend - tstart) * 1e6 / n;
48: PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average asynchronous device kernel launch time = %.2f microseconds\n", time));
50: // Launch a sequence of kernels synchronously. Only launch a new kernel after the one before it has been completed
51: PetscCall(PetscTime(&tstart));
52: for (i = 0; i < n; i++) {
53: PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i){}));
54: PetscCallCXX(exec.fence());
55: }
56: PetscCall(PetscTime(&tend));
57: time = (tend - tstart) * 1e6 / n;
58: PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Average synchronous device kernel launch time = %.2f microseconds\n", time));
59: }
61: PetscCall(PetscFinalize());
62: return 0;
63: }
65: /*TEST
66: test:
67: requires: kokkos
68: args: -n 2
69: output_file: output/empty.out
70: filter: grep "DOES_NOT_EXIST"
72: TEST*/