Actual source code: ex2k.kokkos.cxx

  1: static char help[] = "Benchmarking MatProduct with AIJ and its subclass matrix types\n";

  3: /*
  4: Usage:
  5:   mpirun -n <np> ./ex2k
  6:     -A <filename>     : input petsc binary file for matrix A; one can convert a file from MatrixMarket using mat/tests/ex72.c
  7:     -P <filename>     : input petsc binary file for matrix P; optional, if not given, P = A
  8:     -mat_type  <str>  : aij or its subclass. Default is aij.
  9:     -prod_type <str>  : AP, AtP, APt, PtAP or PAPt. Default is AP.
 10:     -n <num>          : run MatProductNumeric() this many times and report average time. Default is 100.

 12: Notes:
 13:   It uses CPU-timer to measure the time.

 15: Examples:
 16:   On OLCF Summit (with GPU-aware MPI)
 17:     # 6 MPI ranks:
 18:     # 6 resource sets (-n 6), 1 MPI rank per RS (-a 1), 7 CPU cores per RS (-c 7), and 1 GPU per RS (-g 1), 6 RSs per node (-r 6)
 19:     jsrun --smpiargs "-gpu" -n 6 -a 1 -c 7 -g 1 -r 6 ./ex2k -A cage12.aij -mat_type aijcusparse

 21:     # 1 MPI rank
 22:     jsrun --smpiargs "-gpu" -n 1 -a 1 -c 7 -g 1 -r 1 ./ex2k -A cage12.aij -mat_type aijcusparse

 24:   On OLCF Crusher:
 25:     # 1 MPI rank
 26:     # run with 1 node (-N1), 1 mpi rank (-n1), 2 hardware threads per rank (-c2)
 27:     srun -N1 -n1 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos

 29:     # 8 MPI ranks
 30:     srun -N1 -n8 -c2 --gpus-per-node=8 --gpu-bind=closest ./ex2k -A HV15R.aij -mat_type aijkokkos
 31: */
 32: #include <petscmat.h>
 33: #include <petscdevice.h>

 35: #if defined(PETSC_HAVE_CUDA)
 36: #include <petscdevice_cuda.h>
 37:   #define SyncDevice() PetscCallCUDA(cudaDeviceSynchronize())
 38: #elif defined(PETSC_HAVE_HIP)
 39: #include <petscdevice_hip.h>
 40:   #define SyncDevice() PetscCallHIP(hipDeviceSynchronize())
 41: #elif defined(PETSC_HAVE_KOKKOS)
 42:   #include <Kokkos_Core.hpp>
 43:   #define SyncDevice() Kokkos::fence()
 44: #else
 45:   #define SyncDevice()
 46: #endif

 48: int main(int argc, char **args)
 49: {
 50:   Mat            A, P, C;
 51:   Mat            A2, P2, C2; /* Shadow matrices (of MATAIJ) of A,P,C for initialization and validation */
 52:   char           matTypeStr[64], prodTypeStr[32];
 53:   char           fileA[PETSC_MAX_PATH_LEN], fileP[PETSC_MAX_PATH_LEN];
 54:   PetscViewer    fdA, fdP;
 55:   PetscBool      flg, flgA, flgP, equal = PETSC_FALSE;
 56:   PetscLogStage  stage;
 57:   PetscInt       i, n = 100, nskip = 2, M, N;
 58:   MatInfo        info;
 59:   PetscLogDouble tstart = 0, tend = 0, avgTime;
 60:   PetscMPIInt    size;
 61:   MatProductType prodType;
 62:   PetscBool      isAP, isAtP, isAPt, isPtAP, isPAPt;

 64:   PetscFunctionBeginUser;
 65:   PetscCall(PetscInitialize(&argc, &args, nullptr, help));
 66:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));

 68:   /* Read options -n */
 69:   PetscCall(PetscOptionsGetInt(NULL, NULL, "-n", &n, NULL));

 71:   /* Load the matrix from a binary file */
 72:   PetscCall(PetscOptionsGetString(NULL, NULL, "-A", fileA, PETSC_MAX_PATH_LEN, &flgA));
 73:   PetscCall(PetscOptionsGetString(NULL, NULL, "-P", fileP, PETSC_MAX_PATH_LEN, &flgP));
 74:   PetscCheck(flgA, PETSC_COMM_WORLD, PETSC_ERR_USER_INPUT, "Must give a petsc matrix binary file with the -A option");

 76:   PetscCall(PetscOptionsGetString(NULL, NULL, "-mat_type", matTypeStr, sizeof(matTypeStr), &flg));
 77:   if (!flg) PetscCall(PetscStrncpy(matTypeStr, MATAIJ, sizeof(matTypeStr))); /* Inject the default if not provided */

 79:   PetscCall(PetscOptionsGetString(NULL, NULL, "-prod_type", prodTypeStr, sizeof(prodTypeStr), &flg));
 80:   if (!flg) PetscCall(PetscStrncpy(prodTypeStr, "AP", sizeof(prodTypeStr))); /* Inject the default if not provided */

 82:   PetscCall(PetscStrcmp(prodTypeStr, "AP", &isAP));
 83:   PetscCall(PetscStrcmp(prodTypeStr, "AtP", &isAtP));
 84:   PetscCall(PetscStrcmp(prodTypeStr, "APt", &isAPt));
 85:   PetscCall(PetscStrcmp(prodTypeStr, "PtAP", &isPtAP));
 86:   PetscCall(PetscStrcmp(prodTypeStr, "PAPt", &isPAPt));

 88:   if (isAP) prodType = MATPRODUCT_AB;
 89:   else if (isAtP) prodType = MATPRODUCT_AtB;
 90:   else if (isAPt) prodType = MATPRODUCT_ABt;
 91:   else if (isPtAP) prodType = MATPRODUCT_PtAP;
 92:   else if (isPAPt) prodType = MATPRODUCT_RARt;
 93:   else SETERRQ(PETSC_COMM_WORLD, PETSC_ERR_USER, "Unsupported product type %s", prodTypeStr);

 95:   /* Read the matrix file to A2 */
 96:   PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileA, FILE_MODE_READ, &fdA));
 97:   PetscCall(MatCreate(PETSC_COMM_WORLD, &A2));
 98:   PetscCall(MatSetType(A2, MATAIJ));
 99:   PetscCall(MatLoad(A2, fdA));
100:   PetscCall(PetscViewerDestroy(&fdA));

102:   PetscCall(MatGetSize(A2, &M, &N));
103:   PetscCall(MatGetInfo(A2, MAT_GLOBAL_SUM, &info));
104:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix A: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileA, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

106:   /* Copy A2 to A and convert A to the specified type */
107:   PetscCall(MatDuplicate(A2, MAT_COPY_VALUES, &A));
108:   PetscCall(MatConvert(A, matTypeStr, MAT_INPLACE_MATRIX, &A));

110:   /* Init P, P2 similarly */
111:   if (flgP) { /* If user provided P */
112:     PetscCall(PetscViewerBinaryOpen(PETSC_COMM_WORLD, fileP, FILE_MODE_READ, &fdP));
113:     PetscCall(MatCreate(PETSC_COMM_WORLD, &P2));
114:     PetscCall(MatSetType(P2, MATAIJ));
115:     PetscCall(MatLoad(P2, fdP));
116:     PetscCall(PetscViewerDestroy(&fdP));

118:     PetscCall(MatDuplicate(P2, MAT_COPY_VALUES, &P));
119:     PetscCall(MatConvert(P, matTypeStr, MAT_INPLACE_MATRIX, &P));

121:     PetscCall(MatGetSize(P2, &M, &N));
122:     PetscCall(MatGetInfo(P2, MAT_GLOBAL_SUM, &info));
123:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix P: %s, %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", fileP, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));
124:   } else { /* otherwise just let P = A */
125:     PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Input matrix P = A\n"));
126:     P2 = A2;
127:     P  = A;
128:   }

130:   /* Compute the reference C2 */
131:   PetscCall(MatProductCreate(A2, P2, NULL, &C2));
132:   PetscCall(MatProductSetType(C2, prodType));
133:   PetscCall(MatProductSetFill(C2, PETSC_DEFAULT));
134:   PetscCall(MatProductSetFromOptions(C2));
135:   PetscCall(MatProductSymbolic(C2));
136:   PetscCall(MatProductNumeric(C2));
137:   PetscCall(MatGetSize(C2, &M, &N));
138:   PetscCall(MatGetInfo(C2, MAT_GLOBAL_SUM, &info));
139:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "Mat product  C = %s: %" PetscInt_FMT " x %" PetscInt_FMT ", %lld nonzeros, %.1f per row\n", prodTypeStr, M, N, (long long)info.nz_used, (double)info.nz_used / (double)M));

141:   /* Compute C */
142:   PetscCall(MatProductCreate(A, P, NULL, &C));
143:   PetscCall(MatProductSetType(C, prodType));
144:   PetscCall(MatProductSetAlgorithm(C, MATPRODUCTALGORITHMBACKEND));
145:   PetscCall(MatProductSetFill(C, PETSC_DEFAULT));
146:   PetscCall(MatProductSetFromOptions(C));

148:   /* Measure  MatProductSymbolic */
149:   PetscCall(PetscLogStageRegister("MatProductSymbolic", &stage));
150:   PetscCall(PetscLogStagePush(stage));
151:   SyncDevice();
152:   PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
153:   PetscCall(PetscTime(&tstart));
154:   PetscCall(MatProductSymbolic(C));
155:   SyncDevice();
156:   PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
157:   PetscCall(PetscTime(&tend));
158:   avgTime = (tend - tstart) * 1e6; /* microseconds */
159:   PetscCall(PetscLogStagePop());
160:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "\nMatProductSymbolic()         time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

162:   /* Measure  MatProductNumeric */
163:   PetscCall(PetscLogStageRegister("MatProductNumeric", &stage));
164:   for (i = 0; i < n + nskip; i++) {
165:     if (i == nskip) {
166:       SyncDevice();
167:       PetscCall(PetscLogStagePush(stage));
168:       PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
169:       PetscCall(PetscTime(&tstart));
170:     }
171:     PetscCall(MatProductReplaceMats(A, P, NULL, C));
172:     PetscCall(MatProductNumeric(C));
173:   }
174:   SyncDevice();
175:   PetscCallMPI(MPI_Barrier(PETSC_COMM_WORLD));
176:   PetscCall(PetscTime(&tend));
177:   avgTime = (tend - tstart) * 1e6 / n; /* microseconds */
178:   PetscCall(PetscLogStagePop());

180:   PetscCall(MatMultEqual(C, C2, 8, &equal)); /* Not MatEqual() since C and C2 are not necessarily bitwise equal */

182:   PetscCheck(equal, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Matrix production error");
183:   PetscCall(PetscPrintf(PETSC_COMM_WORLD, "MatProductNumeric()  average time (us) with %d MPI ranks = %8.2f\n", size, avgTime));

185:   PetscCall(MatDestroy(&A));
186:   if (flgP) PetscCall(MatDestroy(&P));
187:   PetscCall(MatDestroy(&C));

189:   PetscCall(MatDestroy(&A2));
190:   if (flgP) PetscCall(MatDestroy(&P2));
191:   PetscCall(MatDestroy(&C2));

193:   PetscCall(PetscFinalize());
194:   return 0;
195: }

197: /*TEST

199:   testset:
200:     args: -n 2 -A ${DATAFILESPATH}/matrices/small
201:     nsize: 1
202:     filter: grep "DOES_NOT_EXIST"
203:     output_file: output/empty.out
204:     requires: datafilespath !complex double !defined(PETSC_USE_64BIT_INDICES) kokkos_kernels

206:     test:
207:       suffix: 1
208:       requires: cuda
209:       args: -mat_type aijcusparse

211:     test:
212:       suffix: 2
213:       args: -mat_type aijkokkos

215:     test:
216:       suffix: 3
217:       requires: hip
218:       args: -mat_type aijhipsparse

220: TEST*/