Actual source code: ex2.c

  1: static char help[] = "Test SF cuda stream synchronization in device to host communication\n\n";
/*
  SF uses asynchronous operations internally. When the destination data is on the GPU, SF performs
  asynchronous operations in the default stream and does not synchronize them, since it assumes the
  routines consuming the destination data also run on the default stream. However, when the
  destination data is on the CPU, SF must guarantee the data is ready to use on the CPU after
  PetscSFXxxEnd().
 */

  9: #include <petscvec.h>
 10: int main(int argc, char **argv)
 11: {
 12:   PetscInt           i, n = 100000; /* Big enough to make the asynchronous copy meaningful */
 13:   PetscScalar       *val;
 14:   const PetscScalar *yval;
 15:   Vec                x, y;
 16:   PetscMPIInt        size;
 17:   IS                 ix, iy;
 18:   VecScatter         vscat;

 20:   PetscFunctionBegin;
 21:   PetscFunctionBeginUser;
 22:   PetscCall(PetscInitialize(&argc, &argv, NULL, help));
 23:   PetscCallMPI(MPI_Comm_size(PETSC_COMM_WORLD, &size));
 24:   PetscCheck(size == 1, PETSC_COMM_WORLD, PETSC_ERR_WRONG_MPI_SIZE, "This is a uni-processor test");

 26:   /* Create two CUDA vectors x, y. Though we only care y's memory on host, we make y a CUDA vector,
 27:      since we want to have y's memory on host pinned (i.e.,non-pagable), to really trigger asynchronous
 28:      cudaMemcpyDeviceToHost.
 29:    */
 30:   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &x));
 31:   PetscCall(VecSetFromOptions(x));
 32:   PetscCall(VecCreateSeq(PETSC_COMM_WORLD, n, &y));
 33:   PetscCall(VecSetFromOptions(y));

 35:   /* Init x, y, and push them to GPU (their offloadmask = PETSC_OFFLOAD_GPU) */
 36:   PetscCall(VecGetArray(x, &val));
 37:   for (i = 0; i < n; i++) val[i] = i / 2.0;
 38:   PetscCall(VecRestoreArray(x, &val));
 39:   PetscCall(VecScale(x, 2.0));
 40:   PetscCall(VecSet(y, 314));

 42:   /* Pull y to CPU (make its offloadmask = PETSC_OFFLOAD_CPU) */
 43:   PetscCall(VecGetArray(y, &val));
 44:   PetscCall(VecRestoreArray(y, &val));

 46:   /* The vscat is simply a vector copy */
 47:   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &ix));
 48:   PetscCall(ISCreateStride(PETSC_COMM_SELF, n, 0, 1, &iy));
 49:   PetscCall(VecScatterCreate(x, ix, y, iy, &vscat));

 51:   /* Do device to host vecscatter and then immediately use y on host. VecScat/SF may use asynchronous
 52:      cudaMemcpy or kernels, but it must guarantee y is ready to use on host. Otherwise, wrong data will be displayed.
 53:    */
 54:   PetscCall(VecScatterBegin(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
 55:   PetscCall(VecScatterEnd(vscat, x, y, INSERT_VALUES, SCATTER_FORWARD));
 56:   PetscCall(VecGetArrayRead(y, &yval));
 57:   /* Display the first and the last entries of y to see if it is valid on host */
 58:   PetscCall(PetscPrintf(PETSC_COMM_SELF, "y[0]=%.f, y[%" PetscInt_FMT "] = %.f\n", (float)PetscRealPart(yval[0]), n - 1, (float)PetscRealPart(yval[n - 1])));
 59:   PetscCall(VecRestoreArrayRead(y, &yval));

 61:   PetscCall(VecDestroy(&x));
 62:   PetscCall(VecDestroy(&y));
 63:   PetscCall(ISDestroy(&ix));
 64:   PetscCall(ISDestroy(&iy));
 65:   PetscCall(VecScatterDestroy(&vscat));
 66:   PetscCall(PetscFinalize());
 67:   return 0;
 68: }

/*TEST

   test:
    requires: cuda
    diff_args: -j
    #make sure the host memory is pinned
    # sf_backend cuda is not needed if compiling only with cuda
    args: -vec_type cuda -sf_backend cuda -vec_pinned_memory_min 0

   test:
    suffix: hip
    requires: hip
    diff_args: -j
    output_file: output/ex2_1.out
    #make sure the host memory is pinned
    # sf_backend hip is not needed if compiling only with hip
    args: -vec_type hip -sf_backend hip -vec_pinned_memory_min 0

TEST*/