Actual source code: pbjacobi_cuda.cu
1: #include <petscdevice_cuda.h>
2: #include <petsc/private/petsclegacycupmblas.h>
3: #include <../src/ksp/pc/impls/pbjacobi/pbjacobi.h>
5: #if PETSC_PKG_CUDA_VERSION_LT(11, 7, 0)
6: __global__ static void MatMultBatched(PetscInt bs, PetscInt mbs, const PetscScalar *A, const PetscScalar *x, PetscScalar *y, PetscBool transpose)
7: {
8: const PetscInt gridSize = gridDim.x * blockDim.x;
9: PetscInt row = blockIdx.x * blockDim.x + threadIdx.x;
10: const PetscInt bs2 = bs * bs;
12: /* One row per thread. The blocks are stored in column-major order */
13: for (; row < bs * mbs; row += gridSize) {
14: const PetscScalar *Ap, *xp;
15: PetscScalar *yp;
16: PetscInt i, j, k;
18: k = row / bs; /* k-th block */
19: i = row % bs; /* this thread deals with i-th row of the block */
20: Ap = &A[bs2 * k + i * (transpose ? bs : 1)]; /* Ap points to the first entry of i-th row */
21: xp = &x[bs * k];
22: yp = &y[bs * k];
23: /* multiply i-th row (column) with x */
24: yp[i] = 0.0;
25: for (j = 0; j < bs; j++) {
26: yp[i] += Ap[0] * xp[j];
27: Ap += (transpose ? 1 : bs); /* block is in column major order */
28: }
29: }
30: }
31: #endif
33: static PetscErrorCode PCApplyOrTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y, cublasOperation_t op)
34: {
35: const PetscScalar *xx;
36: PetscScalar *yy;
37: cublasHandle_t handle;
38: PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
39: const PetscScalar *A = (const PetscScalar *)jac->spptr;
40: const PetscInt bs = jac->bs, mbs = jac->mbs;
42: PetscFunctionBegin;
43: PetscCall(VecCUDAGetArrayRead(x, &xx));
44: PetscCall(VecCUDAGetArrayWrite(y, &yy));
45: PetscCall(PetscCUBLASGetHandle(&handle));
46: PetscCallCUBLAS(cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST)); /* alpha, beta are on host */
48: #if PETSC_PKG_CUDA_VERSION_GE(11, 7, 0)
49: /* y = alpha op(A) x + beta y */
50: const PetscScalar alpha = 1.0, beta = 0.0;
51: PetscCallCUBLAS(cublasXgemvStridedBatched(handle, op, bs, bs, &alpha, A, bs, bs * bs, xx, 1, bs, &beta, yy, 1, bs, mbs));
52: #else
53: PetscInt gridSize = PetscMin((bs * mbs + 255) / 256, 2147483647); /* <= 2^31-1 */
54: MatMultBatched<<<gridSize, 256>>>(bs, mbs, A, xx, yy, op == CUBLAS_OP_T ? PETSC_TRUE : PETSC_FALSE);
55: PetscCallCUDA(cudaGetLastError());
56: #endif
57: PetscCall(VecCUDARestoreArrayRead(x, &xx));
58: PetscCall(VecCUDARestoreArrayWrite(y, &yy));
59: PetscCall(PetscLogGpuFlops(bs * bs * mbs * 2));
60: PetscFunctionReturn(PETSC_SUCCESS);
61: }
63: static PetscErrorCode PCApply_PBJacobi_CUDA(PC pc, Vec x, Vec y)
64: {
65: PetscFunctionBegin;
66: PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_N)); // No transpose
67: PetscFunctionReturn(PETSC_SUCCESS);
68: }
70: static PetscErrorCode PCApplyTranspose_PBJacobi_CUDA(PC pc, Vec x, Vec y)
71: {
72: PetscFunctionBegin;
73: PetscCall(PCApplyOrTranspose_PBJacobi_CUDA(pc, x, y, CUBLAS_OP_T)); // Transpose
74: PetscFunctionReturn(PETSC_SUCCESS);
75: }
77: static PetscErrorCode PCDestroy_PBJacobi_CUDA(PC pc)
78: {
79: PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
81: PetscFunctionBegin;
82: PetscCallCUDA(cudaFree(jac->spptr));
83: PetscCall(PCDestroy_PBJacobi(pc));
84: PetscFunctionReturn(PETSC_SUCCESS);
85: }
87: PETSC_INTERN PetscErrorCode PCSetUp_PBJacobi_CUDA(PC pc, Mat diagVB)
88: {
89: PC_PBJacobi *jac = (PC_PBJacobi *)pc->data;
90: size_t size;
92: PetscFunctionBegin;
93: PetscCall(PCSetUp_PBJacobi_Host(pc, diagVB)); /* Compute the inverse on host now. Might worth doing it on device directly */
94: size = sizeof(PetscScalar) * jac->bs * jac->bs * jac->mbs;
96: /* PBJacobi_CUDA is simple so that we use jac->spptr as if it is diag_d */
97: if (!jac->spptr) PetscCallCUDAVoid(cudaMalloc(&jac->spptr, size));
98: PetscCallCUDAVoid(cudaMemcpy(jac->spptr, jac->diag, size, cudaMemcpyHostToDevice));
99: PetscCall(PetscLogCpuToGpu(size));
101: pc->ops->apply = PCApply_PBJacobi_CUDA;
102: pc->ops->applytranspose = PCApplyTranspose_PBJacobi_CUDA;
103: pc->ops->destroy = PCDestroy_PBJacobi_CUDA;
104: PetscFunctionReturn(PETSC_SUCCESS);
105: }