Actual source code: matseqdensecupm.hpp

  1: #pragma once

  3: #include <petsc/private/matdensecupmimpl.h>
  4: #include <../src/mat/impls/dense/seq/dense.h>

  6: #include <petsc/private/deviceimpl.h>
  7: #include <petsc/private/randomimpl.h>
  8: #include <petsc/private/vecimpl.h>
  9: #include <petsc/private/cupmobject.hpp>
 10: #include <petsc/private/cupmsolverinterface.hpp>

 12: #include <petsc/private/cpp/type_traits.hpp>
 13: #include <petsc/private/cpp/utility.hpp>

 15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>

 17: namespace Petsc
 18: {

 20: namespace mat
 21: {

 23: namespace cupm
 24: {

 26: namespace impl
 27: {

 29: template <device::cupm::DeviceType T>
 30: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
 31: public:
 32:   MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);

 34: private:
 35:   struct Mat_SeqDenseCUPM {
 36:     PetscScalar *d_v;           // pointer to the matrix on the GPU
 37:     PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
 38:     bool         d_user_alloc;
 39:     bool         d_unplaced_user_alloc;
 40:     // factorization support
 41:     cupmBlasInt_t *d_fact_ipiv;  // device pivots
 42:     cupmScalar_t  *d_fact_tau;   // device QR tau vector
 43:     cupmBlasInt_t *d_fact_info;  // device info
 44:     cupmScalar_t  *d_fact_work;  // device workspace
 45:     cupmBlasInt_t  d_fact_lwork; // size of device workspace
 46:     // workspace
 47:     Vec workvec;
 48:   };

 50:   static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;

 52:   static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
 53:   static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;

 55:   static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;

 57:   template <typename Derived>
 58:   struct SolveCommon;
 59:   struct SolveQR;
 60:   struct SolveCholesky;
 61:   struct SolveLU;

 63:   template <typename Solver, bool transpose>
 64:   static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
 65:   template <typename Solver, bool transpose>
 66:   static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
 67:   template <bool transpose, bool hermitian>
 68:   static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
 69:   template <bool transpose, bool hermitian>
 70:   static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
 71:   template <bool transpose, bool hermitian>
 72:   static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;

 74:   template <bool to_host>
 75:   static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;

 77:   PETSC_NODISCARD static constexpr MatType       MATIMPLCUPM_() noexcept;
 78:   PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;

 80: public:
 81:   PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;

 83:   // define these by hand since they don't fit the above mold
 84:   PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
 85:   PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;

 87:   static PetscErrorCode Create(Mat) noexcept;
 88:   static PetscErrorCode Destroy(Mat) noexcept;
 89:   static PetscErrorCode SetUp(Mat) noexcept;
 90:   static PetscErrorCode Reset(Mat) noexcept;

 92:   static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
 93:   static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
 94:   static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;

 96:   template <PetscMemType, PetscMemoryAccessMode>
 97:   static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
 98:   template <PetscMemType, PetscMemoryAccessMode>
 99:   static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100:   template <PetscMemoryAccessMode>
101:   static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
102:   template <PetscMemoryAccessMode>
103:   static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;

105: private:
106:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
107:   static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
108:   {
109:     PetscDeviceContext dctx;

111:     PetscFunctionBegin;
112:     PetscCall(GetHandles_(&dctx));
113:     PetscCall(GetArray<mtype, mode>(m, p, dctx));
114:     PetscFunctionReturn(PETSC_SUCCESS);
115:   }

117:   template <PetscMemType mtype, PetscMemoryAccessMode mode>
118:   static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
119:   {
120:     PetscDeviceContext dctx;

122:     PetscFunctionBegin;
123:     PetscCall(GetHandles_(&dctx));
124:     PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
125:     PetscFunctionReturn(PETSC_SUCCESS);
126:   }

128:   template <PetscMemoryAccessMode mode>
129:   static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
130:   {
131:     PetscDeviceContext dctx;

133:     PetscFunctionBegin;
134:     PetscCall(GetHandles_(&dctx));
135:     PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
136:     PetscFunctionReturn(PETSC_SUCCESS);
137:   }

139:   template <PetscMemoryAccessMode mode>
140:   static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
141:   {
142:     PetscDeviceContext dctx;

144:     PetscFunctionBegin;
145:     PetscCall(GetHandles_(&dctx));
146:     PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
147:     PetscFunctionReturn(PETSC_SUCCESS);
148:   }

150: public:
151:   static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
152:   static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
153:   static PetscErrorCode ResetArray(Mat) noexcept;

155:   template <bool transpose_A, bool transpose_B>
156:   static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
157:   static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
158:   static PetscErrorCode ZeroEntries(Mat) noexcept;
159:   static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
160:   static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
161:   static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
162:   static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;

164:   static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
165:   template <PetscMemoryAccessMode>
166:   static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
167:   template <PetscMemoryAccessMode>
168:   static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;

170:   static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
171:   static PetscErrorCode InvertFactors(Mat) noexcept;

173:   static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
174:   static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
175: };

177: } // namespace impl

179: namespace
180: {

182: // Declare this here so that the functions below can make use of it
183: template <device::cupm::DeviceType T>
184: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
185: {
186:   PetscFunctionBegin;
187:   PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
188:   PetscFunctionReturn(PETSC_SUCCESS);
189: }

191: } // anonymous namespace

193: namespace impl
194: {

196: // ==========================================================================================
197: // MatDense_Seq_CUPM - Private API - Utility
198: // ==========================================================================================

200: template <device::cupm::DeviceType T>
201: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
202: {
203:   const auto   mcu   = MatCUPMCast(m);
204:   const auto   nrows = m->rmap->n;
205:   const auto   ncols = m->cmap->n;
206:   auto        &lda   = MatIMPLCast(m)->lda;
207:   cupmStream_t stream;

209:   PetscFunctionBegin;
210:   PetscCheckTypeName(m, MATSEQDENSECUPM());
212:   PetscCall(checkCupmBlasIntCast(nrows));
213:   PetscCall(checkCupmBlasIntCast(ncols));
214:   PetscCall(GetHandlesFrom_(dctx, &stream));
215:   if (lda <= 0) lda = nrows;
216:   if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
217:   if (user_device_array) {
218:     mcu->d_user_alloc = PETSC_TRUE;
219:     mcu->d_v          = user_device_array;
220:   } else {
221:     std::size_t size;

223:     mcu->d_user_alloc = PETSC_FALSE;
224:     size              = lda * ncols;
225:     PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
226:     PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
227:   }
228:   m->offloadmask = PETSC_OFFLOAD_GPU;
229:   PetscFunctionReturn(PETSC_SUCCESS);
230: }

232: template <device::cupm::DeviceType T>
233: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
234: {
235:   const auto nrows = m->rmap->n;
236:   const auto ncols = m->cmap->n;
237:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;

239:   PetscFunctionBegin;
240:   PetscCheckTypeName(m, MATSEQDENSECUPM());
241:   if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
242:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
243:   if (copy) {
244:     const auto   mcu = MatCUPMCast(m);
245:     cupmStream_t stream;

247:     // Allocate GPU memory if not present
248:     if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
249:     PetscCall(GetHandlesFrom_(dctx, &stream));
250:     PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
251:     {
252:       const auto mimpl = MatIMPLCast(m);
253:       const auto lda   = mimpl->lda;
254:       const auto src   = mimpl->v;
255:       const auto dest  = mcu->d_v;

257:       if (lda > nrows) {
258:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
259:       } else {
260:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
261:       }
262:     }
263:     PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
264:     // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
265:     m->offloadmask = PETSC_OFFLOAD_BOTH;
266:   }
267:   PetscFunctionReturn(PETSC_SUCCESS);
268: }

270: template <device::cupm::DeviceType T>
271: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
272: {
273:   const auto nrows = m->rmap->n;
274:   const auto ncols = m->cmap->n;
275:   const auto copy  = m->offloadmask == PETSC_OFFLOAD_GPU;

277:   PetscFunctionBegin;
278:   PetscCheckTypeName(m, MATSEQDENSECUPM());
279:   PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
280:   if (copy) {
281:     const auto   mimpl = MatIMPLCast(m);
282:     cupmStream_t stream;

284:     // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
285:     if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
286:     PetscCall(GetHandlesFrom_(dctx, &stream));
287:     PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
288:     {
289:       const auto lda  = mimpl->lda;
290:       const auto dest = mimpl->v;
291:       const auto src  = MatCUPMCast(m)->d_v;

293:       if (lda > nrows) {
294:         PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
295:       } else {
296:         PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
297:       }
298:     }
299:     PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
300:     // order is important, MatSeqDenseSetPreallocation() might set offloadmask
301:     m->offloadmask = PETSC_OFFLOAD_BOTH;
302:   }
303:   PetscFunctionReturn(PETSC_SUCCESS);
304: }

306: template <device::cupm::DeviceType T>
307: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
308: {
309:   PetscFunctionBegin;
310:   if (PetscDefined(USE_DEBUG)) {
311:     cupmBlasInt_t info = 0;

313:     PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
314:     if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
315:     static_assert(std::is_same<decltype(info), int>::value, "");
316:     PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
317:     PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
318:   }
319:   PetscFunctionReturn(PETSC_SUCCESS);
320: }

322: // ==========================================================================================
323: // MatDense_Seq_CUPM - Private API - Solver Dispatch
324: // ==========================================================================================

326: // specific solvers called through the dispatch_() family of functions
327: template <device::cupm::DeviceType T>
328: template <typename Derived>
329: struct MatDense_Seq_CUPM<T>::SolveCommon {
330:   using derived_type = Derived;

332:   template <typename F>
333:   static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
334:   {
335:     cupmBlasInt_t lwork;

337:     PetscFunctionBegin;
338:     PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
339:     if (lwork > mcu->d_fact_lwork) {
340:       mcu->d_fact_lwork = lwork;
341:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
342:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
343:     }
344:     PetscFunctionReturn(PETSC_SUCCESS);
345:   }

347:   static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
348:   {
349:     const auto mcu = MatCUPMCast(A);

351:     PetscFunctionBegin;
352:     PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
353:     A->factortype             = derived_type::MATFACTORTYPE();
354:     A->ops->solve             = MatSolve_Factored_Dispatch_<derived_type, false>;
355:     A->ops->solvetranspose    = MatSolve_Factored_Dispatch_<derived_type, true>;
356:     A->ops->matsolve          = MatMatSolve_Factored_Dispatch_<derived_type, false>;
357:     A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;

359:     PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
360:     if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
361:     PetscFunctionReturn(PETSC_SUCCESS);
362:   }
363: };

365: template <device::cupm::DeviceType T>
366: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
367:   using base_type = SolveCommon<SolveLU>;

369:   static constexpr const char   *NAME() noexcept { return "LU"; }
370:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }

372:   static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
373:   {
374:     const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
375:     const auto         n = static_cast<cupmBlasInt_t>(A->cmap->n);
376:     cupmStream_t       stream;
377:     cupmSolverHandle_t handle;
378:     PetscDeviceContext dctx;

380:     PetscFunctionBegin;
381:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
382:     PetscCall(GetHandles_(&dctx, &handle, &stream));
383:     PetscCall(base_type::FactorPrepare(A, stream));
384:     {
385:       const auto mcu = MatCUPMCast(A);
386:       const auto da  = DeviceArrayReadWrite(dctx, A);
387:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

389:       // clang-format off
390:       PetscCall(
391:         base_type::ResizeFactLwork(
392:           mcu, stream,
393:           [&](cupmBlasInt_t *fact_lwork)
394:           {
395:             return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
396:           }
397:         )
398:       );
399:       // clang-format on
400:       if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));

402:       PetscCall(PetscLogGpuTimeBegin());
403:       PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
404:       PetscCall(PetscLogGpuTimeEnd());
405:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
406:     }
407:     PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
408:     PetscFunctionReturn(PETSC_SUCCESS);
409:   }

411:   template <bool transpose>
412:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
413:   {
414:     const auto         mcu       = MatCUPMCast(A);
415:     const auto         fact_info = mcu->d_fact_info;
416:     const auto         fact_ipiv = mcu->d_fact_ipiv;
417:     cupmSolverHandle_t handle;

419:     PetscFunctionBegin;
420:     PetscCall(GetHandlesFrom_(dctx, &handle));
421:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
422:     PetscCall(PetscLogGpuTimeBegin());
423:     {
424:       constexpr auto op  = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
425:       const auto     da  = DeviceArrayRead(dctx, A);
426:       const auto     lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

428:       // clang-format off
429:       PetscCall(
430:         base_type::ResizeFactLwork(
431:           mcu, stream,
432:           [&](cupmBlasInt_t *lwork)
433:           {
434:             return cupmSolverXgetrs_bufferSize(
435:               handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
436:             );
437:           }
438:         )
439:       );
440:       // clang-format on
441:       PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
442:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
443:     }
444:     PetscCall(PetscLogGpuTimeEnd());
445:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
446:     PetscFunctionReturn(PETSC_SUCCESS);
447:   }
448: };

450: template <device::cupm::DeviceType T>
451: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
452:   using base_type = SolveCommon<SolveCholesky>;

454:   static constexpr const char   *NAME() noexcept { return "Cholesky"; }
455:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }

457:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
458:   {
459:     const auto         n = static_cast<cupmBlasInt_t>(A->rmap->n);
460:     PetscDeviceContext dctx;
461:     cupmSolverHandle_t handle;
462:     cupmStream_t       stream;

464:     PetscFunctionBegin;
465:     if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
466:     PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
467:     PetscCall(GetHandles_(&dctx, &handle, &stream));
468:     PetscCall(base_type::FactorPrepare(A, stream));
469:     {
470:       const auto mcu = MatCUPMCast(A);
471:       const auto da  = DeviceArrayReadWrite(dctx, A);
472:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

474:       // clang-format off
475:       PetscCall(
476:         base_type::ResizeFactLwork(
477:           mcu, stream,
478:           [&](cupmBlasInt_t *fact_lwork)
479:           {
480:             return cupmSolverXpotrf_bufferSize(
481:               handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
482:             );
483:           }
484:         )
485:       );
486:       // clang-format on
487:       PetscCall(PetscLogGpuTimeBegin());
488:       PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
489:       PetscCall(PetscLogGpuTimeEnd());
490:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
491:     }
492:     PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

494: #if 0
495:     // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
496:     // and *hetr* routines. The code below should work, and it can be activated when *sytrs
497:     // routines will be available
498:     if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
499:     if (!mcu->d_fact_lwork) {
500:       PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
501:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
502:     }
503:     if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
504:     PetscCall(PetscLogGpuTimeBegin());
505:     PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
506:     PetscCall(PetscLogGpuTimeEnd());
507: #endif
508:     PetscFunctionReturn(PETSC_SUCCESS);
509:   }

511:   template <bool transpose>
512:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
513:   {
514:     const auto         mcu       = MatCUPMCast(A);
515:     const auto         fact_info = mcu->d_fact_info;
516:     cupmSolverHandle_t handle;

518:     PetscFunctionBegin;
519:     PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
520:     PetscCall(GetHandlesFrom_(dctx, &handle));
521:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
522:     PetscCall(PetscLogGpuTimeBegin());
523:     {
524:       const auto da  = DeviceArrayRead(dctx, A);
525:       const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);

527:       // clang-format off
528:       PetscCall(
529:         base_type::ResizeFactLwork(
530:           mcu, stream,
531:           [&](cupmBlasInt_t *lwork)
532:           {
533:             return cupmSolverXpotrs_bufferSize(
534:               handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
535:             );
536:           }
537:         )
538:       );
539:       // clang-format on
540:       PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
541:       PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
542:     }
543:     PetscCall(PetscLogGpuTimeEnd());
544:     PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
545:     PetscFunctionReturn(PETSC_SUCCESS);
546:   }
547: };

549: template <device::cupm::DeviceType T>
550: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
551:   using base_type = SolveCommon<SolveQR>;

553:   static constexpr const char   *NAME() noexcept { return "QR"; }
554:   static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }

556:   static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
557:   {
558:     const auto         m     = static_cast<cupmBlasInt_t>(A->rmap->n);
559:     const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
560:     const auto         min   = std::min(m, n);
561:     const auto         mimpl = MatIMPLCast(A);
562:     cupmStream_t       stream;
563:     cupmSolverHandle_t handle;
564:     PetscDeviceContext dctx;

566:     PetscFunctionBegin;
567:     if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
568:     PetscCall(GetHandles_(&dctx, &handle, &stream));
569:     PetscCall(base_type::FactorPrepare(A, stream));
570:     mimpl->rank = min;
571:     {
572:       const auto mcu = MatCUPMCast(A);
573:       const auto da  = DeviceArrayReadWrite(dctx, A);
574:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

576:       if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
577:       if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
578:       // clang-format off
579:       PetscCall(
580:         base_type::ResizeFactLwork(
581:           mcu, stream,
582:           [&](cupmBlasInt_t *fact_lwork)
583:           {
584:             return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
585:           }
586:         )
587:       );
588:       // clang-format on
589:       PetscCall(PetscLogGpuTimeBegin());
590:       PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
591:       PetscCall(PetscLogGpuTimeEnd());
592:       PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
593:     }
594:     PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
595:     PetscFunctionReturn(PETSC_SUCCESS);
596:   }

598:   template <bool transpose>
599:   static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
600:   {
601:     const auto         mimpl      = MatIMPLCast(A);
602:     const auto         rank       = static_cast<cupmBlasInt_t>(mimpl->rank);
603:     const auto         mcu        = MatCUPMCast(A);
604:     const auto         fact_info  = mcu->d_fact_info;
605:     const auto         fact_tau   = mcu->d_fact_tau;
606:     const auto         fact_work  = mcu->d_fact_work;
607:     const auto         fact_lwork = mcu->d_fact_lwork;
608:     cupmSolverHandle_t solver_handle;
609:     cupmBlasHandle_t   blas_handle;

611:     PetscFunctionBegin;
612:     PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
613:     PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
614:     PetscCall(PetscLogGpuTimeBegin());
615:     {
616:       const auto da  = DeviceArrayRead(dctx, A);
617:       const auto one = cupmScalarCast(1.0);
618:       const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);

620:       if (transpose) {
621:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
622:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
623:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
624:       } else {
625:         constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;

627:         PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
628:         PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
629:         PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
630:       }
631:     }
632:     PetscCall(PetscLogGpuTimeEnd());
633:     PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
634:     PetscFunctionReturn(PETSC_SUCCESS);
635:   }
636: };

638: template <device::cupm::DeviceType T>
639: template <typename Solver, bool transpose>
640: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
641: {
642:   using namespace vec::cupm;
643:   const auto         pobj_A  = PetscObjectCast(A);
644:   const auto         m       = static_cast<cupmBlasInt_t>(A->rmap->n);
645:   const auto         k       = static_cast<cupmBlasInt_t>(A->cmap->n);
646:   auto              &workvec = MatCUPMCast(A)->workvec;
647:   PetscScalar       *y_array = nullptr;
648:   PetscDeviceContext dctx;
649:   PetscBool          xiscupm, yiscupm, aiscupm;
650:   bool               use_y_array_directly;
651:   cupmStream_t       stream;

653:   PetscFunctionBegin;
654:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
655:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
656:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
657:   PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
658:   PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
659:   PetscCall(GetHandles_(&dctx, &stream));
660:   use_y_array_directly = yiscupm && (k >= m);
661:   {
662:     const PetscScalar *x_array;
663:     const auto         xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
664:     const auto         copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;

666:     if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
667:     // The logic here is to try to minimize the amount of memory copying:
668:     //
669:     // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
670:     // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
671:     // data in order to copy it into the y array. So the array x will be wherever the data
672:     // already is so that only one memcpy is performed
673:     if (xisdevice) {
674:       PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
675:     } else {
676:       PetscCall(VecGetArrayRead(x, &x_array));
677:     }
678:     PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
679:     PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
680:     if (xisdevice) {
681:       PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
682:     } else {
683:       PetscCall(VecRestoreArrayRead(x, &x_array));
684:     }
685:   }

687:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
688:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
689:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));

691:   if (use_y_array_directly) {
692:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
693:   } else {
694:     const auto   copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
695:     PetscScalar *yv;

697:     // The logic here is that the data is not yet in either y's GPU array or its CPU array.
698:     // There is nothing in the interface to say where the user would like it to end up. So we
699:     // choose the GPU, because it is the faster option
700:     if (yiscupm) {
701:       PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
702:     } else {
703:       PetscCall(VecGetArray(y, &yv));
704:     }
705:     PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
706:     if (yiscupm) {
707:       PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
708:     } else {
709:       PetscCall(VecRestoreArray(y, &yv));
710:     }
711:     PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
712:   }
713:   PetscFunctionReturn(PETSC_SUCCESS);
714: }

716: template <device::cupm::DeviceType T>
717: template <typename Solver, bool transpose>
718: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
719: {
720:   const auto         m = static_cast<cupmBlasInt_t>(A->rmap->n);
721:   const auto         k = static_cast<cupmBlasInt_t>(A->cmap->n);
722:   cupmBlasInt_t      nrhs, ldb, ldx, ldy;
723:   PetscScalar       *y;
724:   PetscBool          biscupm, xiscupm, aiscupm;
725:   PetscDeviceContext dctx;
726:   cupmStream_t       stream;

728:   PetscFunctionBegin;
729:   PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
730:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
731:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
732:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
733:   PetscCall(GetHandles_(&dctx, &stream));
734:   {
735:     PetscInt n;

737:     PetscCall(MatGetSize(B, nullptr, &n));
738:     PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
739:     PetscCall(MatDenseGetLDA(B, &n));
740:     PetscCall(PetscCUPMBlasIntCast(n, &ldb));
741:     PetscCall(MatDenseGetLDA(X, &n));
742:     PetscCall(PetscCUPMBlasIntCast(n, &ldx));
743:   }
744:   {
745:     // The logic here is to try to minimize the amount of memory copying:
746:     //
747:     // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
748:     // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
749:     // get the data in order to copy it into the y array. So the array b will be wherever the
750:     // data already is so that only one memcpy is performed
751:     const auto         bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
752:     const auto         copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
753:     const PetscScalar *b;

755:     if (bisdevice) {
756:       b = DeviceArrayRead(dctx, B);
757:     } else if (biscupm) {
758:       b = HostArrayRead(dctx, B);
759:     } else {
760:       PetscCall(MatDenseGetArrayRead(B, &b));
761:     }

763:     if (ldx < m || !xiscupm) {
764:       // X's array cannot serve as the array (too small or not on device), B's array cannot
765:       // serve as the array (const), so allocate a new array
766:       ldy = m;
767:       PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
768:     } else {
769:       // X's array should serve as the array
770:       ldy = ldx;
771:       y   = DeviceArrayWrite(dctx, X);
772:     }
773:     PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
774:     if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
775:   }

777:   // convert to CUPM twice??????????????????????????????????
778:   // but A should already be CUPM??????????????????????????????????????
779:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
780:   PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
781:   if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));

783:   if (ldx < m || !xiscupm) {
784:     const auto   copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
785:     PetscScalar *x;

787:     // The logic here is that the data is not yet in either X's GPU array or its CPU
788:     // array. There is nothing in the interface to say where the user would like it to end up.
789:     // So we choose the GPU, because it is the faster option
790:     if (xiscupm) {
791:       x = DeviceArrayWrite(dctx, X);
792:     } else {
793:       PetscCall(MatDenseGetArray(X, &x));
794:     }
795:     PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
796:     if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
797:     PetscCallCUPM(cupmFreeAsync(y, stream));
798:   }
799:   PetscFunctionReturn(PETSC_SUCCESS);
800: }

802: template <device::cupm::DeviceType T>
803: template <bool transpose, bool hermitian>
804: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
805: {
806:   const auto         m   = static_cast<cupmBlasInt_t>(A->rmap->n);
807:   const auto         n   = static_cast<cupmBlasInt_t>(c_end - c_start);
808:   const auto         lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
809:   cupmBlasHandle_t   handle;
810:   PetscDeviceContext dctx;

812:   PetscFunctionBegin;
813:   if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
814:   if (!m || !n) {
815:     // mult only
816:     if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
817:     PetscFunctionReturn(PETSC_SUCCESS);
818:   }
819:   PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
820:   PetscCall(GetHandles_(&dctx, &handle));
821:   {
822:     constexpr auto op   = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
823:     const auto     one  = cupmScalarCast(1.0);
824:     const auto     zero = cupmScalarCast(0.0);
825:     const auto     da   = DeviceArrayRead(dctx, A);
826:     const auto     dxx  = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
827:     const auto     dzz  = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);

829:     PetscCall(PetscLogGpuTimeBegin());
830:     PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, yy ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
831:     PetscCall(PetscLogGpuTimeEnd());
832:   }
833:   PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
834:   PetscFunctionReturn(PETSC_SUCCESS);
835: }

837: template <device::cupm::DeviceType T>
838: template <bool transpose, bool hermitian>
839: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
840: {
841:   PetscFunctionBegin;
842:   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
843:   PetscFunctionReturn(PETSC_SUCCESS);
844: }

846: template <device::cupm::DeviceType T>
847: template <bool transpose, bool hermitian>
848: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
849: {
850:   PetscFunctionBegin;
851:   PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
852:   PetscFunctionReturn(PETSC_SUCCESS);
853: }

855: // ==========================================================================================
856: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
857: // ==========================================================================================

859: template <device::cupm::DeviceType T>
860: template <bool to_host>
861: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
862: {
863:   PetscFunctionBegin;
864:   if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
865:     // TODO these cases should be optimized
866:     PetscCall(MatConvert_Basic(M, type, reuse, newmat));
867:   } else {
868:     const auto B    = *newmat;
869:     const auto pobj = PetscObjectCast(B);

871:     if (to_host) {
872:       PetscCall(BindToCPU(B, PETSC_TRUE));
873:       PetscCall(Reset(B));
874:     } else {
875:       PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
876:     }

878:     PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
879:     PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
880:     // cvec might be the wrong VecType, destroy and rebuild it if necessary
881:     // REVIEW ME: this is possibly very inefficient
882:     PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));

884:     MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
885:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
886:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
887:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
888:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
889:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
890:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
891:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
892:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
893:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
894:     MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
895:     MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);

897:     if (to_host) {
898:       B->offloadmask = PETSC_OFFLOAD_CPU;
899:     } else {
900:       Mat_SeqDenseCUPM *mcu;

902:       PetscCall(PetscNew(&mcu));
903:       B->spptr       = mcu;
904:       B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
905:       PetscCall(BindToCPU(B, PETSC_FALSE));
906:     }

908:     MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
909:     MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
910:   }
911:   PetscFunctionReturn(PETSC_SUCCESS);
912: }

914: // ==========================================================================================
915: // MatDense_Seq_CUPM - Public API
916: // ==========================================================================================

918: template <device::cupm::DeviceType T>
919: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
920: {
921:   return MATSEQDENSECUPM();
922: }

924: template <device::cupm::DeviceType T>
925: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
926: {
927:   return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
928: }

930: template <device::cupm::DeviceType T>
931: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
932: {
933:   return static_cast<Mat_SeqDense *>(m->data);
934: }

936: template <device::cupm::DeviceType T>
937: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
938: {
939:   return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
940: }

942: template <device::cupm::DeviceType T>
943: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
944: {
945:   return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
946: }

948: // ==========================================================================================

950: // MatCreate_SeqDenseCUPM()
951: template <device::cupm::DeviceType T>
952: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
953: {
954:   PetscFunctionBegin;
955:   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
956:   PetscCall(MatCreate_SeqDense(A));
957:   PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
958:   PetscFunctionReturn(PETSC_SUCCESS);
959: }

961: template <device::cupm::DeviceType T>
962: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
963: {
964:   PetscFunctionBegin;
965:   // prevent copying back data if we own the data pointer
966:   if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
967:   PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
968:   PetscCall(MatDestroy_SeqDense(A));
969:   PetscFunctionReturn(PETSC_SUCCESS);
970: }

972: // obj->ops->setup()
973: template <device::cupm::DeviceType T>
974: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
975: {
976:   PetscFunctionBegin;
977:   PetscCall(PetscLayoutSetUp(A->rmap));
978:   PetscCall(PetscLayoutSetUp(A->cmap));
979:   if (!A->preallocated) {
980:     PetscDeviceContext dctx;

982:     PetscCall(GetHandles_(&dctx));
983:     PetscCall(SetPreallocation(A, dctx, nullptr));
984:   }
985:   PetscFunctionReturn(PETSC_SUCCESS);
986: }

988: template <device::cupm::DeviceType T>
989: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
990: {
991:   PetscFunctionBegin;
992:   if (const auto mcu = MatCUPMCast(A)) {
993:     cupmStream_t stream;

995:     PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
996:     PetscCall(GetHandles_(&stream));
997:     if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
998:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
999:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
1000:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
1001:     PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1002:     PetscCall(VecDestroy(&mcu->workvec));
1003:     PetscCall(PetscFree(A->spptr /* mcu */));
1004:   }
1005:   PetscFunctionReturn(PETSC_SUCCESS);
1006: }

1008: // ==========================================================================================

1010: template <device::cupm::DeviceType T>
1011: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
1012: {
1013:   const auto mimpl = MatIMPLCast(A);
1014:   const auto pobj  = PetscObjectCast(A);

1016:   PetscFunctionBegin;
1017:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1018:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1019:   A->boundtocpu = to_host;
1020:   PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1021:   if (to_host) {
1022:     PetscDeviceContext dctx;

1024:     // make sure we have an up-to-date copy on the CPU
1025:     PetscCall(GetHandles_(&dctx));
1026:     PetscCall(DeviceToHost_(A, dctx));
1027:   } else {
1028:     PetscBool iscupm;

1030:     if (auto &cvec = mimpl->cvec) {
1031:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1032:       if (!iscupm) PetscCall(VecDestroy(&cvec));
1033:     }
1034:     if (auto &cmat = mimpl->cmat) {
1035:       PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1036:       if (!iscupm) PetscCall(MatDestroy(&cmat));
1037:     }
1038:   }

1040:   // ============================================================
1041:   // Composed ops
1042:   // ============================================================
1043:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1044:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1045:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1046:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1047:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1048:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1049:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1050:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1051:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1052:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1053:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1054:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1055:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1056:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1057:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1058:   MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1059:   MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1060:   MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1061:   MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1062:   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1063:   MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1064:   // always the same
1065:   PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));

1067:   // ============================================================
1068:   // Function pointer ops
1069:   // ============================================================
1070:   MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1071:   MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
1072:   MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
1073:   MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
1074:   MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
1075:   MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
1076:   MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
1077:   MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1078:   MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1079:   MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1080:   MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1081:   MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1082:   MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1083:   MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1084:   MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1085:   MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1086:   MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1087:   MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1088:   MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1089:   MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1090:   MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1091:   // seemingly always the same
1092:   A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;

1094:   if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1095:   PetscFunctionReturn(PETSC_SUCCESS);
1096: }

1098: template <device::cupm::DeviceType T>
1099: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1100: {
1101:   PetscFunctionBegin;
1102:   PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1103:   PetscFunctionReturn(PETSC_SUCCESS);
1104: }

1106: template <device::cupm::DeviceType T>
1107: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1108: {
1109:   PetscFunctionBegin;
1110:   PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1111:   PetscFunctionReturn(PETSC_SUCCESS);
1112: }

1114: // ==========================================================================================

1116: template <device::cupm::DeviceType T>
1117: template <PetscMemType mtype, PetscMemoryAccessMode access>
1118: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1119: {
1120:   constexpr auto hostmem     = PetscMemTypeHost(mtype);
1121:   constexpr auto read_access = PetscMemoryAccessRead(access);

1123:   PetscFunctionBegin;
1124:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1125:   PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1126:   if (hostmem) {
1127:     if (read_access) {
1128:       PetscCall(DeviceToHost_(m, dctx));
1129:     } else if (!MatIMPLCast(m)->v) {
1130:       // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1131:       PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1132:     }
1133:     *array = MatIMPLCast(m)->v;
1134:   } else {
1135:     if (read_access) {
1136:       PetscCall(HostToDevice_(m, dctx));
1137:     } else if (!MatCUPMCast(m)->d_v) {
1138:       // write-only
1139:       PetscCall(SetPreallocation(m, dctx, nullptr));
1140:     }
1141:     *array = MatCUPMCast(m)->d_v;
1142:   }
1143:   if (PetscMemoryAccessWrite(access)) {
1144:     m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1145:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1146:   }
1147:   PetscFunctionReturn(PETSC_SUCCESS);
1148: }

1150: template <device::cupm::DeviceType T>
1151: template <PetscMemType mtype, PetscMemoryAccessMode access>
1152: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1153: {
1154:   PetscFunctionBegin;
1155:   static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1156:   if (PetscMemoryAccessWrite(access)) {
1157:     // WRITE or READ_WRITE
1158:     m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1159:     PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1160:   }
1161:   if (array) {
1162:     PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1163:     *array = nullptr;
1164:   }
1165:   PetscFunctionReturn(PETSC_SUCCESS);
1166: }

1168: template <device::cupm::DeviceType T>
1169: template <PetscMemoryAccessMode access>
1170: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1171: {
1172:   PetscFunctionBegin;
1173:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1174:   if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1175:   PetscFunctionReturn(PETSC_SUCCESS);
1176: }

1178: template <device::cupm::DeviceType T>
1179: template <PetscMemoryAccessMode access>
1180: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1181: {
1182:   PetscFunctionBegin;
1183:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1184:   PetscFunctionReturn(PETSC_SUCCESS);
1185: }

1187: // ==========================================================================================

1189: template <device::cupm::DeviceType T>
1190: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1191: {
1192:   const auto mimpl = MatIMPLCast(A);
1193:   const auto mcu   = MatCUPMCast(A);

1195:   PetscFunctionBegin;
1196:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1197:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1198:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1199:   if (mimpl->v) {
1200:     PetscDeviceContext dctx;

1202:     PetscCall(GetHandles_(&dctx));
1203:     PetscCall(HostToDevice_(A, dctx));
1204:   }
1205:   mcu->unplacedarray         = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1206:   mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1207:   PetscFunctionReturn(PETSC_SUCCESS);
1208: }

1210: template <device::cupm::DeviceType T>
1211: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1212: {
1213:   const auto mimpl = MatIMPLCast(A);
1214:   const auto mcu   = MatCUPMCast(A);

1216:   PetscFunctionBegin;
1217:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1218:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1219:   PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1220:   if (!mcu->d_user_alloc) {
1221:     cupmStream_t stream;

1223:     PetscCall(GetHandles_(&stream));
1224:     PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1225:   }
1226:   mcu->d_v          = const_cast<PetscScalar *>(array);
1227:   mcu->d_user_alloc = PETSC_FALSE;
1228:   PetscFunctionReturn(PETSC_SUCCESS);
1229: }

1231: template <device::cupm::DeviceType T>
1232: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1233: {
1234:   const auto mimpl = MatIMPLCast(A);
1235:   const auto mcu   = MatCUPMCast(A);

1237:   PetscFunctionBegin;
1238:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1239:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1240:   if (mimpl->v) {
1241:     PetscDeviceContext dctx;

1243:     PetscCall(GetHandles_(&dctx));
1244:     PetscCall(HostToDevice_(A, dctx));
1245:   }
1246:   mcu->d_v          = util::exchange(mcu->unplacedarray, nullptr);
1247:   mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1248:   PetscFunctionReturn(PETSC_SUCCESS);
1249: }

1251: // ==========================================================================================

1253: template <device::cupm::DeviceType T>
1254: template <bool transpose_A, bool transpose_B>
1255: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1256: {
1257:   cupmBlasInt_t      m, n, k;
1258:   PetscBool          Aiscupm, Biscupm;
1259:   PetscDeviceContext dctx;
1260:   cupmBlasHandle_t   handle;

1262:   PetscFunctionBegin;
1263:   PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1264:   PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1265:   PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1266:   if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);

1268:   // we may end up with SEQDENSE as one of the arguments
1269:   // REVIEW ME: how? and why is it not B and C????????
1270:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1271:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1272:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1273:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1274:   PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1275:   PetscCall(GetHandles_(&dctx, &handle));

1277:   PetscCall(PetscLogGpuTimeBegin());
1278:   {
1279:     const auto one  = cupmScalarCast(1.0);
1280:     const auto zero = cupmScalarCast(0.0);
1281:     const auto da   = DeviceArrayRead(dctx, A);
1282:     const auto db   = DeviceArrayRead(dctx, B);
1283:     const auto dc   = DeviceArrayWrite(dctx, C);
1284:     PetscInt   alda, blda, clda;

1286:     PetscCall(MatDenseGetLDA(A, &alda));
1287:     PetscCall(MatDenseGetLDA(B, &blda));
1288:     PetscCall(MatDenseGetLDA(C, &clda));
1289:     PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1290:   }
1291:   PetscCall(PetscLogGpuTimeEnd());

1293:   PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1294:   if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1295:   if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1296:   PetscFunctionReturn(PETSC_SUCCESS);
1297: }

1299: template <device::cupm::DeviceType T>
1300: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1301: {
1302:   const auto m = A->rmap->n;
1303:   const auto n = A->cmap->n;

1305:   PetscFunctionBegin;
1306:   PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1307:   // The two matrices must have the same copy implementation to be eligible for fast copy
1308:   if (A->ops->copy == B->ops->copy) {
1309:     PetscDeviceContext dctx;
1310:     cupmStream_t       stream;

1312:     PetscCall(GetHandles_(&dctx, &stream));
1313:     PetscCall(PetscLogGpuTimeBegin());
1314:     {
1315:       const auto va = DeviceArrayRead(dctx, A);
1316:       const auto vb = DeviceArrayWrite(dctx, B);
1317:       // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1318:       // lda!
1319:       const auto lda_a = MatIMPLCast(A)->lda;
1320:       const auto lda_b = MatIMPLCast(B)->lda;

1322:       if (lda_a > m || lda_b > m) {
1323:         PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1324:         PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1325:         PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1326:       } else {
1327:         PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1328:       }
1329:     }
1330:     PetscCall(PetscLogGpuTimeEnd());
1331:   } else {
1332:     PetscCall(MatCopy_Basic(A, B, str));
1333:   }
1334:   PetscFunctionReturn(PETSC_SUCCESS);
1335: }

1337: template <device::cupm::DeviceType T>
1338: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1339: {
1340:   PetscDeviceContext dctx;
1341:   cupmStream_t       stream;

1343:   PetscFunctionBegin;
1344:   PetscCall(GetHandles_(&dctx, &stream));
1345:   PetscCall(PetscLogGpuTimeBegin());
1346:   {
1347:     const auto va  = DeviceArrayWrite(dctx, m);
1348:     const auto lda = MatIMPLCast(m)->lda;
1349:     const auto ma  = m->rmap->n;
1350:     const auto na  = m->cmap->n;

1352:     if (lda > ma) {
1353:       PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1354:     } else {
1355:       PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1356:     }
1357:   }
1358:   PetscCall(PetscLogGpuTimeEnd());
1359:   PetscFunctionReturn(PETSC_SUCCESS);
1360: }

1362: namespace detail
1363: {

1365: // ==========================================================================================
1366: // SubMatIndexFunctor
1367: //
1368: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1369: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1370: // the i'th sequential entry in the matrix.
1371: // ==========================================================================================
1372: template <typename T>
1373: struct SubMatIndexFunctor {
1374:   PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }

1376:   PetscInt nrows;
1377:   PetscInt ncols;
1378:   PetscInt lda;
1379: };

1381: template <typename Iterator>
1382: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1383:   using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;

1385:   using iterator = typename base_type::iterator;

1387:   constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1388:     base_type{
1389:       std::move(first), std::move(last), {nrows, ncols, lda}
1390:   }
1391:   {
1392:   }

1394:   PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1395: };

1397: namespace
1398: {

1400: template <typename T>
1401: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1402: {
1403:   const auto nrows = rend - rstart;
1404:   const auto ncols = cend - cstart;
1405:   const auto dptr  = thrust::device_pointer_cast(ptr);

1407:   return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1408: }

1410: } // namespace

1412: } // namespace detail

1414: template <device::cupm::DeviceType T>
1415: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1416: {
1417:   const auto         m = A->rmap->n;
1418:   const auto         n = A->cmap->n;
1419:   const auto         N = m * n;
1420:   PetscDeviceContext dctx;

1422:   PetscFunctionBegin;
1423:   PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1424:   PetscCall(GetHandles_(&dctx));
1425:   {
1426:     const auto da  = DeviceArrayReadWrite(dctx, A);
1427:     const auto lda = MatIMPLCast(A)->lda;

1429:     if (lda > m) {
1430:       cupmStream_t stream;

1432:       PetscCall(GetHandlesFrom_(dctx, &stream));
1433:       // clang-format off
1434:       PetscCallThrust(
1435:         const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());

1437:         THRUST_CALL(
1438:           thrust::transform,
1439:           stream,
1440:           sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1441:           device::cupm::functors::make_times_equals(alpha)
1442:         )
1443:       );
1444:       // clang-format on
1445:     } else {
1446:       const auto       cu_alpha = cupmScalarCast(alpha);
1447:       cupmBlasHandle_t handle;

1449:       PetscCall(GetHandlesFrom_(dctx, &handle));
1450:       PetscCall(PetscLogGpuTimeBegin());
1451:       PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1452:       PetscCall(PetscLogGpuTimeEnd());
1453:     }
1454:   }
1455:   PetscCall(PetscLogGpuFlops(N));
1456:   PetscFunctionReturn(PETSC_SUCCESS);
1457: }

1459: template <device::cupm::DeviceType T>
1460: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1461: {
1462:   const auto         m_x = X->rmap->n, m_y = Y->rmap->n;
1463:   const auto         n_x = X->cmap->n, n_y = Y->cmap->n;
1464:   const auto         N = m_x * n_x;
1465:   PetscDeviceContext dctx;

1467:   PetscFunctionBegin;
1468:   if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1469:   PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1470:   PetscCall(GetHandles_(&dctx));
1471:   {
1472:     const auto dx    = DeviceArrayRead(dctx, X);
1473:     const auto dy    = DeviceArrayReadWrite(dctx, Y);
1474:     const auto lda_x = MatIMPLCast(X)->lda;
1475:     const auto lda_y = MatIMPLCast(Y)->lda;

1477:     if (lda_x > m_x || lda_y > m_x) {
1478:       cupmStream_t stream;

1480:       PetscCall(GetHandlesFrom_(dctx, &stream));
1481:       // clang-format off
1482:       PetscCallThrust(
1483:         const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1484:         const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());

1486:         THRUST_CALL(
1487:           thrust::transform,
1488:           stream,
1489:           sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1490:           device::cupm::functors::make_axpy(alpha)
1491:         );
1492:       );
1493:       // clang-format on
1494:     } else {
1495:       const auto       cu_alpha = cupmScalarCast(alpha);
1496:       cupmBlasHandle_t handle;

1498:       PetscCall(GetHandlesFrom_(dctx, &handle));
1499:       PetscCall(PetscLogGpuTimeBegin());
1500:       PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1501:       PetscCall(PetscLogGpuTimeEnd());
1502:     }
1503:   }
1504:   PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1505:   PetscFunctionReturn(PETSC_SUCCESS);
1506: }

1508: template <device::cupm::DeviceType T>
1509: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1510: {
1511:   const auto         hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1512:   PetscDeviceContext dctx;

1514:   PetscFunctionBegin;
1515:   PetscCall(GetHandles_(&dctx));
1516:   // do not call SetPreallocation() yet, we call it afterwards??
1517:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1518:   PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1519:   if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1520:   // allocate memory if needed
1521:   if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1522:   PetscFunctionReturn(PETSC_SUCCESS);
1523: }

1525: template <device::cupm::DeviceType T>
1526: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1527: {
1528:   PetscBool device;

1530:   PetscFunctionBegin;
1531:   PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1532:   if (device) {
1533:     const auto         m = A->rmap->n;
1534:     const auto         n = A->cmap->n;
1535:     PetscDeviceContext dctx;

1537:     PetscCall(GetHandles_(&dctx));
1538:     {
1539:       const auto a = DeviceArrayWrite(dctx, A);
1540:       PetscInt   lda;

1542:       PetscCall(MatDenseGetLDA(A, &lda));
1543:       if (lda > m) {
1544:         for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1545:       } else {
1546:         PetscInt mn;

1548:         PetscCall(PetscIntMultError(m, n, &mn));
1549:         PetscCall(PetscRandomGetValues(rng, mn, a));
1550:       }
1551:     }
1552:   } else {
1553:     PetscCall(MatSetRandom_SeqDense(A, rng));
1554:   }
1555:   PetscFunctionReturn(PETSC_SUCCESS);
1556: }

1558: // ==========================================================================================

1560: template <device::cupm::DeviceType T>
1561: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1562: {
1563:   const auto         offloadmask = A->offloadmask;
1564:   const auto         n           = A->rmap->n;
1565:   const auto         col_offset  = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1566:   PetscBool          viscupm;
1567:   PetscDeviceContext dctx;
1568:   cupmStream_t       stream;

1570:   PetscFunctionBegin;
1571:   PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1572:   PetscCall(GetHandles_(&dctx, &stream));
1573:   if (viscupm && !v->boundtocpu) {
1574:     const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);

1576:     // update device data
1577:     if (PetscOffloadDevice(offloadmask)) {
1578:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1579:     } else {
1580:       PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1581:     }
1582:   } else {
1583:     PetscScalar *x;

1585:     // update host data
1586:     PetscCall(VecGetArrayWrite(v, &x));
1587:     if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1588:       PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1589:     } else if (PetscOffloadDevice(offloadmask)) {
1590:       PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1591:     }
1592:     PetscCall(VecRestoreArrayWrite(v, &x));
1593:   }
1594:   PetscFunctionReturn(PETSC_SUCCESS);
1595: }

1597: template <device::cupm::DeviceType T>
1598: template <PetscMemoryAccessMode access>
1599: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1600: {
1601:   using namespace vec::cupm;
1602:   const auto         mimpl = MatIMPLCast(A);
1603:   PetscDeviceContext dctx;

1605:   PetscFunctionBegin;
1606:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1607:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1608:   mimpl->vecinuse = col + 1;
1609:   if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1610:   PetscCall(GetHandles_(&dctx));
1611:   PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1612:   PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1613:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1614:   *v = mimpl->cvec;
1615:   PetscFunctionReturn(PETSC_SUCCESS);
1616: }

1618: template <device::cupm::DeviceType T>
1619: template <PetscMemoryAccessMode access>
1620: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1621: {
1622:   using namespace vec::cupm;
1623:   const auto         mimpl = MatIMPLCast(A);
1624:   const auto         cvec  = mimpl->cvec;
1625:   PetscDeviceContext dctx;

1627:   PetscFunctionBegin;
1628:   PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1629:   PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1630:   mimpl->vecinuse = 0;
1631:   if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1632:   PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1633:   PetscCall(GetHandles_(&dctx));
1634:   PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1635:   if (v) *v = nullptr;
1636:   PetscFunctionReturn(PETSC_SUCCESS);
1637: }

1639: // ==========================================================================================

1641: template <device::cupm::DeviceType T>
1642: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1643: {
1644:   Mat                fact = nullptr;
1645:   PetscDeviceContext dctx;

1647:   PetscFunctionBegin;
1648:   PetscCall(GetHandles_(&dctx));
1649:   PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1650:   fact->factortype = ftype;
1651:   switch (ftype) {
1652:   case MAT_FACTOR_LU:
1653:   case MAT_FACTOR_ILU: // fall-through
1654:     fact->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqDense;
1655:     fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1656:     break;
1657:   case MAT_FACTOR_CHOLESKY:
1658:   case MAT_FACTOR_ICC: // fall-through
1659:     fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1660:     break;
1661:   case MAT_FACTOR_QR: {
1662:     const auto pobj = PetscObjectCast(fact);

1664:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1665:     PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1666:   } break;
1667:   case MAT_FACTOR_NONE:
1668:   case MAT_FACTOR_ILUDT:     // fall-through
1669:   case MAT_FACTOR_NUM_TYPES: // fall-through
1670:     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1671:   }
1672:   PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1673:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1674:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1675:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1676:   PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1677:   *fact_out = fact;
1678:   PetscFunctionReturn(PETSC_SUCCESS);
1679: }

1681: template <device::cupm::DeviceType T>
1682: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1683: {
1684:   const auto         mimpl = MatIMPLCast(A);
1685:   const auto         mcu   = MatCUPMCast(A);
1686:   const auto         n     = static_cast<cupmBlasInt_t>(A->cmap->n);
1687:   cupmSolverHandle_t handle;
1688:   PetscDeviceContext dctx;
1689:   cupmStream_t       stream;

1691:   PetscFunctionBegin;
1692: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1693:   // HIP appears to have this by default??
1694:   PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1695: #endif
1696:   if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1697:   PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1698:   // spd
1699:   PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());

1701:   PetscCall(GetHandles_(&dctx, &handle, &stream));
1702:   {
1703:     const auto    da  = DeviceArrayReadWrite(dctx, A);
1704:     const auto    lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1705:     cupmBlasInt_t il;

1707:     PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1708:     if (il > mcu->d_fact_lwork) {
1709:       mcu->d_fact_lwork = il;
1710:       PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1711:       PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1712:     }
1713:     PetscCall(PetscLogGpuTimeBegin());
1714:     PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1715:     PetscCall(PetscLogGpuTimeEnd());
1716:   }
1717:   PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1718:   // TODO (write cuda kernel)
1719:   PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1720:   PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));

1722:   A->ops->solve          = nullptr;
1723:   A->ops->solvetranspose = nullptr;
1724:   A->ops->matsolve       = nullptr;
1725:   A->factortype          = MAT_FACTOR_NONE;

1727:   PetscCall(PetscFree(A->solvertype));
1728:   PetscFunctionReturn(PETSC_SUCCESS);
1729: }

1731: // ==========================================================================================

1733: template <device::cupm::DeviceType T>
1734: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1735: {
1736:   const auto         mimpl        = MatIMPLCast(A);
1737:   const auto         array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1738:   const auto         n            = rend - rbegin;
1739:   const auto         m            = cend - cbegin;
1740:   auto              &cmat         = mimpl->cmat;
1741:   PetscDeviceContext dctx;

1743:   PetscFunctionBegin;
1744:   PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1745:   PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1746:   mimpl->matinuse = cbegin + 1;

1748:   PetscCall(GetHandles_(&dctx));
1749:   PetscCall(HostToDevice_(A, dctx));

1751:   if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1752:   {
1753:     const auto device_array = array_offset(MatCUPMCast(A)->d_v);

1755:     if (cmat) {
1756:       PetscCall(PlaceArray(cmat, device_array));
1757:     } else {
1758:       PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1759:     }
1760:   }
1761:   PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1762:   // place CPU array if present but do not copy any data
1763:   if (const auto host_array = mimpl->v) {
1764:     cmat->offloadmask = PETSC_OFFLOAD_GPU;
1765:     PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1766:   }

1768:   cmat->offloadmask = A->offloadmask;
1769:   *mat              = cmat;
1770:   PetscFunctionReturn(PETSC_SUCCESS);
1771: }

1773: template <device::cupm::DeviceType T>
1774: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1775: {
1776:   const auto mimpl = MatIMPLCast(A);
1777:   const auto cmat  = mimpl->cmat;
1778:   const auto reset = static_cast<bool>(mimpl->v);
1779:   bool       copy, was_offload_host;

1781:   PetscFunctionBegin;
1782:   PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1783:   PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1784:   PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1785:   mimpl->matinuse = 0;

1787:   // calls to ResetArray may change it, so save it here
1788:   was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1789:   if (was_offload_host && !reset) {
1790:     copy = true;
1791:     PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1792:   } else {
1793:     copy = false;
1794:   }

1796:   PetscCall(ResetArray(cmat));
1797:   if (reset) PetscCall(MatDenseResetArray(cmat));
1798:   if (copy) {
1799:     PetscDeviceContext dctx;

1801:     PetscCall(GetHandles_(&dctx));
1802:     PetscCall(DeviceToHost_(A, dctx));
1803:   } else {
1804:     A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1805:   }

1807:   cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1808:   *m                = nullptr;
1809:   PetscFunctionReturn(PETSC_SUCCESS);
1810: }

1812: // ==========================================================================================

1814: namespace
1815: {

1817: template <device::cupm::DeviceType T>
1818: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1819: {
1820:   PetscFunctionBegin;
1821:   if (TA) {
1822:     if (TB) {
1823:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1824:     } else {
1825:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1826:     }
1827:   } else {
1828:     if (TB) {
1829:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1830:     } else {
1831:       PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1832:     }
1833:   }
1834:   PetscFunctionReturn(PETSC_SUCCESS);
1835: }

1837: template <device::cupm::DeviceType T>
1838: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1839: {
1840:   PetscFunctionBegin;
1841:   for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1842:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1843:     PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1844:   }
1845:   PetscFunctionReturn(PETSC_SUCCESS);
1846: }

1848: } // anonymous namespace

1850: } // namespace impl

1852: } // namespace cupm

1854: } // namespace mat

1856: } // namespace Petsc