Actual source code: matseqdensecupm.hpp
1: #pragma once
3: #include <petsc/private/matdensecupmimpl.h>
4: #include <../src/mat/impls/dense/seq/dense.h>
6: #include <petsc/private/deviceimpl.h>
7: #include <petsc/private/randomimpl.h>
8: #include <petsc/private/vecimpl.h>
9: #include <petsc/private/cupmobject.hpp>
10: #include <petsc/private/cupmsolverinterface.hpp>
12: #include <petsc/private/cpp/type_traits.hpp>
13: #include <petsc/private/cpp/utility.hpp>
15: #include <../src/vec/vec/impls/seq/cupm/vecseqcupm.hpp>
17: namespace Petsc
18: {
20: namespace mat
21: {
23: namespace cupm
24: {
26: namespace impl
27: {
29: template <device::cupm::DeviceType T>
30: class PETSC_SINGLE_LIBRARY_VISIBILITY_INTERNAL MatDense_Seq_CUPM : MatDense_CUPM<T, MatDense_Seq_CUPM<T>> {
31: public:
32: MATDENSECUPM_HEADER(T, MatDense_Seq_CUPM<T>);
34: private:
35: struct Mat_SeqDenseCUPM {
36: PetscScalar *d_v; // pointer to the matrix on the GPU
37: PetscScalar *unplacedarray; // if one called MatCUPMDensePlaceArray(), this is where it stashed the original
38: bool d_user_alloc;
39: bool d_unplaced_user_alloc;
40: // factorization support
41: cupmBlasInt_t *d_fact_ipiv; // device pivots
42: cupmScalar_t *d_fact_tau; // device QR tau vector
43: cupmBlasInt_t *d_fact_info; // device info
44: cupmScalar_t *d_fact_work; // device workspace
45: cupmBlasInt_t d_fact_lwork; // size of device workspace
46: // workspace
47: Vec workvec;
48: };
50: static PetscErrorCode SetPreallocation_(Mat, PetscDeviceContext, PetscScalar *) noexcept;
52: static PetscErrorCode HostToDevice_(Mat, PetscDeviceContext) noexcept;
53: static PetscErrorCode DeviceToHost_(Mat, PetscDeviceContext) noexcept;
55: static PetscErrorCode CheckCUPMSolverInfo_(const cupmBlasInt_t *, cupmStream_t) noexcept;
57: template <typename Derived>
58: struct SolveCommon;
59: struct SolveQR;
60: struct SolveCholesky;
61: struct SolveLU;
63: template <typename Solver, bool transpose>
64: static PetscErrorCode MatSolve_Factored_Dispatch_(Mat, Vec, Vec) noexcept;
65: template <typename Solver, bool transpose>
66: static PetscErrorCode MatMatSolve_Factored_Dispatch_(Mat, Mat, Mat) noexcept;
67: template <bool transpose, bool hermitian>
68: static PetscErrorCode MatMultAddColumnRange_Dispatch_(Mat, Vec, Vec, Vec, PetscInt, PetscInt) noexcept;
69: template <bool transpose, bool hermitian>
70: static PetscErrorCode MatMultColumnRange_Dispatch_(Mat, Vec, Vec, PetscInt, PetscInt) noexcept;
71: template <bool transpose, bool hermitian>
72: static PetscErrorCode MatMultAdd_Dispatch_(Mat, Vec, Vec, Vec) noexcept;
74: template <bool to_host>
75: static PetscErrorCode Convert_Dispatch_(Mat, MatType, MatReuse, Mat *) noexcept;
77: PETSC_NODISCARD static constexpr MatType MATIMPLCUPM_() noexcept;
78: PETSC_NODISCARD static constexpr Mat_SeqDense *MatIMPLCast_(Mat) noexcept;
80: public:
81: PETSC_NODISCARD static constexpr Mat_SeqDenseCUPM *MatCUPMCast(Mat) noexcept;
83: // define these by hand since they don't fit the above mold
84: PETSC_NODISCARD static constexpr const char *MatConvert_seqdensecupm_seqdense_C() noexcept;
85: PETSC_NODISCARD static constexpr const char *MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept;
87: static PetscErrorCode Create(Mat) noexcept;
88: static PetscErrorCode Destroy(Mat) noexcept;
89: static PetscErrorCode SetUp(Mat) noexcept;
90: static PetscErrorCode Reset(Mat) noexcept;
92: static PetscErrorCode BindToCPU(Mat, PetscBool) noexcept;
93: static PetscErrorCode Convert_SeqDense_SeqDenseCUPM(Mat, MatType, MatReuse, Mat *) noexcept;
94: static PetscErrorCode Convert_SeqDenseCUPM_SeqDense(Mat, MatType, MatReuse, Mat *) noexcept;
96: template <PetscMemType, PetscMemoryAccessMode>
97: static PetscErrorCode GetArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
98: template <PetscMemType, PetscMemoryAccessMode>
99: static PetscErrorCode RestoreArray(Mat, PetscScalar **, PetscDeviceContext) noexcept;
100: template <PetscMemoryAccessMode>
101: static PetscErrorCode GetArrayAndMemType(Mat, PetscScalar **, PetscMemType *, PetscDeviceContext) noexcept;
102: template <PetscMemoryAccessMode>
103: static PetscErrorCode RestoreArrayAndMemType(Mat, PetscScalar **, PetscDeviceContext) noexcept;
105: private:
106: template <PetscMemType mtype, PetscMemoryAccessMode mode>
107: static PetscErrorCode GetArrayC_(Mat m, PetscScalar **p) noexcept
108: {
109: PetscDeviceContext dctx;
111: PetscFunctionBegin;
112: PetscCall(GetHandles_(&dctx));
113: PetscCall(GetArray<mtype, mode>(m, p, dctx));
114: PetscFunctionReturn(PETSC_SUCCESS);
115: }
117: template <PetscMemType mtype, PetscMemoryAccessMode mode>
118: static PetscErrorCode RestoreArrayC_(Mat m, PetscScalar **p) noexcept
119: {
120: PetscDeviceContext dctx;
122: PetscFunctionBegin;
123: PetscCall(GetHandles_(&dctx));
124: PetscCall(RestoreArray<mtype, mode>(m, p, dctx));
125: PetscFunctionReturn(PETSC_SUCCESS);
126: }
128: template <PetscMemoryAccessMode mode>
129: static PetscErrorCode GetArrayAndMemTypeC_(Mat m, PetscScalar **p, PetscMemType *tp) noexcept
130: {
131: PetscDeviceContext dctx;
133: PetscFunctionBegin;
134: PetscCall(GetHandles_(&dctx));
135: PetscCall(GetArrayAndMemType<mode>(m, p, tp, dctx));
136: PetscFunctionReturn(PETSC_SUCCESS);
137: }
139: template <PetscMemoryAccessMode mode>
140: static PetscErrorCode RestoreArrayAndMemTypeC_(Mat m, PetscScalar **p) noexcept
141: {
142: PetscDeviceContext dctx;
144: PetscFunctionBegin;
145: PetscCall(GetHandles_(&dctx));
146: PetscCall(RestoreArrayAndMemType<mode>(m, p, dctx));
147: PetscFunctionReturn(PETSC_SUCCESS);
148: }
150: public:
151: static PetscErrorCode PlaceArray(Mat, const PetscScalar *) noexcept;
152: static PetscErrorCode ReplaceArray(Mat, const PetscScalar *) noexcept;
153: static PetscErrorCode ResetArray(Mat) noexcept;
155: template <bool transpose_A, bool transpose_B>
156: static PetscErrorCode MatMatMult_Numeric_Dispatch(Mat, Mat, Mat) noexcept;
157: static PetscErrorCode Copy(Mat, Mat, MatStructure) noexcept;
158: static PetscErrorCode ZeroEntries(Mat) noexcept;
159: static PetscErrorCode Scale(Mat, PetscScalar) noexcept;
160: static PetscErrorCode AXPY(Mat, PetscScalar, Mat, MatStructure) noexcept;
161: static PetscErrorCode Duplicate(Mat, MatDuplicateOption, Mat *) noexcept;
162: static PetscErrorCode SetRandom(Mat, PetscRandom) noexcept;
164: static PetscErrorCode GetColumnVector(Mat, Vec, PetscInt) noexcept;
165: template <PetscMemoryAccessMode>
166: static PetscErrorCode GetColumnVec(Mat, PetscInt, Vec *) noexcept;
167: template <PetscMemoryAccessMode>
168: static PetscErrorCode RestoreColumnVec(Mat, PetscInt, Vec *) noexcept;
170: static PetscErrorCode GetFactor(Mat, MatFactorType, Mat *) noexcept;
171: static PetscErrorCode InvertFactors(Mat) noexcept;
173: static PetscErrorCode GetSubMatrix(Mat, PetscInt, PetscInt, PetscInt, PetscInt, Mat *) noexcept;
174: static PetscErrorCode RestoreSubMatrix(Mat, Mat *) noexcept;
175: };
177: } // namespace impl
179: namespace
180: {
182: // Declare this here so that the functions below can make use of it
183: template <device::cupm::DeviceType T>
184: inline PetscErrorCode MatCreateSeqDenseCUPM(MPI_Comm comm, PetscInt m, PetscInt n, PetscScalar *data, Mat *A, PetscDeviceContext dctx = nullptr, bool preallocate = true) noexcept
185: {
186: PetscFunctionBegin;
187: PetscCall(impl::MatDense_Seq_CUPM<T>::CreateIMPLDenseCUPM(comm, m, n, m, n, data, A, dctx, preallocate));
188: PetscFunctionReturn(PETSC_SUCCESS);
189: }
191: } // anonymous namespace
193: namespace impl
194: {
196: // ==========================================================================================
197: // MatDense_Seq_CUPM - Private API - Utility
198: // ==========================================================================================
200: template <device::cupm::DeviceType T>
201: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetPreallocation_(Mat m, PetscDeviceContext dctx, PetscScalar *user_device_array) noexcept
202: {
203: const auto mcu = MatCUPMCast(m);
204: const auto nrows = m->rmap->n;
205: const auto ncols = m->cmap->n;
206: auto &lda = MatIMPLCast(m)->lda;
207: cupmStream_t stream;
209: PetscFunctionBegin;
210: PetscCheckTypeName(m, MATSEQDENSECUPM());
212: PetscCall(checkCupmBlasIntCast(nrows));
213: PetscCall(checkCupmBlasIntCast(ncols));
214: PetscCall(GetHandlesFrom_(dctx, &stream));
215: if (lda <= 0) lda = nrows;
216: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
217: if (user_device_array) {
218: mcu->d_user_alloc = PETSC_TRUE;
219: mcu->d_v = user_device_array;
220: } else {
221: std::size_t size;
223: mcu->d_user_alloc = PETSC_FALSE;
224: size = lda * ncols;
225: PetscCall(PetscCUPMMallocAsync(&mcu->d_v, size, stream));
226: PetscCall(PetscCUPMMemsetAsync(mcu->d_v, 0, size, stream));
227: }
228: m->offloadmask = PETSC_OFFLOAD_GPU;
229: PetscFunctionReturn(PETSC_SUCCESS);
230: }
232: template <device::cupm::DeviceType T>
233: inline PetscErrorCode MatDense_Seq_CUPM<T>::HostToDevice_(Mat m, PetscDeviceContext dctx) noexcept
234: {
235: const auto nrows = m->rmap->n;
236: const auto ncols = m->cmap->n;
237: const auto copy = m->offloadmask == PETSC_OFFLOAD_CPU || m->offloadmask == PETSC_OFFLOAD_UNALLOCATED;
239: PetscFunctionBegin;
240: PetscCheckTypeName(m, MATSEQDENSECUPM());
241: if (m->boundtocpu) PetscFunctionReturn(PETSC_SUCCESS);
242: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
243: if (copy) {
244: const auto mcu = MatCUPMCast(m);
245: cupmStream_t stream;
247: // Allocate GPU memory if not present
248: if (!mcu->d_v) PetscCall(SetPreallocation(m, dctx, nullptr));
249: PetscCall(GetHandlesFrom_(dctx, &stream));
250: PetscCall(PetscLogEventBegin(MAT_DenseCopyToGPU, m, 0, 0, 0));
251: {
252: const auto mimpl = MatIMPLCast(m);
253: const auto lda = mimpl->lda;
254: const auto src = mimpl->v;
255: const auto dest = mcu->d_v;
257: if (lda > nrows) {
258: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyHostToDevice, stream));
259: } else {
260: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyHostToDevice, stream));
261: }
262: }
263: PetscCall(PetscLogEventEnd(MAT_DenseCopyToGPU, m, 0, 0, 0));
264: // order important, ensure that offloadmask is PETSC_OFFLOAD_BOTH
265: m->offloadmask = PETSC_OFFLOAD_BOTH;
266: }
267: PetscFunctionReturn(PETSC_SUCCESS);
268: }
270: template <device::cupm::DeviceType T>
271: inline PetscErrorCode MatDense_Seq_CUPM<T>::DeviceToHost_(Mat m, PetscDeviceContext dctx) noexcept
272: {
273: const auto nrows = m->rmap->n;
274: const auto ncols = m->cmap->n;
275: const auto copy = m->offloadmask == PETSC_OFFLOAD_GPU;
277: PetscFunctionBegin;
278: PetscCheckTypeName(m, MATSEQDENSECUPM());
279: PetscCall(PetscInfo(m, "%s matrix %" PetscInt_FMT " x %" PetscInt_FMT "\n", copy ? "Copy" : "Reusing", nrows, ncols));
280: if (copy) {
281: const auto mimpl = MatIMPLCast(m);
282: cupmStream_t stream;
284: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
285: if (!mimpl->v) PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
286: PetscCall(GetHandlesFrom_(dctx, &stream));
287: PetscCall(PetscLogEventBegin(MAT_DenseCopyFromGPU, m, 0, 0, 0));
288: {
289: const auto lda = mimpl->lda;
290: const auto dest = mimpl->v;
291: const auto src = MatCUPMCast(m)->d_v;
293: if (lda > nrows) {
294: PetscCall(PetscCUPMMemcpy2DAsync(dest, lda, src, lda, nrows, ncols, cupmMemcpyDeviceToHost, stream));
295: } else {
296: PetscCall(PetscCUPMMemcpyAsync(dest, src, lda * ncols, cupmMemcpyDeviceToHost, stream));
297: }
298: }
299: PetscCall(PetscLogEventEnd(MAT_DenseCopyFromGPU, m, 0, 0, 0));
300: // order is important, MatSeqDenseSetPreallocation() might set offloadmask
301: m->offloadmask = PETSC_OFFLOAD_BOTH;
302: }
303: PetscFunctionReturn(PETSC_SUCCESS);
304: }
306: template <device::cupm::DeviceType T>
307: inline PetscErrorCode MatDense_Seq_CUPM<T>::CheckCUPMSolverInfo_(const cupmBlasInt_t *fact_info, cupmStream_t stream) noexcept
308: {
309: PetscFunctionBegin;
310: if (PetscDefined(USE_DEBUG)) {
311: cupmBlasInt_t info = 0;
313: PetscCall(PetscCUPMMemcpyAsync(&info, fact_info, 1, cupmMemcpyDeviceToHost, stream));
314: if (stream) PetscCallCUPM(cupmStreamSynchronize(stream));
315: static_assert(std::is_same<decltype(info), int>::value, "");
316: PetscCheck(info <= 0, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Bad factorization: zero pivot in row %d", info - 1);
317: PetscCheck(info >= 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Wrong argument to cupmSolver %d", -info);
318: }
319: PetscFunctionReturn(PETSC_SUCCESS);
320: }
322: // ==========================================================================================
323: // MatDense_Seq_CUPM - Private API - Solver Dispatch
324: // ==========================================================================================
326: // specific solvers called through the dispatch_() family of functions
327: template <device::cupm::DeviceType T>
328: template <typename Derived>
329: struct MatDense_Seq_CUPM<T>::SolveCommon {
330: using derived_type = Derived;
332: template <typename F>
333: static PetscErrorCode ResizeFactLwork(Mat_SeqDenseCUPM *mcu, cupmStream_t stream, F &&cupmSolverComputeFactLwork) noexcept
334: {
335: cupmBlasInt_t lwork;
337: PetscFunctionBegin;
338: PetscCallCUPMSOLVER(cupmSolverComputeFactLwork(&lwork));
339: if (lwork > mcu->d_fact_lwork) {
340: mcu->d_fact_lwork = lwork;
341: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
342: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, lwork, stream));
343: }
344: PetscFunctionReturn(PETSC_SUCCESS);
345: }
347: static PetscErrorCode FactorPrepare(Mat A, cupmStream_t stream) noexcept
348: {
349: const auto mcu = MatCUPMCast(A);
351: PetscFunctionBegin;
352: PetscCall(PetscInfo(A, "%s factor %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", derived_type::NAME(), A->rmap->n, A->cmap->n));
353: A->factortype = derived_type::MATFACTORTYPE();
354: A->ops->solve = MatSolve_Factored_Dispatch_<derived_type, false>;
355: A->ops->solvetranspose = MatSolve_Factored_Dispatch_<derived_type, true>;
356: A->ops->matsolve = MatMatSolve_Factored_Dispatch_<derived_type, false>;
357: A->ops->matsolvetranspose = MatMatSolve_Factored_Dispatch_<derived_type, true>;
359: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &A->solvertype));
360: if (!mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
361: PetscFunctionReturn(PETSC_SUCCESS);
362: }
363: };
365: template <device::cupm::DeviceType T>
366: struct MatDense_Seq_CUPM<T>::SolveLU : SolveCommon<SolveLU> {
367: using base_type = SolveCommon<SolveLU>;
369: static constexpr const char *NAME() noexcept { return "LU"; }
370: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_LU; }
372: static PetscErrorCode Factor(Mat A, IS, IS, const MatFactorInfo *) noexcept
373: {
374: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
375: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
376: cupmStream_t stream;
377: cupmSolverHandle_t handle;
378: PetscDeviceContext dctx;
380: PetscFunctionBegin;
381: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
382: PetscCall(GetHandles_(&dctx, &handle, &stream));
383: PetscCall(base_type::FactorPrepare(A, stream));
384: {
385: const auto mcu = MatCUPMCast(A);
386: const auto da = DeviceArrayReadWrite(dctx, A);
387: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
389: // clang-format off
390: PetscCall(
391: base_type::ResizeFactLwork(
392: mcu, stream,
393: [&](cupmBlasInt_t *fact_lwork)
394: {
395: return cupmSolverXgetrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
396: }
397: )
398: );
399: // clang-format on
400: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
402: PetscCall(PetscLogGpuTimeBegin());
403: PetscCallCUPMSOLVER(cupmSolverXgetrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_ipiv, mcu->d_fact_info));
404: PetscCall(PetscLogGpuTimeEnd());
405: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
406: }
407: PetscCall(PetscLogGpuFlops(2.0 * n * n * m / 3.0));
408: PetscFunctionReturn(PETSC_SUCCESS);
409: }
411: template <bool transpose>
412: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
413: {
414: const auto mcu = MatCUPMCast(A);
415: const auto fact_info = mcu->d_fact_info;
416: const auto fact_ipiv = mcu->d_fact_ipiv;
417: cupmSolverHandle_t handle;
419: PetscFunctionBegin;
420: PetscCall(GetHandlesFrom_(dctx, &handle));
421: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
422: PetscCall(PetscLogGpuTimeBegin());
423: {
424: constexpr auto op = transpose ? CUPMSOLVER_OP_T : CUPMSOLVER_OP_N;
425: const auto da = DeviceArrayRead(dctx, A);
426: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
428: // clang-format off
429: PetscCall(
430: base_type::ResizeFactLwork(
431: mcu, stream,
432: [&](cupmBlasInt_t *lwork)
433: {
434: return cupmSolverXgetrs_bufferSize(
435: handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, lwork
436: );
437: }
438: )
439: );
440: // clang-format on
441: PetscCallCUPMSOLVER(cupmSolverXgetrs(handle, op, m, nrhs, da.cupmdata(), lda, fact_ipiv, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
442: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
443: }
444: PetscCall(PetscLogGpuTimeEnd());
445: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
446: PetscFunctionReturn(PETSC_SUCCESS);
447: }
448: };
450: template <device::cupm::DeviceType T>
451: struct MatDense_Seq_CUPM<T>::SolveCholesky : SolveCommon<SolveCholesky> {
452: using base_type = SolveCommon<SolveCholesky>;
454: static constexpr const char *NAME() noexcept { return "Cholesky"; }
455: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_CHOLESKY; }
457: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
458: {
459: const auto n = static_cast<cupmBlasInt_t>(A->rmap->n);
460: PetscDeviceContext dctx;
461: cupmSolverHandle_t handle;
462: cupmStream_t stream;
464: PetscFunctionBegin;
465: if (!n || !A->cmap->n) PetscFunctionReturn(PETSC_SUCCESS);
466: PetscCheck(A->spd == PETSC_BOOL3_TRUE, PETSC_COMM_SELF, PETSC_ERR_SUP, "%ssytrs unavailable. Use MAT_FACTOR_LU", cupmSolverName());
467: PetscCall(GetHandles_(&dctx, &handle, &stream));
468: PetscCall(base_type::FactorPrepare(A, stream));
469: {
470: const auto mcu = MatCUPMCast(A);
471: const auto da = DeviceArrayReadWrite(dctx, A);
472: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
474: // clang-format off
475: PetscCall(
476: base_type::ResizeFactLwork(
477: mcu, stream,
478: [&](cupmBlasInt_t *fact_lwork)
479: {
480: return cupmSolverXpotrf_bufferSize(
481: handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, fact_lwork
482: );
483: }
484: )
485: );
486: // clang-format on
487: PetscCall(PetscLogGpuTimeBegin());
488: PetscCallCUPMSOLVER(cupmSolverXpotrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
489: PetscCall(PetscLogGpuTimeEnd());
490: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
491: }
492: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
494: #if 0
495: // At the time of writing this interface (cuda 10.0), cusolverDn does not implement *sytrs
496: // and *hetr* routines. The code below should work, and it can be activated when *sytrs
497: // routines will be available
498: if (!mcu->d_fact_ipiv) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_ipiv, n, stream));
499: if (!mcu->d_fact_lwork) {
500: PetscCallCUPMSOLVER(cupmSolverDnXsytrf_bufferSize(handle, n, da.cupmdata(), lda, &mcu->d_fact_lwork));
501: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, mcu->d_fact_lwork, stream));
502: }
503: if (mcu->d_fact_info) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_info, 1, stream));
504: PetscCall(PetscLogGpuTimeBegin());
505: PetscCallCUPMSOLVER(cupmSolverXsytrf(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da, lda, mcu->d_fact_ipiv, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
506: PetscCall(PetscLogGpuTimeEnd());
507: #endif
508: PetscFunctionReturn(PETSC_SUCCESS);
509: }
511: template <bool transpose>
512: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
513: {
514: const auto mcu = MatCUPMCast(A);
515: const auto fact_info = mcu->d_fact_info;
516: cupmSolverHandle_t handle;
518: PetscFunctionBegin;
519: PetscAssert(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%ssytrs not implemented", cupmSolverName());
520: PetscCall(GetHandlesFrom_(dctx, &handle));
521: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
522: PetscCall(PetscLogGpuTimeBegin());
523: {
524: const auto da = DeviceArrayRead(dctx, A);
525: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
527: // clang-format off
528: PetscCall(
529: base_type::ResizeFactLwork(
530: mcu, stream,
531: [&](cupmBlasInt_t *lwork)
532: {
533: return cupmSolverXpotrs_bufferSize(
534: handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, lwork
535: );
536: }
537: )
538: );
539: // clang-format on
540: PetscCallCUPMSOLVER(cupmSolverXpotrs(handle, CUPMSOLVER_FILL_MODE_LOWER, m, nrhs, da.cupmdata(), lda, x, ldx, mcu->d_fact_work, mcu->d_fact_lwork, fact_info));
541: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
542: }
543: PetscCall(PetscLogGpuTimeEnd());
544: PetscCall(PetscLogGpuFlops(nrhs * (2.0 * m * m - m)));
545: PetscFunctionReturn(PETSC_SUCCESS);
546: }
547: };
549: template <device::cupm::DeviceType T>
550: struct MatDense_Seq_CUPM<T>::SolveQR : SolveCommon<SolveQR> {
551: using base_type = SolveCommon<SolveQR>;
553: static constexpr const char *NAME() noexcept { return "QR"; }
554: static constexpr MatFactorType MATFACTORTYPE() noexcept { return MAT_FACTOR_QR; }
556: static PetscErrorCode Factor(Mat A, IS, const MatFactorInfo *) noexcept
557: {
558: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
559: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
560: const auto min = std::min(m, n);
561: const auto mimpl = MatIMPLCast(A);
562: cupmStream_t stream;
563: cupmSolverHandle_t handle;
564: PetscDeviceContext dctx;
566: PetscFunctionBegin;
567: if (!m || !n) PetscFunctionReturn(PETSC_SUCCESS);
568: PetscCall(GetHandles_(&dctx, &handle, &stream));
569: PetscCall(base_type::FactorPrepare(A, stream));
570: mimpl->rank = min;
571: {
572: const auto mcu = MatCUPMCast(A);
573: const auto da = DeviceArrayReadWrite(dctx, A);
574: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
576: if (!mcu->workvec) PetscCall(vec::cupm::VecCreateSeqCUPMAsync<T>(PetscObjectComm(PetscObjectCast(A)), m, &mcu->workvec));
577: if (!mcu->d_fact_tau) PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_tau, min, stream));
578: // clang-format off
579: PetscCall(
580: base_type::ResizeFactLwork(
581: mcu, stream,
582: [&](cupmBlasInt_t *fact_lwork)
583: {
584: return cupmSolverXgeqrf_bufferSize(handle, m, n, da.cupmdata(), lda, fact_lwork);
585: }
586: )
587: );
588: // clang-format on
589: PetscCall(PetscLogGpuTimeBegin());
590: PetscCallCUPMSOLVER(cupmSolverXgeqrf(handle, m, n, da.cupmdata(), lda, mcu->d_fact_tau, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
591: PetscCall(PetscLogGpuTimeEnd());
592: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
593: }
594: PetscCall(PetscLogGpuFlops(2.0 * min * min * (std::max(m, n) - min / 3.0)));
595: PetscFunctionReturn(PETSC_SUCCESS);
596: }
598: template <bool transpose>
599: static PetscErrorCode Solve(Mat A, cupmScalar_t *x, cupmBlasInt_t ldx, cupmBlasInt_t m, cupmBlasInt_t nrhs, cupmBlasInt_t k, PetscDeviceContext dctx, cupmStream_t stream) noexcept
600: {
601: const auto mimpl = MatIMPLCast(A);
602: const auto rank = static_cast<cupmBlasInt_t>(mimpl->rank);
603: const auto mcu = MatCUPMCast(A);
604: const auto fact_info = mcu->d_fact_info;
605: const auto fact_tau = mcu->d_fact_tau;
606: const auto fact_work = mcu->d_fact_work;
607: const auto fact_lwork = mcu->d_fact_lwork;
608: cupmSolverHandle_t solver_handle;
609: cupmBlasHandle_t blas_handle;
611: PetscFunctionBegin;
612: PetscCall(GetHandlesFrom_(dctx, &blas_handle, &solver_handle));
613: PetscCall(PetscInfo(A, "%s solve %d x %d on backend\n", NAME(), m, k));
614: PetscCall(PetscLogGpuTimeBegin());
615: {
616: const auto da = DeviceArrayRead(dctx, A);
617: const auto one = cupmScalarCast(1.0);
618: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
620: if (transpose) {
621: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_T, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
622: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, CUPMSOLVER_OP_N, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
623: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
624: } else {
625: constexpr auto op = PetscDefined(USE_COMPLEX) ? CUPMSOLVER_OP_C : CUPMSOLVER_OP_T;
627: PetscCallCUPMSOLVER(cupmSolverXormqr(solver_handle, CUPMSOLVER_SIDE_LEFT, op, m, nrhs, rank, da.cupmdata(), lda, fact_tau, x, ldx, fact_work, fact_lwork, fact_info));
628: PetscCall(CheckCUPMSolverInfo_(fact_info, stream));
629: PetscCallCUPMBLAS(cupmBlasXtrsm(blas_handle, CUPMBLAS_SIDE_LEFT, CUPMBLAS_FILL_MODE_UPPER, CUPMBLAS_OP_N, CUPMBLAS_DIAG_NON_UNIT, rank, nrhs, &one, da.cupmdata(), lda, x, ldx));
630: }
631: }
632: PetscCall(PetscLogGpuTimeEnd());
633: PetscCall(PetscLogFlops(nrhs * (4.0 * m * rank - (rank * rank))));
634: PetscFunctionReturn(PETSC_SUCCESS);
635: }
636: };
638: template <device::cupm::DeviceType T>
639: template <typename Solver, bool transpose>
640: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatSolve_Factored_Dispatch_(Mat A, Vec x, Vec y) noexcept
641: {
642: using namespace vec::cupm;
643: const auto pobj_A = PetscObjectCast(A);
644: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
645: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
646: auto &workvec = MatCUPMCast(A)->workvec;
647: PetscScalar *y_array = nullptr;
648: PetscDeviceContext dctx;
649: PetscBool xiscupm, yiscupm, aiscupm;
650: bool use_y_array_directly;
651: cupmStream_t stream;
653: PetscFunctionBegin;
654: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
655: PetscCall(PetscObjectTypeCompare(PetscObjectCast(x), VecSeq_CUPM::VECSEQCUPM(), &xiscupm));
656: PetscCall(PetscObjectTypeCompare(PetscObjectCast(y), VecSeq_CUPM::VECSEQCUPM(), &yiscupm));
657: PetscCall(PetscObjectTypeCompare(pobj_A, MATSEQDENSECUPM(), &aiscupm));
658: PetscAssert(aiscupm, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix A is somehow not CUPM?????????????????????????????");
659: PetscCall(GetHandles_(&dctx, &stream));
660: use_y_array_directly = yiscupm && (k >= m);
661: {
662: const PetscScalar *x_array;
663: const auto xisdevice = xiscupm && PetscOffloadDevice(x->offloadmask);
664: const auto copy_mode = xisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
666: if (!use_y_array_directly && !workvec) PetscCall(VecCreateSeqCUPMAsync<T>(PetscObjectComm(pobj_A), m, &workvec));
667: // The logic here is to try to minimize the amount of memory copying:
668: //
669: // If we call VecCUPMGetArrayRead(X, &x) every time xiscupm and the data is not offloaded
670: // to the GPU yet, then the data is copied to the GPU. But we are only trying to get the
671: // data in order to copy it into the y array. So the array x will be wherever the data
672: // already is so that only one memcpy is performed
673: if (xisdevice) {
674: PetscCall(VecCUPMGetArrayReadAsync<T>(x, &x_array, dctx));
675: } else {
676: PetscCall(VecGetArrayRead(x, &x_array));
677: }
678: PetscCall(VecCUPMGetArrayWriteAsync<T>(use_y_array_directly ? y : workvec, &y_array, dctx));
679: PetscCall(PetscCUPMMemcpyAsync(y_array, x_array, m, copy_mode, stream));
680: if (xisdevice) {
681: PetscCall(VecCUPMRestoreArrayReadAsync<T>(x, &x_array, dctx));
682: } else {
683: PetscCall(VecRestoreArrayRead(x, &x_array));
684: }
685: }
687: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
688: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y_array), m, m, 1, k, dctx, stream));
689: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
691: if (use_y_array_directly) {
692: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &y_array, dctx));
693: } else {
694: const auto copy_mode = yiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
695: PetscScalar *yv;
697: // The logic here is that the data is not yet in either y's GPU array or its CPU array.
698: // There is nothing in the interface to say where the user would like it to end up. So we
699: // choose the GPU, because it is the faster option
700: if (yiscupm) {
701: PetscCall(VecCUPMGetArrayWriteAsync<T>(y, &yv, dctx));
702: } else {
703: PetscCall(VecGetArray(y, &yv));
704: }
705: PetscCall(PetscCUPMMemcpyAsync(yv, y_array, k, copy_mode, stream));
706: if (yiscupm) {
707: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(y, &yv, dctx));
708: } else {
709: PetscCall(VecRestoreArray(y, &yv));
710: }
711: PetscCall(VecCUPMRestoreArrayWriteAsync<T>(workvec, &y_array));
712: }
713: PetscFunctionReturn(PETSC_SUCCESS);
714: }
716: template <device::cupm::DeviceType T>
717: template <typename Solver, bool transpose>
718: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatSolve_Factored_Dispatch_(Mat A, Mat B, Mat X) noexcept
719: {
720: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
721: const auto k = static_cast<cupmBlasInt_t>(A->cmap->n);
722: cupmBlasInt_t nrhs, ldb, ldx, ldy;
723: PetscScalar *y;
724: PetscBool biscupm, xiscupm, aiscupm;
725: PetscDeviceContext dctx;
726: cupmStream_t stream;
728: PetscFunctionBegin;
729: PetscCheck(A->factortype != MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix must be factored to solve");
730: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &biscupm));
731: PetscCall(PetscObjectTypeCompare(PetscObjectCast(X), MATSEQDENSECUPM(), &xiscupm));
732: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &aiscupm));
733: PetscCall(GetHandles_(&dctx, &stream));
734: {
735: PetscInt n;
737: PetscCall(MatGetSize(B, nullptr, &n));
738: PetscCall(PetscCUPMBlasIntCast(n, &nrhs));
739: PetscCall(MatDenseGetLDA(B, &n));
740: PetscCall(PetscCUPMBlasIntCast(n, &ldb));
741: PetscCall(MatDenseGetLDA(X, &n));
742: PetscCall(PetscCUPMBlasIntCast(n, &ldx));
743: }
744: {
745: // The logic here is to try to minimize the amount of memory copying:
746: //
747: // If we call MatDenseCUPMGetArrayRead(B, &b) every time biscupm and the data is not
748: // offloaded to the GPU yet, then the data is copied to the GPU. But we are only trying to
749: // get the data in order to copy it into the y array. So the array b will be wherever the
750: // data already is so that only one memcpy is performed
751: const auto bisdevice = biscupm && PetscOffloadDevice(B->offloadmask);
752: const auto copy_mode = bisdevice ? cupmMemcpyDeviceToDevice : cupmMemcpyHostToDevice;
753: const PetscScalar *b;
755: if (bisdevice) {
756: b = DeviceArrayRead(dctx, B);
757: } else if (biscupm) {
758: b = HostArrayRead(dctx, B);
759: } else {
760: PetscCall(MatDenseGetArrayRead(B, &b));
761: }
763: if (ldx < m || !xiscupm) {
764: // X's array cannot serve as the array (too small or not on device), B's array cannot
765: // serve as the array (const), so allocate a new array
766: ldy = m;
767: PetscCall(PetscCUPMMallocAsync(&y, nrhs * m));
768: } else {
769: // X's array should serve as the array
770: ldy = ldx;
771: y = DeviceArrayWrite(dctx, X);
772: }
773: PetscCall(PetscCUPMMemcpy2DAsync(y, ldy, b, ldb, m, nrhs, copy_mode, stream));
774: if (!bisdevice && !biscupm) PetscCall(MatDenseRestoreArrayRead(B, &b));
775: }
777: // convert to CUPM twice??????????????????????????????????
778: // but A should already be CUPM??????????????????????????????????????
779: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
780: PetscCall(Solver{}.template Solve<transpose>(A, cupmScalarPtrCast(y), ldy, m, nrhs, k, dctx, stream));
781: if (!aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
783: if (ldx < m || !xiscupm) {
784: const auto copy_mode = xiscupm ? cupmMemcpyDeviceToDevice : cupmMemcpyDeviceToHost;
785: PetscScalar *x;
787: // The logic here is that the data is not yet in either X's GPU array or its CPU
788: // array. There is nothing in the interface to say where the user would like it to end up.
789: // So we choose the GPU, because it is the faster option
790: if (xiscupm) {
791: x = DeviceArrayWrite(dctx, X);
792: } else {
793: PetscCall(MatDenseGetArray(X, &x));
794: }
795: PetscCall(PetscCUPMMemcpy2DAsync(x, ldx, y, ldy, k, nrhs, copy_mode, stream));
796: if (!xiscupm) PetscCall(MatDenseRestoreArray(X, &x));
797: PetscCallCUPM(cupmFreeAsync(y, stream));
798: }
799: PetscFunctionReturn(PETSC_SUCCESS);
800: }
802: template <device::cupm::DeviceType T>
803: template <bool transpose, bool hermitian>
804: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAddColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz, PetscInt c_start, PetscInt c_end) noexcept
805: {
806: const auto m = static_cast<cupmBlasInt_t>(A->rmap->n);
807: const auto n = static_cast<cupmBlasInt_t>(c_end - c_start);
808: const auto lda = static_cast<cupmBlasInt_t>(MatIMPLCast(A)->lda);
809: cupmBlasHandle_t handle;
810: PetscDeviceContext dctx;
812: PetscFunctionBegin;
813: if (yy && yy != zz) PetscCall(VecSeq_CUPM::Copy(yy, zz)); // mult add
814: if (!m || !n) {
815: // mult only
816: if (!yy) PetscCall(VecSeq_CUPM::Set(zz, 0.0));
817: PetscFunctionReturn(PETSC_SUCCESS);
818: }
819: PetscCall(PetscInfo(A, "Matrix-vector product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, n));
820: PetscCall(GetHandles_(&dctx, &handle));
821: {
822: constexpr auto op = transpose ? (hermitian ? CUPMBLAS_OP_C : CUPMBLAS_OP_T) : CUPMBLAS_OP_N;
823: const auto one = cupmScalarCast(1.0);
824: const auto zero = cupmScalarCast(0.0);
825: const auto da = DeviceArrayRead(dctx, A);
826: const auto dxx = VecSeq_CUPM::DeviceArrayRead(dctx, xx);
827: const auto dzz = VecSeq_CUPM::DeviceArrayReadWrite(dctx, zz);
829: PetscCall(PetscLogGpuTimeBegin());
830: PetscCallCUPMBLAS(cupmBlasXgemv(handle, op, m, n, &one, da.cupmdata() + c_start * lda, lda, dxx.cupmdata() + (transpose ? 0 : c_start), 1, yy ? &one : &zero, dzz.cupmdata() + (transpose ? c_start : 0), 1));
831: PetscCall(PetscLogGpuTimeEnd());
832: }
833: PetscCall(PetscLogGpuFlops(2.0 * m * n - (yy ? 0 : m)));
834: PetscFunctionReturn(PETSC_SUCCESS);
835: }
837: template <device::cupm::DeviceType T>
838: template <bool transpose, bool hermitian>
839: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultColumnRange_Dispatch_(Mat A, Vec xx, Vec yy, PetscInt c_start, PetscInt c_end) noexcept
840: {
841: PetscFunctionBegin;
842: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, nullptr, yy, c_start, c_end));
843: PetscFunctionReturn(PETSC_SUCCESS);
844: }
846: template <device::cupm::DeviceType T>
847: template <bool transpose, bool hermitian>
848: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMultAdd_Dispatch_(Mat A, Vec xx, Vec yy, Vec zz) noexcept
849: {
850: PetscFunctionBegin;
851: PetscCall(MatMultAddColumnRange_Dispatch_<transpose, hermitian>(A, xx, yy, zz, 0, A->cmap->n));
852: PetscFunctionReturn(PETSC_SUCCESS);
853: }
855: // ==========================================================================================
856: // MatDense_Seq_CUPM - Private API - Conversion Dispatch
857: // ==========================================================================================
859: template <device::cupm::DeviceType T>
860: template <bool to_host>
861: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_Dispatch_(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
862: {
863: PetscFunctionBegin;
864: if (reuse == MAT_REUSE_MATRIX || reuse == MAT_INITIAL_MATRIX) {
865: // TODO these cases should be optimized
866: PetscCall(MatConvert_Basic(M, type, reuse, newmat));
867: } else {
868: const auto B = *newmat;
869: const auto pobj = PetscObjectCast(B);
871: if (to_host) {
872: PetscCall(BindToCPU(B, PETSC_TRUE));
873: PetscCall(Reset(B));
874: } else {
875: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
876: }
878: PetscCall(PetscStrFreeAllocpy(to_host ? VECSTANDARD : VecSeq_CUPM::VECCUPM(), &B->defaultvectype));
879: PetscCall(PetscObjectChangeTypeName(pobj, to_host ? MATSEQDENSE : MATSEQDENSECUPM()));
880: // cvec might be the wrong VecType, destroy and rebuild it if necessary
881: // REVIEW ME: this is possibly very inefficient
882: PetscCall(VecDestroy(&MatIMPLCast(B)->cvec));
884: MatComposeOp_CUPM(to_host, pobj, MatConvert_seqdensecupm_seqdense_C(), nullptr, Convert_SeqDenseCUPM_SeqDense);
885: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArray_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
886: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayRead_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
887: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMGetArrayWrite_C(), nullptr, GetArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
888: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArray_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ_WRITE>);
889: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayRead_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_READ>);
890: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMRestoreArrayWrite_C(), nullptr, RestoreArrayC_<PETSC_MEMTYPE_DEVICE, PETSC_MEMORY_ACCESS_WRITE>);
891: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMPlaceArray_C(), nullptr, PlaceArray);
892: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMResetArray_C(), nullptr, ResetArray);
893: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMReplaceArray_C(), nullptr, ReplaceArray);
894: MatComposeOp_CUPM(to_host, pobj, MatProductSetFromOptions_seqaij_seqdensecupm_C(), nullptr, MatProductSetFromOptions_SeqAIJ_SeqDense);
895: MatComposeOp_CUPM(to_host, pobj, MatDenseCUPMSetPreallocation_C(), nullptr, SetPreallocation);
897: if (to_host) {
898: B->offloadmask = PETSC_OFFLOAD_CPU;
899: } else {
900: Mat_SeqDenseCUPM *mcu;
902: PetscCall(PetscNew(&mcu));
903: B->spptr = mcu;
904: B->offloadmask = PETSC_OFFLOAD_UNALLOCATED; // REVIEW ME: why not offload host??
905: PetscCall(BindToCPU(B, PETSC_FALSE));
906: }
908: MatSetOp_CUPM(to_host, B, bindtocpu, nullptr, BindToCPU);
909: MatSetOp_CUPM(to_host, B, destroy, MatDestroy_SeqDense, Destroy);
910: }
911: PetscFunctionReturn(PETSC_SUCCESS);
912: }
914: // ==========================================================================================
915: // MatDense_Seq_CUPM - Public API
916: // ==========================================================================================
918: template <device::cupm::DeviceType T>
919: inline constexpr MatType MatDense_Seq_CUPM<T>::MATIMPLCUPM_() noexcept
920: {
921: return MATSEQDENSECUPM();
922: }
924: template <device::cupm::DeviceType T>
925: inline constexpr typename MatDense_Seq_CUPM<T>::Mat_SeqDenseCUPM *MatDense_Seq_CUPM<T>::MatCUPMCast(Mat m) noexcept
926: {
927: return static_cast<Mat_SeqDenseCUPM *>(m->spptr);
928: }
930: template <device::cupm::DeviceType T>
931: inline constexpr Mat_SeqDense *MatDense_Seq_CUPM<T>::MatIMPLCast_(Mat m) noexcept
932: {
933: return static_cast<Mat_SeqDense *>(m->data);
934: }
936: template <device::cupm::DeviceType T>
937: inline constexpr const char *MatDense_Seq_CUPM<T>::MatConvert_seqdensecupm_seqdense_C() noexcept
938: {
939: return T == device::cupm::DeviceType::CUDA ? "MatConvert_seqdensecuda_seqdense_C" : "MatConvert_seqdensehip_seqdense_C";
940: }
942: template <device::cupm::DeviceType T>
943: inline constexpr const char *MatDense_Seq_CUPM<T>::MatProductSetFromOptions_seqaij_seqdensecupm_C() noexcept
944: {
945: return T == device::cupm::DeviceType::CUDA ? "MatProductSetFromOptions_seqaij_seqdensecuda_C" : "MatProductSetFromOptions_seqaij_seqdensehip_C";
946: }
948: // ==========================================================================================
950: // MatCreate_SeqDenseCUPM()
951: template <device::cupm::DeviceType T>
952: inline PetscErrorCode MatDense_Seq_CUPM<T>::Create(Mat A) noexcept
953: {
954: PetscFunctionBegin;
955: PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUPM()));
956: PetscCall(MatCreate_SeqDense(A));
957: PetscCall(Convert_SeqDense_SeqDenseCUPM(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
958: PetscFunctionReturn(PETSC_SUCCESS);
959: }
961: template <device::cupm::DeviceType T>
962: inline PetscErrorCode MatDense_Seq_CUPM<T>::Destroy(Mat A) noexcept
963: {
964: PetscFunctionBegin;
965: // prevent copying back data if we own the data pointer
966: if (!MatIMPLCast(A)->user_alloc) A->offloadmask = PETSC_OFFLOAD_CPU;
967: PetscCall(Convert_SeqDenseCUPM_SeqDense(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
968: PetscCall(MatDestroy_SeqDense(A));
969: PetscFunctionReturn(PETSC_SUCCESS);
970: }
972: // obj->ops->setup()
973: template <device::cupm::DeviceType T>
974: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetUp(Mat A) noexcept
975: {
976: PetscFunctionBegin;
977: PetscCall(PetscLayoutSetUp(A->rmap));
978: PetscCall(PetscLayoutSetUp(A->cmap));
979: if (!A->preallocated) {
980: PetscDeviceContext dctx;
982: PetscCall(GetHandles_(&dctx));
983: PetscCall(SetPreallocation(A, dctx, nullptr));
984: }
985: PetscFunctionReturn(PETSC_SUCCESS);
986: }
988: template <device::cupm::DeviceType T>
989: inline PetscErrorCode MatDense_Seq_CUPM<T>::Reset(Mat A) noexcept
990: {
991: PetscFunctionBegin;
992: if (const auto mcu = MatCUPMCast(A)) {
993: cupmStream_t stream;
995: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
996: PetscCall(GetHandles_(&stream));
997: if (!mcu->d_user_alloc) PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
998: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_tau, stream));
999: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_ipiv, stream));
1000: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_info, stream));
1001: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1002: PetscCall(VecDestroy(&mcu->workvec));
1003: PetscCall(PetscFree(A->spptr /* mcu */));
1004: }
1005: PetscFunctionReturn(PETSC_SUCCESS);
1006: }
1008: // ==========================================================================================
1010: template <device::cupm::DeviceType T>
1011: inline PetscErrorCode MatDense_Seq_CUPM<T>::BindToCPU(Mat A, PetscBool to_host) noexcept
1012: {
1013: const auto mimpl = MatIMPLCast(A);
1014: const auto pobj = PetscObjectCast(A);
1016: PetscFunctionBegin;
1017: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1018: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1019: A->boundtocpu = to_host;
1020: PetscCall(PetscStrFreeAllocpy(to_host ? PETSCRANDER48 : PETSCDEVICERAND(), &A->defaultrandtype));
1021: if (to_host) {
1022: PetscDeviceContext dctx;
1024: // make sure we have an up-to-date copy on the CPU
1025: PetscCall(GetHandles_(&dctx));
1026: PetscCall(DeviceToHost_(A, dctx));
1027: } else {
1028: PetscBool iscupm;
1030: if (auto &cvec = mimpl->cvec) {
1031: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cvec), VecSeq_CUPM::VECSEQCUPM(), &iscupm));
1032: if (!iscupm) PetscCall(VecDestroy(&cvec));
1033: }
1034: if (auto &cmat = mimpl->cmat) {
1035: PetscCall(PetscObjectTypeCompare(PetscObjectCast(cmat), MATSEQDENSECUPM(), &iscupm));
1036: if (!iscupm) PetscCall(MatDestroy(&cmat));
1037: }
1038: }
1040: // ============================================================
1041: // Composed ops
1042: // ============================================================
1043: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArray_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ_WRITE>);
1044: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayRead_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_READ>);
1045: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWrite_C", MatDenseGetArray_SeqDense, GetArrayC_<PETSC_MEMTYPE_HOST, PETSC_MEMORY_ACCESS_WRITE>);
1046: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1047: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ_WRITE>);
1048: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayReadAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1049: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayReadAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_READ>);
1050: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetArrayWriteAndMemType_C", nullptr, GetArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1051: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreArrayWriteAndMemType_C", nullptr, RestoreArrayAndMemTypeC_<PETSC_MEMORY_ACCESS_WRITE>);
1052: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVec_C", MatDenseGetColumnVec_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1053: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVec_C", MatDenseRestoreColumnVec_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ_WRITE>);
1054: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecRead_C", MatDenseGetColumnVecRead_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_READ>);
1055: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecRead_C", MatDenseRestoreColumnVecRead_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_READ>);
1056: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetColumnVecWrite_C", MatDenseGetColumnVecWrite_SeqDense, GetColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1057: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreColumnVecWrite_C", MatDenseRestoreColumnVecWrite_SeqDense, RestoreColumnVec<PETSC_MEMORY_ACCESS_WRITE>);
1058: MatComposeOp_CUPM(to_host, pobj, "MatDenseGetSubMatrix_C", MatDenseGetSubMatrix_SeqDense, GetSubMatrix);
1059: MatComposeOp_CUPM(to_host, pobj, "MatDenseRestoreSubMatrix_C", MatDenseRestoreSubMatrix_SeqDense, RestoreSubMatrix);
1060: MatComposeOp_CUPM(to_host, pobj, "MatQRFactor_C", MatQRFactor_SeqDense, SolveQR::Factor);
1061: MatComposeOp_CUPM(to_host, pobj, "MatMultAddColumnRange_C", MatMultAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ false, /* hermitian */ false>);
1062: MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeColumnRange_C", MatMultHermitianTransposeColumnRange_SeqDense, MatMultColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1063: MatComposeOp_CUPM(to_host, pobj, "MatMultHermitianTransposeAddColumnRange_C", MatMultHermitianTransposeAddColumnRange_SeqDense, MatMultAddColumnRange_Dispatch_</* transpose */ true, /* hermitian */ true>);
1064: // always the same
1065: PetscCall(PetscObjectComposeFunction(pobj, "MatDenseSetLDA_C", MatDenseSetLDA_SeqDense));
1067: // ============================================================
1068: // Function pointer ops
1069: // ============================================================
1070: MatSetOp_CUPM(to_host, A, duplicate, MatDuplicate_SeqDense, Duplicate);
1071: MatSetOp_CUPM(to_host, A, mult, MatMult_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>(A, xx, nullptr, yy); });
1072: MatSetOp_CUPM(to_host, A, multtranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>(A, xx, nullptr, yy); });
1073: MatSetOp_CUPM(to_host, A, multhermitiantranspose, MatMultTranspose_SeqDense, [](Mat A, Vec xx, Vec yy) { return MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>(A, xx, nullptr, yy); });
1074: MatSetOp_CUPM(to_host, A, multadd, MatMultAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ false, /* hermitian */ false>);
1075: MatSetOp_CUPM(to_host, A, multtransposeadd, MatMultTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ false>);
1076: MatSetOp_CUPM(to_host, A, multhermitiantransposeadd, MatMultHermitianTransposeAdd_SeqDense, MatMultAdd_Dispatch_</* transpose */ true, /* hermitian */ true>);
1077: MatSetOp_CUPM(to_host, A, matmultnumeric, MatMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ false>);
1078: MatSetOp_CUPM(to_host, A, mattransposemultnumeric, MatMatTransposeMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ false, /* transpose_B */ true>);
1079: MatSetOp_CUPM(to_host, A, transposematmultnumeric, MatTransposeMatMultNumeric_SeqDense_SeqDense, MatMatMult_Numeric_Dispatch</* transpose_A */ true, /* transpose_B */ false>);
1080: MatSetOp_CUPM(to_host, A, axpy, MatAXPY_SeqDense, AXPY);
1081: MatSetOp_CUPM(to_host, A, choleskyfactor, MatCholeskyFactor_SeqDense, SolveCholesky::Factor);
1082: MatSetOp_CUPM(to_host, A, lufactor, MatLUFactor_SeqDense, SolveLU::Factor);
1083: MatSetOp_CUPM(to_host, A, getcolumnvector, MatGetColumnVector_SeqDense, GetColumnVector);
1084: MatSetOp_CUPM(to_host, A, scale, MatScale_SeqDense, Scale);
1085: MatSetOp_CUPM(to_host, A, shift, MatShift_SeqDense, Shift);
1086: MatSetOp_CUPM(to_host, A, copy, MatCopy_SeqDense, Copy);
1087: MatSetOp_CUPM(to_host, A, zeroentries, MatZeroEntries_SeqDense, ZeroEntries);
1088: MatSetOp_CUPM(to_host, A, setup, MatSetUp_SeqDense, SetUp);
1089: MatSetOp_CUPM(to_host, A, setrandom, MatSetRandom_SeqDense, SetRandom);
1090: MatSetOp_CUPM(to_host, A, getdiagonal, MatGetDiagonal_SeqDense, GetDiagonal);
1091: // seemingly always the same
1092: A->ops->productsetfromoptions = MatProductSetFromOptions_SeqDense;
1094: if (const auto cmat = mimpl->cmat) PetscCall(MatBindToCPU(cmat, to_host));
1095: PetscFunctionReturn(PETSC_SUCCESS);
1096: }
1098: template <device::cupm::DeviceType T>
1099: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDenseCUPM_SeqDense(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1100: {
1101: PetscFunctionBegin;
1102: PetscCall(Convert_Dispatch_</* to host */ true>(M, type, reuse, newmat));
1103: PetscFunctionReturn(PETSC_SUCCESS);
1104: }
1106: template <device::cupm::DeviceType T>
1107: inline PetscErrorCode MatDense_Seq_CUPM<T>::Convert_SeqDense_SeqDenseCUPM(Mat M, MatType type, MatReuse reuse, Mat *newmat) noexcept
1108: {
1109: PetscFunctionBegin;
1110: PetscCall(Convert_Dispatch_</* to host */ false>(M, type, reuse, newmat));
1111: PetscFunctionReturn(PETSC_SUCCESS);
1112: }
1114: // ==========================================================================================
1116: template <device::cupm::DeviceType T>
1117: template <PetscMemType mtype, PetscMemoryAccessMode access>
1118: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArray(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1119: {
1120: constexpr auto hostmem = PetscMemTypeHost(mtype);
1121: constexpr auto read_access = PetscMemoryAccessRead(access);
1123: PetscFunctionBegin;
1124: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1125: PetscCall(PetscDeviceContextGetOptionalNullContext_Internal(&dctx));
1126: if (hostmem) {
1127: if (read_access) {
1128: PetscCall(DeviceToHost_(m, dctx));
1129: } else if (!MatIMPLCast(m)->v) {
1130: // MatCreateSeqDenseCUPM may not allocate CPU memory. Allocate if needed
1131: PetscCall(MatSeqDenseSetPreallocation(m, nullptr));
1132: }
1133: *array = MatIMPLCast(m)->v;
1134: } else {
1135: if (read_access) {
1136: PetscCall(HostToDevice_(m, dctx));
1137: } else if (!MatCUPMCast(m)->d_v) {
1138: // write-only
1139: PetscCall(SetPreallocation(m, dctx, nullptr));
1140: }
1141: *array = MatCUPMCast(m)->d_v;
1142: }
1143: if (PetscMemoryAccessWrite(access)) {
1144: m->offloadmask = hostmem ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1145: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1146: }
1147: PetscFunctionReturn(PETSC_SUCCESS);
1148: }
1150: template <device::cupm::DeviceType T>
1151: template <PetscMemType mtype, PetscMemoryAccessMode access>
1152: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArray(Mat m, PetscScalar **array, PetscDeviceContext) noexcept
1153: {
1154: PetscFunctionBegin;
1155: static_assert((mtype == PETSC_MEMTYPE_HOST) || (mtype == PETSC_MEMTYPE_DEVICE), "");
1156: if (PetscMemoryAccessWrite(access)) {
1157: // WRITE or READ_WRITE
1158: m->offloadmask = PetscMemTypeHost(mtype) ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1159: PetscCall(PetscObjectStateIncrease(PetscObjectCast(m)));
1160: }
1161: if (array) {
1162: PetscCall(CheckPointerMatchesMemType_(*array, mtype));
1163: *array = nullptr;
1164: }
1165: PetscFunctionReturn(PETSC_SUCCESS);
1166: }
1168: template <device::cupm::DeviceType T>
1169: template <PetscMemoryAccessMode access>
1170: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetArrayAndMemType(Mat m, PetscScalar **array, PetscMemType *mtype, PetscDeviceContext dctx) noexcept
1171: {
1172: PetscFunctionBegin;
1173: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1174: if (mtype) *mtype = PETSC_MEMTYPE_CUPM();
1175: PetscFunctionReturn(PETSC_SUCCESS);
1176: }
1178: template <device::cupm::DeviceType T>
1179: template <PetscMemoryAccessMode access>
1180: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreArrayAndMemType(Mat m, PetscScalar **array, PetscDeviceContext dctx) noexcept
1181: {
1182: PetscFunctionBegin;
1183: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(m, array, dctx));
1184: PetscFunctionReturn(PETSC_SUCCESS);
1185: }
1187: // ==========================================================================================
1189: template <device::cupm::DeviceType T>
1190: inline PetscErrorCode MatDense_Seq_CUPM<T>::PlaceArray(Mat A, const PetscScalar *array) noexcept
1191: {
1192: const auto mimpl = MatIMPLCast(A);
1193: const auto mcu = MatCUPMCast(A);
1195: PetscFunctionBegin;
1196: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1197: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1198: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1199: if (mimpl->v) {
1200: PetscDeviceContext dctx;
1202: PetscCall(GetHandles_(&dctx));
1203: PetscCall(HostToDevice_(A, dctx));
1204: }
1205: mcu->unplacedarray = util::exchange(mcu->d_v, const_cast<PetscScalar *>(array));
1206: mcu->d_unplaced_user_alloc = util::exchange(mcu->d_user_alloc, PETSC_TRUE);
1207: PetscFunctionReturn(PETSC_SUCCESS);
1208: }
1210: template <device::cupm::DeviceType T>
1211: inline PetscErrorCode MatDense_Seq_CUPM<T>::ReplaceArray(Mat A, const PetscScalar *array) noexcept
1212: {
1213: const auto mimpl = MatIMPLCast(A);
1214: const auto mcu = MatCUPMCast(A);
1216: PetscFunctionBegin;
1217: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1218: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1219: PetscCheck(!mcu->unplacedarray, PETSC_COMM_SELF, PETSC_ERR_ORDER, "MatDense%sResetArray() must be called first", cupmNAME());
1220: if (!mcu->d_user_alloc) {
1221: cupmStream_t stream;
1223: PetscCall(GetHandles_(&stream));
1224: PetscCallCUPM(cupmFreeAsync(mcu->d_v, stream));
1225: }
1226: mcu->d_v = const_cast<PetscScalar *>(array);
1227: mcu->d_user_alloc = PETSC_FALSE;
1228: PetscFunctionReturn(PETSC_SUCCESS);
1229: }
1231: template <device::cupm::DeviceType T>
1232: inline PetscErrorCode MatDense_Seq_CUPM<T>::ResetArray(Mat A) noexcept
1233: {
1234: const auto mimpl = MatIMPLCast(A);
1235: const auto mcu = MatCUPMCast(A);
1237: PetscFunctionBegin;
1238: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1239: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1240: if (mimpl->v) {
1241: PetscDeviceContext dctx;
1243: PetscCall(GetHandles_(&dctx));
1244: PetscCall(HostToDevice_(A, dctx));
1245: }
1246: mcu->d_v = util::exchange(mcu->unplacedarray, nullptr);
1247: mcu->d_user_alloc = mcu->d_unplaced_user_alloc;
1248: PetscFunctionReturn(PETSC_SUCCESS);
1249: }
1251: // ==========================================================================================
1253: template <device::cupm::DeviceType T>
1254: template <bool transpose_A, bool transpose_B>
1255: inline PetscErrorCode MatDense_Seq_CUPM<T>::MatMatMult_Numeric_Dispatch(Mat A, Mat B, Mat C) noexcept
1256: {
1257: cupmBlasInt_t m, n, k;
1258: PetscBool Aiscupm, Biscupm;
1259: PetscDeviceContext dctx;
1260: cupmBlasHandle_t handle;
1262: PetscFunctionBegin;
1263: PetscCall(PetscCUPMBlasIntCast(C->rmap->n, &m));
1264: PetscCall(PetscCUPMBlasIntCast(C->cmap->n, &n));
1265: PetscCall(PetscCUPMBlasIntCast(transpose_A ? A->rmap->n : A->cmap->n, &k));
1266: if (!m || !n || !k) PetscFunctionReturn(PETSC_SUCCESS);
1268: // we may end up with SEQDENSE as one of the arguments
1269: // REVIEW ME: how? and why is it not B and C????????
1270: PetscCall(PetscObjectTypeCompare(PetscObjectCast(A), MATSEQDENSECUPM(), &Aiscupm));
1271: PetscCall(PetscObjectTypeCompare(PetscObjectCast(B), MATSEQDENSECUPM(), &Biscupm));
1272: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &A));
1273: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSECUPM(), MAT_INPLACE_MATRIX, &B));
1274: PetscCall(PetscInfo(C, "Matrix-Matrix product %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " x %" PetscBLASInt_FMT " on backend\n", m, k, n));
1275: PetscCall(GetHandles_(&dctx, &handle));
1277: PetscCall(PetscLogGpuTimeBegin());
1278: {
1279: const auto one = cupmScalarCast(1.0);
1280: const auto zero = cupmScalarCast(0.0);
1281: const auto da = DeviceArrayRead(dctx, A);
1282: const auto db = DeviceArrayRead(dctx, B);
1283: const auto dc = DeviceArrayWrite(dctx, C);
1284: PetscInt alda, blda, clda;
1286: PetscCall(MatDenseGetLDA(A, &alda));
1287: PetscCall(MatDenseGetLDA(B, &blda));
1288: PetscCall(MatDenseGetLDA(C, &clda));
1289: PetscCallCUPMBLAS(cupmBlasXgemm(handle, transpose_A ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, transpose_B ? CUPMBLAS_OP_T : CUPMBLAS_OP_N, m, n, k, &one, da.cupmdata(), alda, db.cupmdata(), blda, &zero, dc.cupmdata(), clda));
1290: }
1291: PetscCall(PetscLogGpuTimeEnd());
1293: PetscCall(PetscLogGpuFlops(1.0 * m * n * k + 1.0 * m * n * (k - 1)));
1294: if (!Aiscupm) PetscCall(MatConvert(A, MATSEQDENSE, MAT_INPLACE_MATRIX, &A));
1295: if (!Biscupm) PetscCall(MatConvert(B, MATSEQDENSE, MAT_INPLACE_MATRIX, &B));
1296: PetscFunctionReturn(PETSC_SUCCESS);
1297: }
1299: template <device::cupm::DeviceType T>
1300: inline PetscErrorCode MatDense_Seq_CUPM<T>::Copy(Mat A, Mat B, MatStructure str) noexcept
1301: {
1302: const auto m = A->rmap->n;
1303: const auto n = A->cmap->n;
1305: PetscFunctionBegin;
1306: PetscAssert(m == B->rmap->n && n == B->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "size(B) != size(A)");
1307: // The two matrices must have the same copy implementation to be eligible for fast copy
1308: if (A->ops->copy == B->ops->copy) {
1309: PetscDeviceContext dctx;
1310: cupmStream_t stream;
1312: PetscCall(GetHandles_(&dctx, &stream));
1313: PetscCall(PetscLogGpuTimeBegin());
1314: {
1315: const auto va = DeviceArrayRead(dctx, A);
1316: const auto vb = DeviceArrayWrite(dctx, B);
1317: // order is important, DeviceArrayRead/Write() might call SetPreallocation() which sets
1318: // lda!
1319: const auto lda_a = MatIMPLCast(A)->lda;
1320: const auto lda_b = MatIMPLCast(B)->lda;
1322: if (lda_a > m || lda_b > m) {
1323: PetscAssert(lda_b > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "B lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_b, cupmNAME());
1324: PetscAssert(lda_a > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A lda (%" PetscBLASInt_FMT ") must be > 0 at this point, this indicates Mat%sSetPreallocation() was not called when it should have been!", lda_a, cupmNAME());
1325: PetscCall(PetscCUPMMemcpy2DAsync(vb.data(), lda_b, va.data(), lda_a, m, n, cupmMemcpyDeviceToDevice, stream));
1326: } else {
1327: PetscCall(PetscCUPMMemcpyAsync(vb.data(), va.data(), m * n, cupmMemcpyDeviceToDevice, stream));
1328: }
1329: }
1330: PetscCall(PetscLogGpuTimeEnd());
1331: } else {
1332: PetscCall(MatCopy_Basic(A, B, str));
1333: }
1334: PetscFunctionReturn(PETSC_SUCCESS);
1335: }
1337: template <device::cupm::DeviceType T>
1338: inline PetscErrorCode MatDense_Seq_CUPM<T>::ZeroEntries(Mat m) noexcept
1339: {
1340: PetscDeviceContext dctx;
1341: cupmStream_t stream;
1343: PetscFunctionBegin;
1344: PetscCall(GetHandles_(&dctx, &stream));
1345: PetscCall(PetscLogGpuTimeBegin());
1346: {
1347: const auto va = DeviceArrayWrite(dctx, m);
1348: const auto lda = MatIMPLCast(m)->lda;
1349: const auto ma = m->rmap->n;
1350: const auto na = m->cmap->n;
1352: if (lda > ma) {
1353: PetscCall(PetscCUPMMemset2DAsync(va.data(), lda, 0, ma, na, stream));
1354: } else {
1355: PetscCall(PetscCUPMMemsetAsync(va.data(), 0, ma * na, stream));
1356: }
1357: }
1358: PetscCall(PetscLogGpuTimeEnd());
1359: PetscFunctionReturn(PETSC_SUCCESS);
1360: }
1362: namespace detail
1363: {
1365: // ==========================================================================================
1366: // SubMatIndexFunctor
1367: //
1368: // Iterator which permutes a linear index range into matrix indices for am nrows x ncols
1369: // submat with leading dimension lda. Essentially SubMatIndexFunctor(i) returns the index for
1370: // the i'th sequential entry in the matrix.
1371: // ==========================================================================================
1372: template <typename T>
1373: struct SubMatIndexFunctor {
1374: PETSC_HOSTDEVICE_INLINE_DECL T operator()(T x) const noexcept { return ((x / nrows) * lda) + (x % nrows); }
1376: PetscInt nrows;
1377: PetscInt ncols;
1378: PetscInt lda;
1379: };
1381: template <typename Iterator>
1382: struct SubMatrixIterator : MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>> {
1383: using base_type = MatrixIteratorBase<Iterator, SubMatIndexFunctor<typename thrust::iterator_difference<Iterator>::type>>;
1385: using iterator = typename base_type::iterator;
1387: constexpr SubMatrixIterator(Iterator first, Iterator last, PetscInt nrows, PetscInt ncols, PetscInt lda) noexcept :
1388: base_type{
1389: std::move(first), std::move(last), {nrows, ncols, lda}
1390: }
1391: {
1392: }
1394: PETSC_NODISCARD iterator end() const noexcept { return this->begin() + (this->func.nrows * this->func.ncols); }
1395: };
1397: namespace
1398: {
1400: template <typename T>
1401: PETSC_NODISCARD inline SubMatrixIterator<typename thrust::device_vector<T>::iterator> make_submat_iterator(PetscInt rstart, PetscInt rend, PetscInt cstart, PetscInt cend, PetscInt lda, T *ptr) noexcept
1402: {
1403: const auto nrows = rend - rstart;
1404: const auto ncols = cend - cstart;
1405: const auto dptr = thrust::device_pointer_cast(ptr);
1407: return {dptr + (rstart * lda) + cstart, dptr + ((rstart + nrows) * lda) + cstart, nrows, ncols, lda};
1408: }
1410: } // namespace
1412: } // namespace detail
1414: template <device::cupm::DeviceType T>
1415: inline PetscErrorCode MatDense_Seq_CUPM<T>::Scale(Mat A, PetscScalar alpha) noexcept
1416: {
1417: const auto m = A->rmap->n;
1418: const auto n = A->cmap->n;
1419: const auto N = m * n;
1420: PetscDeviceContext dctx;
1422: PetscFunctionBegin;
1423: PetscCall(PetscInfo(A, "Performing Scale %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m, n));
1424: PetscCall(GetHandles_(&dctx));
1425: {
1426: const auto da = DeviceArrayReadWrite(dctx, A);
1427: const auto lda = MatIMPLCast(A)->lda;
1429: if (lda > m) {
1430: cupmStream_t stream;
1432: PetscCall(GetHandlesFrom_(dctx, &stream));
1433: // clang-format off
1434: PetscCallThrust(
1435: const auto sub_mat = detail::make_submat_iterator(0, m, 0, n, lda, da.data());
1437: THRUST_CALL(
1438: thrust::transform,
1439: stream,
1440: sub_mat.begin(), sub_mat.end(), sub_mat.begin(),
1441: device::cupm::functors::make_times_equals(alpha)
1442: )
1443: );
1444: // clang-format on
1445: } else {
1446: const auto cu_alpha = cupmScalarCast(alpha);
1447: cupmBlasHandle_t handle;
1449: PetscCall(GetHandlesFrom_(dctx, &handle));
1450: PetscCall(PetscLogGpuTimeBegin());
1451: PetscCallCUPMBLAS(cupmBlasXscal(handle, N, &cu_alpha, da.cupmdata(), 1));
1452: PetscCall(PetscLogGpuTimeEnd());
1453: }
1454: }
1455: PetscCall(PetscLogGpuFlops(N));
1456: PetscFunctionReturn(PETSC_SUCCESS);
1457: }
1459: template <device::cupm::DeviceType T>
1460: inline PetscErrorCode MatDense_Seq_CUPM<T>::AXPY(Mat Y, PetscScalar alpha, Mat X, MatStructure) noexcept
1461: {
1462: const auto m_x = X->rmap->n, m_y = Y->rmap->n;
1463: const auto n_x = X->cmap->n, n_y = Y->cmap->n;
1464: const auto N = m_x * n_x;
1465: PetscDeviceContext dctx;
1467: PetscFunctionBegin;
1468: if (!m_x || !n_x || alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
1469: PetscCall(PetscInfo(Y, "Performing AXPY %" PetscInt_FMT " x %" PetscInt_FMT " on backend\n", m_y, n_y));
1470: PetscCall(GetHandles_(&dctx));
1471: {
1472: const auto dx = DeviceArrayRead(dctx, X);
1473: const auto dy = DeviceArrayReadWrite(dctx, Y);
1474: const auto lda_x = MatIMPLCast(X)->lda;
1475: const auto lda_y = MatIMPLCast(Y)->lda;
1477: if (lda_x > m_x || lda_y > m_x) {
1478: cupmStream_t stream;
1480: PetscCall(GetHandlesFrom_(dctx, &stream));
1481: // clang-format off
1482: PetscCallThrust(
1483: const auto sub_mat_y = detail::make_submat_iterator(0, m_y, 0, n_y, lda_y, dy.data());
1484: const auto sub_mat_x = detail::make_submat_iterator(0, m_x, 0, n_x, lda_x, dx.data());
1486: THRUST_CALL(
1487: thrust::transform,
1488: stream,
1489: sub_mat_x.begin(), sub_mat_x.end(), sub_mat_y.begin(), sub_mat_y.begin(),
1490: device::cupm::functors::make_axpy(alpha)
1491: );
1492: );
1493: // clang-format on
1494: } else {
1495: const auto cu_alpha = cupmScalarCast(alpha);
1496: cupmBlasHandle_t handle;
1498: PetscCall(GetHandlesFrom_(dctx, &handle));
1499: PetscCall(PetscLogGpuTimeBegin());
1500: PetscCallCUPMBLAS(cupmBlasXaxpy(handle, N, &cu_alpha, dx.cupmdata(), 1, dy.cupmdata(), 1));
1501: PetscCall(PetscLogGpuTimeEnd());
1502: }
1503: }
1504: PetscCall(PetscLogGpuFlops(PetscMax(2 * N - 1, 0)));
1505: PetscFunctionReturn(PETSC_SUCCESS);
1506: }
1508: template <device::cupm::DeviceType T>
1509: inline PetscErrorCode MatDense_Seq_CUPM<T>::Duplicate(Mat A, MatDuplicateOption opt, Mat *B) noexcept
1510: {
1511: const auto hopt = (opt == MAT_COPY_VALUES && A->offloadmask != PETSC_OFFLOAD_CPU) ? MAT_DO_NOT_COPY_VALUES : opt;
1512: PetscDeviceContext dctx;
1514: PetscFunctionBegin;
1515: PetscCall(GetHandles_(&dctx));
1516: // do not call SetPreallocation() yet, we call it afterwards??
1517: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, B, dctx, /* preallocate */ false));
1518: PetscCall(MatDuplicateNoCreate_SeqDense(*B, A, hopt));
1519: if (opt == MAT_COPY_VALUES && hopt != MAT_COPY_VALUES) PetscCall(Copy(A, *B, SAME_NONZERO_PATTERN));
1520: // allocate memory if needed
1521: if (opt != MAT_COPY_VALUES && !MatCUPMCast(*B)->d_v) PetscCall(SetPreallocation(*B, dctx, nullptr));
1522: PetscFunctionReturn(PETSC_SUCCESS);
1523: }
1525: template <device::cupm::DeviceType T>
1526: inline PetscErrorCode MatDense_Seq_CUPM<T>::SetRandom(Mat A, PetscRandom rng) noexcept
1527: {
1528: PetscBool device;
1530: PetscFunctionBegin;
1531: PetscCall(PetscObjectTypeCompare(PetscObjectCast(rng), PETSCDEVICERAND(), &device));
1532: if (device) {
1533: const auto m = A->rmap->n;
1534: const auto n = A->cmap->n;
1535: PetscDeviceContext dctx;
1537: PetscCall(GetHandles_(&dctx));
1538: {
1539: const auto a = DeviceArrayWrite(dctx, A);
1540: PetscInt lda;
1542: PetscCall(MatDenseGetLDA(A, &lda));
1543: if (lda > m) {
1544: for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValues(rng, m, a.data() + i * lda));
1545: } else {
1546: PetscInt mn;
1548: PetscCall(PetscIntMultError(m, n, &mn));
1549: PetscCall(PetscRandomGetValues(rng, mn, a));
1550: }
1551: }
1552: } else {
1553: PetscCall(MatSetRandom_SeqDense(A, rng));
1554: }
1555: PetscFunctionReturn(PETSC_SUCCESS);
1556: }
1558: // ==========================================================================================
1560: template <device::cupm::DeviceType T>
1561: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVector(Mat A, Vec v, PetscInt col) noexcept
1562: {
1563: const auto offloadmask = A->offloadmask;
1564: const auto n = A->rmap->n;
1565: const auto col_offset = [&](const PetscScalar *ptr) { return ptr + col * MatIMPLCast(A)->lda; };
1566: PetscBool viscupm;
1567: PetscDeviceContext dctx;
1568: cupmStream_t stream;
1570: PetscFunctionBegin;
1571: PetscCall(PetscObjectTypeCompareAny(PetscObjectCast(v), &viscupm, VecSeq_CUPM::VECSEQCUPM(), VecSeq_CUPM::VECMPICUPM(), VecSeq_CUPM::VECCUPM(), ""));
1572: PetscCall(GetHandles_(&dctx, &stream));
1573: if (viscupm && !v->boundtocpu) {
1574: const auto x = VecSeq_CUPM::DeviceArrayWrite(dctx, v);
1576: // update device data
1577: if (PetscOffloadDevice(offloadmask)) {
1578: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToDevice, stream));
1579: } else {
1580: PetscCall(PetscCUPMMemcpyAsync(x.data(), col_offset(HostArrayRead(dctx, A)), n, cupmMemcpyHostToDevice, stream));
1581: }
1582: } else {
1583: PetscScalar *x;
1585: // update host data
1586: PetscCall(VecGetArrayWrite(v, &x));
1587: if (PetscOffloadUnallocated(offloadmask) || PetscOffloadHost(offloadmask)) {
1588: PetscCall(PetscArraycpy(x, col_offset(HostArrayRead(dctx, A)), n));
1589: } else if (PetscOffloadDevice(offloadmask)) {
1590: PetscCall(PetscCUPMMemcpyAsync(x, col_offset(DeviceArrayRead(dctx, A)), n, cupmMemcpyDeviceToHost, stream));
1591: }
1592: PetscCall(VecRestoreArrayWrite(v, &x));
1593: }
1594: PetscFunctionReturn(PETSC_SUCCESS);
1595: }
1597: template <device::cupm::DeviceType T>
1598: template <PetscMemoryAccessMode access>
1599: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetColumnVec(Mat A, PetscInt col, Vec *v) noexcept
1600: {
1601: using namespace vec::cupm;
1602: const auto mimpl = MatIMPLCast(A);
1603: PetscDeviceContext dctx;
1605: PetscFunctionBegin;
1606: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1607: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1608: mimpl->vecinuse = col + 1;
1609: if (!mimpl->cvec) PetscCall(MatDenseCreateColumnVec_Private(A, &mimpl->cvec));
1610: PetscCall(GetHandles_(&dctx));
1611: PetscCall(GetArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1612: PetscCall(VecCUPMPlaceArrayAsync<T>(mimpl->cvec, mimpl->ptrinuse + static_cast<std::size_t>(col) * static_cast<std::size_t>(mimpl->lda)));
1613: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPush(mimpl->cvec));
1614: *v = mimpl->cvec;
1615: PetscFunctionReturn(PETSC_SUCCESS);
1616: }
1618: template <device::cupm::DeviceType T>
1619: template <PetscMemoryAccessMode access>
1620: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreColumnVec(Mat A, PetscInt, Vec *v) noexcept
1621: {
1622: using namespace vec::cupm;
1623: const auto mimpl = MatIMPLCast(A);
1624: const auto cvec = mimpl->cvec;
1625: PetscDeviceContext dctx;
1627: PetscFunctionBegin;
1628: PetscCheck(mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetColumnVec() first");
1629: PetscCheck(cvec, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column vector");
1630: mimpl->vecinuse = 0;
1631: if (access == PETSC_MEMORY_ACCESS_READ) PetscCall(VecLockReadPop(cvec));
1632: PetscCall(VecCUPMResetArrayAsync<T>(cvec));
1633: PetscCall(GetHandles_(&dctx));
1634: PetscCall(RestoreArray<PETSC_MEMTYPE_DEVICE, access>(A, const_cast<PetscScalar **>(&mimpl->ptrinuse), dctx));
1635: if (v) *v = nullptr;
1636: PetscFunctionReturn(PETSC_SUCCESS);
1637: }
1639: // ==========================================================================================
1641: template <device::cupm::DeviceType T>
1642: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetFactor(Mat A, MatFactorType ftype, Mat *fact_out) noexcept
1643: {
1644: Mat fact = nullptr;
1645: PetscDeviceContext dctx;
1647: PetscFunctionBegin;
1648: PetscCall(GetHandles_(&dctx));
1649: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), A->rmap->n, A->cmap->n, nullptr, &fact, dctx, /* preallocate */ false));
1650: fact->factortype = ftype;
1651: switch (ftype) {
1652: case MAT_FACTOR_LU:
1653: case MAT_FACTOR_ILU: // fall-through
1654: fact->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1655: fact->ops->ilufactorsymbolic = MatLUFactorSymbolic_SeqDense;
1656: break;
1657: case MAT_FACTOR_CHOLESKY:
1658: case MAT_FACTOR_ICC: // fall-through
1659: fact->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqDense;
1660: break;
1661: case MAT_FACTOR_QR: {
1662: const auto pobj = PetscObjectCast(fact);
1664: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactor_C", MatQRFactor_SeqDense));
1665: PetscCall(PetscObjectComposeFunction(pobj, "MatQRFactorSymbolic_C", MatQRFactorSymbolic_SeqDense));
1666: } break;
1667: case MAT_FACTOR_NONE:
1668: case MAT_FACTOR_ILUDT: // fall-through
1669: case MAT_FACTOR_NUM_TYPES: // fall-through
1670: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "MatFactorType %s not supported", MatFactorTypes[ftype]);
1671: }
1672: PetscCall(PetscStrFreeAllocpy(MATSOLVERCUPM(), &fact->solvertype));
1673: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_LU));
1674: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ILU));
1675: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_CHOLESKY));
1676: PetscCall(PetscStrallocpy(MATORDERINGEXTERNAL, const_cast<char **>(fact->preferredordering) + MAT_FACTOR_ICC));
1677: *fact_out = fact;
1678: PetscFunctionReturn(PETSC_SUCCESS);
1679: }
1681: template <device::cupm::DeviceType T>
1682: inline PetscErrorCode MatDense_Seq_CUPM<T>::InvertFactors(Mat A) noexcept
1683: {
1684: const auto mimpl = MatIMPLCast(A);
1685: const auto mcu = MatCUPMCast(A);
1686: const auto n = static_cast<cupmBlasInt_t>(A->cmap->n);
1687: cupmSolverHandle_t handle;
1688: PetscDeviceContext dctx;
1689: cupmStream_t stream;
1691: PetscFunctionBegin;
1692: #if PetscDefined(HAVE_CUDA) && PetscDefined(USING_NVCC)
1693: // HIP appears to have this by default??
1694: PetscCheck(PETSC_PKG_CUDA_VERSION_GE(10, 1, 0), PETSC_COMM_SELF, PETSC_ERR_SUP, "Upgrade to CUDA version 10.1.0 or higher");
1695: #endif
1696: if (!n || !A->rmap->n) PetscFunctionReturn(PETSC_SUCCESS);
1697: PetscCheck(A->factortype == MAT_FACTOR_CHOLESKY, PETSC_COMM_SELF, PETSC_ERR_LIB, "Factor type %s not implemented", MatFactorTypes[A->factortype]);
1698: // spd
1699: PetscCheck(!mcu->d_fact_ipiv, PETSC_COMM_SELF, PETSC_ERR_LIB, "%sDnsytri not implemented", cupmSolverName());
1701: PetscCall(GetHandles_(&dctx, &handle, &stream));
1702: {
1703: const auto da = DeviceArrayReadWrite(dctx, A);
1704: const auto lda = static_cast<cupmBlasInt_t>(mimpl->lda);
1705: cupmBlasInt_t il;
1707: PetscCallCUPMSOLVER(cupmSolverXpotri_bufferSize(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, &il));
1708: if (il > mcu->d_fact_lwork) {
1709: mcu->d_fact_lwork = il;
1710: PetscCallCUPM(cupmFreeAsync(mcu->d_fact_work, stream));
1711: PetscCall(PetscCUPMMallocAsync(&mcu->d_fact_work, il, stream));
1712: }
1713: PetscCall(PetscLogGpuTimeBegin());
1714: PetscCallCUPMSOLVER(cupmSolverXpotri(handle, CUPMSOLVER_FILL_MODE_LOWER, n, da.cupmdata(), lda, mcu->d_fact_work, mcu->d_fact_lwork, mcu->d_fact_info));
1715: PetscCall(PetscLogGpuTimeEnd());
1716: }
1717: PetscCall(CheckCUPMSolverInfo_(mcu->d_fact_info, stream));
1718: // TODO (write cuda kernel)
1719: PetscCall(MatSeqDenseSymmetrize_Private(A, PETSC_TRUE));
1720: PetscCall(PetscLogGpuFlops(1.0 * n * n * n / 3.0));
1722: A->ops->solve = nullptr;
1723: A->ops->solvetranspose = nullptr;
1724: A->ops->matsolve = nullptr;
1725: A->factortype = MAT_FACTOR_NONE;
1727: PetscCall(PetscFree(A->solvertype));
1728: PetscFunctionReturn(PETSC_SUCCESS);
1729: }
1731: // ==========================================================================================
1733: template <device::cupm::DeviceType T>
1734: inline PetscErrorCode MatDense_Seq_CUPM<T>::GetSubMatrix(Mat A, PetscInt rbegin, PetscInt rend, PetscInt cbegin, PetscInt cend, Mat *mat) noexcept
1735: {
1736: const auto mimpl = MatIMPLCast(A);
1737: const auto array_offset = [&](PetscScalar *ptr) { return ptr + rbegin + static_cast<std::size_t>(cbegin) * mimpl->lda; };
1738: const auto n = rend - rbegin;
1739: const auto m = cend - cbegin;
1740: auto &cmat = mimpl->cmat;
1741: PetscDeviceContext dctx;
1743: PetscFunctionBegin;
1744: PetscCheck(!mimpl->vecinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreColumnVec() first");
1745: PetscCheck(!mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseRestoreSubMatrix() first");
1746: mimpl->matinuse = cbegin + 1;
1748: PetscCall(GetHandles_(&dctx));
1749: PetscCall(HostToDevice_(A, dctx));
1751: if (cmat && ((m != cmat->cmap->N) || (n != cmat->rmap->N))) PetscCall(MatDestroy(&cmat));
1752: {
1753: const auto device_array = array_offset(MatCUPMCast(A)->d_v);
1755: if (cmat) {
1756: PetscCall(PlaceArray(cmat, device_array));
1757: } else {
1758: PetscCall(MatCreateSeqDenseCUPM<T>(PetscObjectComm(PetscObjectCast(A)), n, m, device_array, &cmat, dctx));
1759: }
1760: }
1761: PetscCall(MatDenseSetLDA(cmat, mimpl->lda));
1762: // place CPU array if present but do not copy any data
1763: if (const auto host_array = mimpl->v) {
1764: cmat->offloadmask = PETSC_OFFLOAD_GPU;
1765: PetscCall(MatDensePlaceArray(cmat, array_offset(host_array)));
1766: }
1768: cmat->offloadmask = A->offloadmask;
1769: *mat = cmat;
1770: PetscFunctionReturn(PETSC_SUCCESS);
1771: }
1773: template <device::cupm::DeviceType T>
1774: inline PetscErrorCode MatDense_Seq_CUPM<T>::RestoreSubMatrix(Mat A, Mat *m) noexcept
1775: {
1776: const auto mimpl = MatIMPLCast(A);
1777: const auto cmat = mimpl->cmat;
1778: const auto reset = static_cast<bool>(mimpl->v);
1779: bool copy, was_offload_host;
1781: PetscFunctionBegin;
1782: PetscCheck(mimpl->matinuse, PETSC_COMM_SELF, PETSC_ERR_ORDER, "Need to call MatDenseGetSubMatrix() first");
1783: PetscCheck(cmat, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Missing internal column matrix");
1784: PetscCheck(*m == cmat, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Not the matrix obtained from MatDenseGetSubMatrix()");
1785: mimpl->matinuse = 0;
1787: // calls to ResetArray may change it, so save it here
1788: was_offload_host = cmat->offloadmask == PETSC_OFFLOAD_CPU;
1789: if (was_offload_host && !reset) {
1790: copy = true;
1791: PetscCall(MatSeqDenseSetPreallocation(A, nullptr));
1792: } else {
1793: copy = false;
1794: }
1796: PetscCall(ResetArray(cmat));
1797: if (reset) PetscCall(MatDenseResetArray(cmat));
1798: if (copy) {
1799: PetscDeviceContext dctx;
1801: PetscCall(GetHandles_(&dctx));
1802: PetscCall(DeviceToHost_(A, dctx));
1803: } else {
1804: A->offloadmask = was_offload_host ? PETSC_OFFLOAD_CPU : PETSC_OFFLOAD_GPU;
1805: }
1807: cmat->offloadmask = PETSC_OFFLOAD_UNALLOCATED;
1808: *m = nullptr;
1809: PetscFunctionReturn(PETSC_SUCCESS);
1810: }
1812: // ==========================================================================================
1814: namespace
1815: {
1817: template <device::cupm::DeviceType T>
1818: inline PetscErrorCode MatMatMultNumeric_SeqDenseCUPM_SeqDenseCUPM(Mat A, Mat B, Mat C, PetscBool TA, PetscBool TB) noexcept
1819: {
1820: PetscFunctionBegin;
1821: if (TA) {
1822: if (TB) {
1823: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, true>(A, B, C));
1824: } else {
1825: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<true, false>(A, B, C));
1826: }
1827: } else {
1828: if (TB) {
1829: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, true>(A, B, C));
1830: } else {
1831: PetscCall(MatDense_Seq_CUPM<T>::template MatMatMult_Numeric_Dispatch<false, false>(A, B, C));
1832: }
1833: }
1834: PetscFunctionReturn(PETSC_SUCCESS);
1835: }
1837: template <device::cupm::DeviceType T>
1838: inline PetscErrorCode MatSolverTypeRegister_DENSECUPM() noexcept
1839: {
1840: PetscFunctionBegin;
1841: for (auto ftype : util::make_array(MAT_FACTOR_LU, MAT_FACTOR_CHOLESKY, MAT_FACTOR_QR)) {
1842: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MATSEQDENSE, ftype, MatDense_Seq_CUPM<T>::GetFactor));
1843: PetscCall(MatSolverTypeRegister(MatDense_Seq_CUPM<T>::MATSOLVERCUPM(), MatDense_Seq_CUPM<T>::MATSEQDENSECUPM(), ftype, MatDense_Seq_CUPM<T>::GetFactor));
1844: }
1845: PetscFunctionReturn(PETSC_SUCCESS);
1846: }
1848: } // anonymous namespace
1850: } // namespace impl
1852: } // namespace cupm
1854: } // namespace mat
1856: } // namespace Petsc