Actual source code: sfmpi.c
1: #include <../src/vec/is/sf/impls/basic/sfpack.h>
3: // Though there is no default mechanism to start a communication, we do have a
4: // default way to finish communication, which is simply waiting on the requests.
5: // It works for non-blocking and persistent send/recvs as well as collectives.
6: static PetscErrorCode PetscSFLinkFinishCommunication_Default(PetscSF sf, PetscSFLink link, PetscSFDirection direction)
7: {
8: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
9: const PetscMemType rootmtype_mpi = link->rootmtype_mpi, leafmtype_mpi = link->leafmtype_mpi;
10: const PetscInt rootdirect_mpi = link->rootdirect_mpi, leafdirect_mpi = link->leafdirect_mpi;
12: PetscFunctionBegin;
13: if (bas->nrootreqs) PetscCallMPI(MPI_Waitall(bas->nrootreqs, link->rootreqs[direction][rootmtype_mpi][rootdirect_mpi], MPI_STATUSES_IGNORE));
14: if (sf->nleafreqs) PetscCallMPI(MPI_Waitall(sf->nleafreqs, link->leafreqs[direction][leafmtype_mpi][leafdirect_mpi], MPI_STATUSES_IGNORE));
15: if (direction == PETSCSF_ROOT2LEAF) {
16: PetscCall(PetscSFLinkCopyLeafBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE /* host2device after receiving */));
17: } else {
18: PetscCall(PetscSFLinkCopyRootBufferInCaseNotUseGpuAwareMPI(sf, link, PETSC_FALSE));
19: }
20: PetscFunctionReturn(PETSC_SUCCESS);
21: }
23: /*
24: This routine creates a communication link for the given operation. It first looks up the link cache. If
25: there is a free and suitable link, it is reused; otherwise a new one is created.
27: A link contains buffers and MPI requests for send/recv, as well as routines to pack/unpack
28: root/leafdata to/from these buffers. Buffers are allocated at our discretion. When root/leafdata
29: can be directly passed to MPI, we do not allocate them. Even when we do allocate buffers, we only allocate
30: those needed by the given `sfop` and `op`; in other words, memory allocation is lazy.
32: The routine also allocates buffers on the CPU when gpu-aware MPI is not used but the data is on the GPU.
34: In SFBasic, MPI requests are persistent. They are not init'ed until we try to get requests from a link.
36: The routine is shared by SFBasic and SFNeighbor, since both deal with sparse graphs and
37: need to pack/unpack data.
38: */
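// A hedged usage sketch (not verbatim PETSc source; helper names follow sfpack.h, but the exact signatures and
// the link->StartCommunication member are assumptions) of how a caller such as SFBasic's Bcast might drive a
// link created by this routine:
//
//   PetscSFLink link;
//   PetscCall(PetscSFLinkCreate(sf, unit, rootmtype, rootdata, leafmtype, leafdata, MPI_REPLACE, PETSCSF_BCAST, &link));
//   PetscCall(PetscSFLinkPackRootData(sf, link, PETSCSF_REMOTE, rootdata)); /* fills link->rootbuf; rootbuf aliases rootdata when rootdirect */
//   PetscCall(link->StartCommunication(sf, link, PETSCSF_ROOT2LEAF));       /* post sends/recvs with link->rootreqs/leafreqs */
//   /* ... later, in the matching End routine ... */
//   PetscCall(link->FinishCommunication(sf, link, PETSCSF_ROOT2LEAF));      /* e.g., PetscSFLinkFinishCommunication_Default() above */
//   PetscCall(PetscSFLinkUnpackLeafData(sf, link, PETSCSF_REMOTE, leafdata, MPI_REPLACE));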
39: PetscErrorCode PetscSFLinkCreate_MPI(PetscSF sf, MPI_Datatype unit, PetscMemType xrootmtype, const void *rootdata, PetscMemType xleafmtype, const void *leafdata, MPI_Op op, PetscSFOperation sfop, PetscSFLink *mylink)
40: {
41: PetscSF_Basic *bas = (PetscSF_Basic *)sf->data;
42: PetscInt i, j, k, nrootreqs, nleafreqs, nreqs;
43: PetscSFLink *p, link;
44: PetscSFDirection direction;
45: MPI_Request *reqs = NULL;
46: PetscBool match, rootdirect[2], leafdirect[2];
47: PetscMemType rootmtype = PetscMemTypeHost(xrootmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE; /* Convert to 0/1 as we will use it as an array subscript */
48: PetscMemType leafmtype = PetscMemTypeHost(xleafmtype) ? PETSC_MEMTYPE_HOST : PETSC_MEMTYPE_DEVICE;
49: PetscMemType rootmtype_mpi, leafmtype_mpi; /* mtypes seen by MPI */
50: PetscInt rootdirect_mpi, leafdirect_mpi; /* root/leafdirect seen by MPI */
52: PetscFunctionBegin;
53: /* Can we directly use root/leafdirect with the given sf, sfop and op? */
54: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
55: if (sfop == PETSCSF_BCAST) {
56: rootdirect[i] = bas->rootcontig[i]; /* Pack roots */
57: leafdirect[i] = (sf->leafcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack leaves */
58: } else if (sfop == PETSCSF_REDUCE) {
59: leafdirect[i] = sf->leafcontig[i]; /* Pack leaves */
60: rootdirect[i] = (bas->rootcontig[i] && op == MPI_REPLACE) ? PETSC_TRUE : PETSC_FALSE; /* Unpack roots */
61: } else { /* PETSCSF_FETCH */
62: rootdirect[i] = PETSC_FALSE; /* FETCH always needs a separate rootbuf */
63: leafdirect[i] = PETSC_FALSE; /* We also force allocating a separate leafbuf so that leafdata and leafupdate can share MPI requests */
64: }
65: }
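// For example, a Bcast with op = MPI_SUM must add received values into leafdata, so leaves go through a
// separate leafbuf (leafdirect = FALSE) even when they are contiguous; with op = MPI_REPLACE and contiguous
// leaves, MPI can receive directly into leafdata.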
67: // NEVER use root/leafdirect[] for persistent collectives. Otherwise, suppose that on the first call all ranks build
68: // a persistent MPI request in a collective call. Then in a second call to PetscSFBcast, one rank uses root/leafdirect
69: // but with new rootdata/leafdata pointers, while the other ranks keep the same rootdata/leafdata pointers as last time.
70: // Only that rank will try to rebuild the request with a collective call, resulting in a hang. We could call
71: // MPI_Allreduce() every time to detect changes in root/leafdata, but that is too expensive for sparse communication.
72: // So we always set root/leafdirect[] to false and allocate additional root/leaf buffers for persistent collectives.
73: if (sf->persistent && sf->collective) {
74: rootdirect[PETSCSF_REMOTE] = PETSC_FALSE;
75: leafdirect[PETSCSF_REMOTE] = PETSC_FALSE;
76: }
78: if (sf->use_gpu_aware_mpi) {
79: rootmtype_mpi = rootmtype;
80: leafmtype_mpi = leafmtype;
81: } else {
82: rootmtype_mpi = leafmtype_mpi = PETSC_MEMTYPE_HOST;
83: }
84: /* Will root/leafdata be directly accessed by MPI? Without use_gpu_aware_mpi, device data is buffered on host and then passed to MPI */
85: rootdirect_mpi = rootdirect[PETSCSF_REMOTE] && (rootmtype_mpi == rootmtype) ? 1 : 0;
86: leafdirect_mpi = leafdirect[PETSCSF_REMOTE] && (leafmtype_mpi == leafmtype) ? 1 : 0;
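// Example: contiguous remote roots on the GPU without gpu-aware MPI give rootdirect[PETSCSF_REMOTE] = TRUE
// (rootbuf aliases rootdata, so no packing), yet rootdirect_mpi = 0 because MPI only sees the host staging
// buffer allocated further below.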
88: direction = (sfop == PETSCSF_BCAST) ? PETSCSF_ROOT2LEAF : PETSCSF_LEAF2ROOT;
89: nrootreqs = bas->nrootreqs;
90: nleafreqs = sf->nleafreqs;
92: /* Look for free links in cache */
93: for (p = &bas->avail; (link = *p); p = &link->next) {
94: if (!link->use_nvshmem) { /* Only check with MPI links */
95: PetscCall(MPIPetsc_Type_compare(unit, link->unit, &match));
96: if (match) {
97: /* If root/leafdata will be directly passed to MPI, test whether the data used to initialize the MPI requests matches the current data.
98: If not, free the old requests. New requests will be lazily init'ed when one calls PetscSFLinkGetMPIBuffersAndRequests() with the same tag.
99: */
100: if (rootdirect_mpi && sf->persistent && link->rootreqsinited[direction][rootmtype][1] && link->rootdatadirect[direction][rootmtype] != rootdata) {
101: reqs = link->rootreqs[direction][rootmtype][1]; /* Here, rootmtype = rootmtype_mpi */
102: for (i = 0; i < nrootreqs; i++) {
103: if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
104: }
105: link->rootreqsinited[direction][rootmtype][1] = PETSC_FALSE;
106: }
107: if (leafdirect_mpi && sf->persistent && link->leafreqsinited[direction][leafmtype][1] && link->leafdatadirect[direction][leafmtype] != leafdata) {
108: reqs = link->leafreqs[direction][leafmtype][1];
109: for (i = 0; i < nleafreqs; i++) {
110: if (reqs[i] != MPI_REQUEST_NULL) PetscCallMPI(MPI_Request_free(&reqs[i]));
111: }
112: link->leafreqsinited[direction][leafmtype][1] = PETSC_FALSE;
113: }
114: *p = link->next; /* Remove from available list */
115: goto found;
116: }
117: }
118: }
120: PetscCall(PetscNew(&link));
121: PetscCall(PetscSFLinkSetUp_Host(sf, link, unit));
122: PetscCall(PetscCommGetNewTag(PetscObjectComm((PetscObject)sf), &link->tag)); /* One tag per link */
124: nreqs = (nrootreqs + nleafreqs) * 8;
125: PetscCall(PetscMalloc1(nreqs, &link->reqs));
126: for (i = 0; i < nreqs; i++) link->reqs[i] = MPI_REQUEST_NULL; /* Init to MPI_REQUEST_NULL so that we know which ones need to be freed in Destroy */
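// The request pool is partitioned as [2 directions][2 memory types][2 direct flags] per side, with all root
// slots first. For example, with nrootreqs = 3 and nleafreqs = 2, rootreqs[1][0][1] = reqs + 3*(4*1 + 2*0 + 1)
// = reqs + 15, and leafreqs[0][1][0] = reqs + 3*8 + 2*(4*0 + 2*1 + 0) = reqs + 28.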
128: if (nreqs)
129: for (i = 0; i < 2; i++) { /* Two communication directions */
130: for (j = 0; j < 2; j++) { /* Two memory types */
131: for (k = 0; k < 2; k++) { /* root/leafdirect 0 or 1 */
132: link->rootreqs[i][j][k] = link->reqs + nrootreqs * (4 * i + 2 * j + k);
133: link->leafreqs[i][j][k] = link->reqs + nrootreqs * 8 + nleafreqs * (4 * i + 2 * j + k);
134: }
135: }
136: }
138: link->FinishCommunication = PetscSFLinkFinishCommunication_Default;
139: // Each SF type can customize its communication by setting function pointers in the link.
140: // Currently only BASIC and NEIGHBOR use this abstraction.
141: PetscTryTypeMethod(sf, SetCommunicationOps, link);
143: found:
145: #if defined(PETSC_HAVE_DEVICE)
146: if ((PetscMemTypeDevice(xrootmtype) || PetscMemTypeDevice(xleafmtype)) && !link->deviceinited) {
147: #if defined(PETSC_HAVE_CUDA)
148: if (sf->backend == PETSCSF_BACKEND_CUDA) PetscCall(PetscSFLinkSetUp_CUDA(sf, link, unit)); /* Setup streams etc */
149: #endif
150: #if defined(PETSC_HAVE_HIP)
151: if (sf->backend == PETSCSF_BACKEND_HIP) PetscCall(PetscSFLinkSetUp_HIP(sf, link, unit)); /* Setup streams etc */
152: #endif
153: #if defined(PETSC_HAVE_KOKKOS)
154: if (sf->backend == PETSCSF_BACKEND_KOKKOS) PetscCall(PetscSFLinkSetUp_Kokkos(sf, link, unit));
155: #endif
156: }
157: #endif
159: /* Allocate buffers alongside root/leafdata */
160: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
161: /* For local communication, buffers are only needed when roots and leaves have different mtypes */
162: if (i == PETSCSF_LOCAL && rootmtype == leafmtype) continue;
163: if (bas->rootbuflen[i]) {
164: if (rootdirect[i]) { /* Aha, we disguise rootdata as rootbuf */
165: link->rootbuf[i][rootmtype] = (char *)rootdata + bas->rootstart[i] * link->unitbytes;
166: } else { /* Have to have a separate rootbuf */
167: if (!link->rootbuf_alloc[i][rootmtype]) PetscCall(PetscSFMalloc(sf, rootmtype, bas->rootbuflen[i] * link->unitbytes, (void **)&link->rootbuf_alloc[i][rootmtype]));
168: link->rootbuf[i][rootmtype] = link->rootbuf_alloc[i][rootmtype];
169: }
170: }
172: if (sf->leafbuflen[i]) {
173: if (leafdirect[i]) {
174: link->leafbuf[i][leafmtype] = (char *)leafdata + sf->leafstart[i] * link->unitbytes;
175: } else {
176: if (!link->leafbuf_alloc[i][leafmtype]) PetscCall(PetscSFMalloc(sf, leafmtype, sf->leafbuflen[i] * link->unitbytes, (void **)&link->leafbuf_alloc[i][leafmtype]));
177: link->leafbuf[i][leafmtype] = link->leafbuf_alloc[i][leafmtype];
178: }
179: }
180: }
182: #if defined(PETSC_HAVE_DEVICE)
183: /* Allocate host buffers for staging device data in case gpu-aware MPI is not used */
184: if (PetscMemTypeDevice(rootmtype) && PetscMemTypeHost(rootmtype_mpi)) {
185: if (!link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(bas->rootbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
186: link->rootbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->rootbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
187: }
188: if (PetscMemTypeDevice(leafmtype) && PetscMemTypeHost(leafmtype_mpi)) {
189: if (!link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]) PetscCall(PetscMalloc(sf->leafbuflen[PETSCSF_REMOTE] * link->unitbytes, &link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST]));
190: link->leafbuf[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST] = link->leafbuf_alloc[PETSCSF_REMOTE][PETSC_MEMTYPE_HOST];
191: }
192: #endif
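// These host buffers stage the remote device buffers above around the MPI calls; e.g., after the receives
// complete, PetscSFLinkFinishCommunication_Default() copies them back to the device side via
// PetscSFLinkCopy{Leaf,Root}BufferInCaseNotUseGpuAwareMPI().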
194: /* Set the `current` state of the link. These fields may change between SF invocations that reuse the same link */
195: if (sf->persistent) { /* If data is directly passed to MPI and used to init MPI requests, record it for comparison in future invocations */
196: if (rootdirect_mpi) link->rootdatadirect[direction][rootmtype] = rootdata;
197: if (leafdirect_mpi) link->leafdatadirect[direction][leafmtype] = leafdata;
198: }
200: link->rootdata = rootdata; /* root/leafdata are keys to look up links in PetscSFXxxEnd */
201: link->leafdata = leafdata;
202: for (i = PETSCSF_LOCAL; i <= PETSCSF_REMOTE; i++) {
203: link->rootdirect[i] = rootdirect[i];
204: link->leafdirect[i] = leafdirect[i];
205: }
206: link->rootdirect_mpi = rootdirect_mpi;
207: link->leafdirect_mpi = leafdirect_mpi;
208: link->rootmtype = rootmtype;
209: link->leafmtype = leafmtype;
210: link->rootmtype_mpi = rootmtype_mpi;
211: link->leafmtype_mpi = leafmtype_mpi;
213: link->next = bas->inuse;
214: bas->inuse = link;
215: *mylink = link;
216: PetscFunctionReturn(PETSC_SUCCESS);
217: }