Actual source code: baijsolvnat14.c
1: #include <../src/mat/impls/baij/seq/baij.h>
2: #include <petsc/private/kernels/blockinvert.h>
4: /* Block operations are done by accessing one column at a time */
5: /* Default MatSolve for block size 14 */
7: PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A, Vec bb, Vec xx)
8: {
9: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
10: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
11: PetscInt i, k, nz, idx, idt, m;
12: const MatScalar *aa = a->a, *v;
13: PetscScalar s[14];
14: PetscScalar *x, xv;
15: const PetscScalar *b;
17: PetscFunctionBegin;
18: PetscCall(VecGetArrayRead(bb, &b));
19: PetscCall(VecGetArray(xx, &x));
21: /* forward solve the lower triangular */
22: for (i = 0; i < n; i++) {
23: v = aa + bs2 * ai[i];
24: vi = aj + ai[i];
25: nz = ai[i + 1] - ai[i];
26: idt = bs * i;
27: x[idt] = b[idt];
28: x[1 + idt] = b[1 + idt];
29: x[2 + idt] = b[2 + idt];
30: x[3 + idt] = b[3 + idt];
31: x[4 + idt] = b[4 + idt];
32: x[5 + idt] = b[5 + idt];
33: x[6 + idt] = b[6 + idt];
34: x[7 + idt] = b[7 + idt];
35: x[8 + idt] = b[8 + idt];
36: x[9 + idt] = b[9 + idt];
37: x[10 + idt] = b[10 + idt];
38: x[11 + idt] = b[11 + idt];
39: x[12 + idt] = b[12 + idt];
40: x[13 + idt] = b[13 + idt];
41: for (m = 0; m < nz; m++) {
42: idx = bs * vi[m];
43: for (k = 0; k < bs; k++) {
44: xv = x[k + idx];
45: x[idt] -= v[0] * xv;
46: x[1 + idt] -= v[1] * xv;
47: x[2 + idt] -= v[2] * xv;
48: x[3 + idt] -= v[3] * xv;
49: x[4 + idt] -= v[4] * xv;
50: x[5 + idt] -= v[5] * xv;
51: x[6 + idt] -= v[6] * xv;
52: x[7 + idt] -= v[7] * xv;
53: x[8 + idt] -= v[8] * xv;
54: x[9 + idt] -= v[9] * xv;
55: x[10 + idt] -= v[10] * xv;
56: x[11 + idt] -= v[11] * xv;
57: x[12 + idt] -= v[12] * xv;
58: x[13 + idt] -= v[13] * xv;
59: v += 14;
60: }
61: }
62: }
63: /* backward solve the upper triangular */
64: for (i = n - 1; i >= 0; i--) {
65: v = aa + bs2 * (adiag[i + 1] + 1);
66: vi = aj + adiag[i + 1] + 1;
67: nz = adiag[i] - adiag[i + 1] - 1;
68: idt = bs * i;
69: s[0] = x[idt];
70: s[1] = x[1 + idt];
71: s[2] = x[2 + idt];
72: s[3] = x[3 + idt];
73: s[4] = x[4 + idt];
74: s[5] = x[5 + idt];
75: s[6] = x[6 + idt];
76: s[7] = x[7 + idt];
77: s[8] = x[8 + idt];
78: s[9] = x[9 + idt];
79: s[10] = x[10 + idt];
80: s[11] = x[11 + idt];
81: s[12] = x[12 + idt];
82: s[13] = x[13 + idt];
84: for (m = 0; m < nz; m++) {
85: idx = bs * vi[m];
86: for (k = 0; k < bs; k++) {
87: xv = x[k + idx];
88: s[0] -= v[0] * xv;
89: s[1] -= v[1] * xv;
90: s[2] -= v[2] * xv;
91: s[3] -= v[3] * xv;
92: s[4] -= v[4] * xv;
93: s[5] -= v[5] * xv;
94: s[6] -= v[6] * xv;
95: s[7] -= v[7] * xv;
96: s[8] -= v[8] * xv;
97: s[9] -= v[9] * xv;
98: s[10] -= v[10] * xv;
99: s[11] -= v[11] * xv;
100: s[12] -= v[12] * xv;
101: s[13] -= v[13] * xv;
102: v += 14;
103: }
104: }
105: PetscCall(PetscArrayzero(x + idt, bs));
106: for (k = 0; k < bs; k++) {
107: x[idt] += v[0] * s[k];
108: x[1 + idt] += v[1] * s[k];
109: x[2 + idt] += v[2] * s[k];
110: x[3 + idt] += v[3] * s[k];
111: x[4 + idt] += v[4] * s[k];
112: x[5 + idt] += v[5] * s[k];
113: x[6 + idt] += v[6] * s[k];
114: x[7 + idt] += v[7] * s[k];
115: x[8 + idt] += v[8] * s[k];
116: x[9 + idt] += v[9] * s[k];
117: x[10 + idt] += v[10] * s[k];
118: x[11 + idt] += v[11] * s[k];
119: x[12 + idt] += v[12] * s[k];
120: x[13 + idt] += v[13] * s[k];
121: v += 14;
122: }
123: }
124: PetscCall(VecRestoreArrayRead(bb, &b));
125: PetscCall(VecRestoreArray(xx, &x));
126: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
127: PetscFunctionReturn(PETSC_SUCCESS);
128: }
130: /* Block operations are done by accessing one column at a time */
131: /* Default MatSolve for block size 13 */
133: PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A, Vec bb, Vec xx)
134: {
135: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
136: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
137: PetscInt i, k, nz, idx, idt, m;
138: const MatScalar *aa = a->a, *v;
139: PetscScalar s[13];
140: PetscScalar *x, xv;
141: const PetscScalar *b;
143: PetscFunctionBegin;
144: PetscCall(VecGetArrayRead(bb, &b));
145: PetscCall(VecGetArray(xx, &x));
147: /* forward solve the lower triangular */
148: for (i = 0; i < n; i++) {
149: v = aa + bs2 * ai[i];
150: vi = aj + ai[i];
151: nz = ai[i + 1] - ai[i];
152: idt = bs * i;
153: x[idt] = b[idt];
154: x[1 + idt] = b[1 + idt];
155: x[2 + idt] = b[2 + idt];
156: x[3 + idt] = b[3 + idt];
157: x[4 + idt] = b[4 + idt];
158: x[5 + idt] = b[5 + idt];
159: x[6 + idt] = b[6 + idt];
160: x[7 + idt] = b[7 + idt];
161: x[8 + idt] = b[8 + idt];
162: x[9 + idt] = b[9 + idt];
163: x[10 + idt] = b[10 + idt];
164: x[11 + idt] = b[11 + idt];
165: x[12 + idt] = b[12 + idt];
166: for (m = 0; m < nz; m++) {
167: idx = bs * vi[m];
168: for (k = 0; k < bs; k++) {
169: xv = x[k + idx];
170: x[idt] -= v[0] * xv;
171: x[1 + idt] -= v[1] * xv;
172: x[2 + idt] -= v[2] * xv;
173: x[3 + idt] -= v[3] * xv;
174: x[4 + idt] -= v[4] * xv;
175: x[5 + idt] -= v[5] * xv;
176: x[6 + idt] -= v[6] * xv;
177: x[7 + idt] -= v[7] * xv;
178: x[8 + idt] -= v[8] * xv;
179: x[9 + idt] -= v[9] * xv;
180: x[10 + idt] -= v[10] * xv;
181: x[11 + idt] -= v[11] * xv;
182: x[12 + idt] -= v[12] * xv;
183: v += 13;
184: }
185: }
186: }
187: /* backward solve the upper triangular */
188: for (i = n - 1; i >= 0; i--) {
189: v = aa + bs2 * (adiag[i + 1] + 1);
190: vi = aj + adiag[i + 1] + 1;
191: nz = adiag[i] - adiag[i + 1] - 1;
192: idt = bs * i;
193: s[0] = x[idt];
194: s[1] = x[1 + idt];
195: s[2] = x[2 + idt];
196: s[3] = x[3 + idt];
197: s[4] = x[4 + idt];
198: s[5] = x[5 + idt];
199: s[6] = x[6 + idt];
200: s[7] = x[7 + idt];
201: s[8] = x[8 + idt];
202: s[9] = x[9 + idt];
203: s[10] = x[10 + idt];
204: s[11] = x[11 + idt];
205: s[12] = x[12 + idt];
207: for (m = 0; m < nz; m++) {
208: idx = bs * vi[m];
209: for (k = 0; k < bs; k++) {
210: xv = x[k + idx];
211: s[0] -= v[0] * xv;
212: s[1] -= v[1] * xv;
213: s[2] -= v[2] * xv;
214: s[3] -= v[3] * xv;
215: s[4] -= v[4] * xv;
216: s[5] -= v[5] * xv;
217: s[6] -= v[6] * xv;
218: s[7] -= v[7] * xv;
219: s[8] -= v[8] * xv;
220: s[9] -= v[9] * xv;
221: s[10] -= v[10] * xv;
222: s[11] -= v[11] * xv;
223: s[12] -= v[12] * xv;
224: v += 13;
225: }
226: }
227: PetscCall(PetscArrayzero(x + idt, bs));
228: for (k = 0; k < bs; k++) {
229: x[idt] += v[0] * s[k];
230: x[1 + idt] += v[1] * s[k];
231: x[2 + idt] += v[2] * s[k];
232: x[3 + idt] += v[3] * s[k];
233: x[4 + idt] += v[4] * s[k];
234: x[5 + idt] += v[5] * s[k];
235: x[6 + idt] += v[6] * s[k];
236: x[7 + idt] += v[7] * s[k];
237: x[8 + idt] += v[8] * s[k];
238: x[9 + idt] += v[9] * s[k];
239: x[10 + idt] += v[10] * s[k];
240: x[11 + idt] += v[11] * s[k];
241: x[12 + idt] += v[12] * s[k];
242: v += 13;
243: }
244: }
245: PetscCall(VecRestoreArrayRead(bb, &b));
246: PetscCall(VecRestoreArray(xx, &x));
247: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
248: PetscFunctionReturn(PETSC_SUCCESS);
249: }
251: /* Block operations are done by accessing one column at a time */
252: /* Default MatSolve for block size 12 */
254: PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A, Vec bb, Vec xx)
255: {
256: Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
257: const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2;
258: PetscInt i, k, nz, idx, idt, m;
259: const MatScalar *aa = a->a, *v;
260: PetscScalar s[12];
261: PetscScalar *x, xv;
262: const PetscScalar *b;
264: PetscFunctionBegin;
265: PetscCall(VecGetArrayRead(bb, &b));
266: PetscCall(VecGetArray(xx, &x));
268: /* forward solve the lower triangular */
269: for (i = 0; i < n; i++) {
270: v = aa + bs2 * ai[i];
271: vi = aj + ai[i];
272: nz = ai[i + 1] - ai[i];
273: idt = bs * i;
274: x[idt] = b[idt];
275: x[1 + idt] = b[1 + idt];
276: x[2 + idt] = b[2 + idt];
277: x[3 + idt] = b[3 + idt];
278: x[4 + idt] = b[4 + idt];
279: x[5 + idt] = b[5 + idt];
280: x[6 + idt] = b[6 + idt];
281: x[7 + idt] = b[7 + idt];
282: x[8 + idt] = b[8 + idt];
283: x[9 + idt] = b[9 + idt];
284: x[10 + idt] = b[10 + idt];
285: x[11 + idt] = b[11 + idt];
286: for (m = 0; m < nz; m++) {
287: idx = bs * vi[m];
288: for (k = 0; k < bs; k++) {
289: xv = x[k + idx];
290: x[idt] -= v[0] * xv;
291: x[1 + idt] -= v[1] * xv;
292: x[2 + idt] -= v[2] * xv;
293: x[3 + idt] -= v[3] * xv;
294: x[4 + idt] -= v[4] * xv;
295: x[5 + idt] -= v[5] * xv;
296: x[6 + idt] -= v[6] * xv;
297: x[7 + idt] -= v[7] * xv;
298: x[8 + idt] -= v[8] * xv;
299: x[9 + idt] -= v[9] * xv;
300: x[10 + idt] -= v[10] * xv;
301: x[11 + idt] -= v[11] * xv;
302: v += 12;
303: }
304: }
305: }
306: /* backward solve the upper triangular */
307: for (i = n - 1; i >= 0; i--) {
308: v = aa + bs2 * (adiag[i + 1] + 1);
309: vi = aj + adiag[i + 1] + 1;
310: nz = adiag[i] - adiag[i + 1] - 1;
311: idt = bs * i;
312: s[0] = x[idt];
313: s[1] = x[1 + idt];
314: s[2] = x[2 + idt];
315: s[3] = x[3 + idt];
316: s[4] = x[4 + idt];
317: s[5] = x[5 + idt];
318: s[6] = x[6 + idt];
319: s[7] = x[7 + idt];
320: s[8] = x[8 + idt];
321: s[9] = x[9 + idt];
322: s[10] = x[10 + idt];
323: s[11] = x[11 + idt];
325: for (m = 0; m < nz; m++) {
326: idx = bs * vi[m];
327: for (k = 0; k < bs; k++) {
328: xv = x[k + idx];
329: s[0] -= v[0] * xv;
330: s[1] -= v[1] * xv;
331: s[2] -= v[2] * xv;
332: s[3] -= v[3] * xv;
333: s[4] -= v[4] * xv;
334: s[5] -= v[5] * xv;
335: s[6] -= v[6] * xv;
336: s[7] -= v[7] * xv;
337: s[8] -= v[8] * xv;
338: s[9] -= v[9] * xv;
339: s[10] -= v[10] * xv;
340: s[11] -= v[11] * xv;
341: v += 12;
342: }
343: }
344: PetscCall(PetscArrayzero(x + idt, bs));
345: for (k = 0; k < bs; k++) {
346: x[idt] += v[0] * s[k];
347: x[1 + idt] += v[1] * s[k];
348: x[2 + idt] += v[2] * s[k];
349: x[3 + idt] += v[3] * s[k];
350: x[4 + idt] += v[4] * s[k];
351: x[5 + idt] += v[5] * s[k];
352: x[6 + idt] += v[6] * s[k];
353: x[7 + idt] += v[7] * s[k];
354: x[8 + idt] += v[8] * s[k];
355: x[9 + idt] += v[9] * s[k];
356: x[10 + idt] += v[10] * s[k];
357: x[11 + idt] += v[11] * s[k];
358: v += 12;
359: }
360: }
361: PetscCall(VecRestoreArrayRead(bb, &b));
362: PetscCall(VecRestoreArray(xx, &x));
363: PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
364: PetscFunctionReturn(PETSC_SUCCESS);
365: }