Actual source code: ex1.raja.cxx
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//
// Copyright (c) 2016-21, Lawrence Livermore National Security, LLC
// and RAJA project contributors. See the RAJA/COPYRIGHT file for details.
//
// SPDX-License-Identifier: (BSD-3-Clause)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~//

#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <cmath>

#include "RAJA/RAJA.hpp"
#include "memoryManager.hpp"
/*
 * Jacobi Example
 *
 * ----[Details]--------------------
 * This code uses a five-point finite difference stencil
 * to discretize the boundary value problem
 *
 *   U_xx + U_yy = f  on [0,1] x [0,1],
 *
 * where the right-hand side is chosen to be
 *
 *   f = 2*x*(y-1)*(y-2*x+x*y+2)*exp(x-y).
 *
 * A structured grid is used to discretize the domain
 * [0,1] x [0,1]. Values at interior grid points are computed
 * with the Jacobi method applied to the associated linear
 * system. The iteration is repeated until the l_2 norm of the
 * difference between successive iterates falls below a
 * tolerance.
 *
 * The scheme is implemented with two arrays (I, Iold), both
 * initialized to zero. A first set of nested for loops applies
 * one sweep of the Jacobi scheme; the sweep touches only the
 * interior nodes.
 *
 * A second set of nested for loops updates Iold and computes
 * the l_2 norm of the difference of the iterates.
 *
 * Computing the l_2 norm requires a reduction operation.
 * To simplify the reduction, the RAJA API provides
 * thread-safe reduction variables (RAJA::ReduceSum below).
 *
 * ----[RAJA Concepts]---------------
 * - Nested loop execution with RAJA::kernel
 * - RAJA reductions
 *
 */
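
/*
 * For reference, the discrete update that each variant below applies at an
 * interior node (m, n), with grid spacing h and x = o + m*h, y = o + n*h
 * (a sketch of what the loop bodies compute, not additional code):
 *
 *   I[n][m] = 0.25 * ( Iold[n][m-1] + Iold[n][m+1]
 *                    + Iold[n-1][m] + Iold[n+1][m]
 *                    - h*h * f(x, y) )
 *
 * In the flattened arrays used below this becomes id = n*(N+2) + m, with
 * neighbors at id-1, id+1, id-(N+2), and id+(N+2).
 */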
/*
 * ----[Constant Values]-----
 * CUDA_BLOCK_SIZE - Number of threads per CUDA thread block,
 *                   used by the RAJA::forall reduction loop
 *
 * HIP_BLOCK_SIZE  - Number of threads per HIP thread block,
 *                   used by the RAJA::forall reduction loop
 */
#if defined(RAJA_ENABLE_CUDA)
const int CUDA_BLOCK_SIZE = 256;
#endif

#if defined(RAJA_ENABLE_HIP)
const int HIP_BLOCK_SIZE = 256;
#endif
//
// Struct to hold grid info
// o - Origin in a Cartesian dimension
// h - Spacing between grid points
// n - Number of grid points
//
struct grid_s {
  double o, h;
  int n;
};
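
//
// For example (a sketch using the names above): the physical coordinate of
// grid point i in a dimension described by a grid_s g is
//
//   x_i = g.o + i * g.h,   i = 0, ..., g.n - 1,
//
// which is exactly how x and y are recovered from the loop indices below.
//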
//
// ----[Functions]---------
// solution   - Function for the analytic solution
// computeErr - Computes and prints the maximum error in the solution
//
double solution(double x, double y);
void computeErr(double *I, grid_s grid);
int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv[]))
{

  std::cout << "Jacobi Example" << std::endl;

  /*
   * ----[Solver Parameters]------------
   * tol       - Method terminates once the norm of the difference between
   *             iterates is no greater than tol
   * N         - Number of unknown grid points per Cartesian dimension
   * NN        - Total number of grid points, including the boundary layer
   * maxIter   - Maximum number of iterations to be taken
   *
   * resI2     - Squared l_2 norm of the difference between iterates
   * iteration - Iteration number
   * gridx     - grid_s struct with grid information for a Cartesian dimension
   */
  double tol = 1e-10;

  int N = 50;
  int NN = (N + 2) * (N + 2);
  int maxIter = 100000;

  double resI2;
  int iteration;

  grid_s gridx;
  gridx.o = 0.0;
  gridx.h = 1.0 / (N + 1.0);
  gridx.n = N + 2;
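
  //
  // A quick sanity check on the sizes above (an illustrative note, not code
  // from the original example): with N = 50 interior unknowns per dimension
  // plus one boundary point on each side, each dimension has N + 2 = 52
  // points, so NN = 52 * 52 = 2704 and h = 1/51. Interior nodes carry
  // indices 1..N; indices 0 and N+1 hold the (zero) boundary values.
  //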
  //
  // I, Iold - Hold iterates of the Jacobi method
  //
  double *I = memoryManager::allocate<double>(NN);
  double *Iold = memoryManager::allocate<double>(NN);

  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));
133: printf("Standard C++ Loop \n");
134: resI2 = 1;
135: iteration = 0;
137: while (resI2 > tol * tol) {
138: //
139: // Jacobi Iteration
140: //
141: for (int n = 1; n <= N; ++n) {
142: for (int m = 1; m <= N; ++m) {
143: double x = gridx.o + m * gridx.h;
144: double y = gridx.o + n * gridx.h;
146: double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
148: int id = n * (N + 2) + m;
149: I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
150: }
151: }
153: //
154: // Compute residual and update Iold
155: //
156: resI2 = 0.0;
157: for (int k = 0; k < NN; k++) {
158: resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
159: Iold[k] = I[k];
160: }
162: if (iteration > maxIter) {
163: printf("Standard C++ Loop - Maxed out on iterations \n");
164: exit(-1);
165: }
167: iteration++;
168: }
169: computeErr(I, gridx);
170: printf("No of iterations: %d \n \n", iteration);
  //
  // RAJA loop calls may be shortened by predefining policies
  //
  RAJA::RangeSegment gridRange(0, NN);
  RAJA::RangeSegment jacobiRange(1, N + 1);

  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,
        RAJA::statement::For<0, RAJA::seq_exec,
          RAJA::statement::Lambda<0>>>>;

  printf("RAJA: Sequential Policy - Nested ForallN \n");
  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));
  /*
   * Sequential Jacobi Iteration.
   *
   * Note that a RAJA ReduceSum object is used to accumulate the sum
   * for the residual. Since the loop is run sequentially, this is
   * not strictly necessary. It is done here for consistency and
   * comparison with the other RAJA variants in this example.
   */
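
  /*
   * The general reduction pattern (a minimal sketch with placeholder names,
   * assuming a host array a of length n) is: construct a reducer, accumulate
   * into it inside a RAJA loop, then read the final value back on the host.
   *
   *   RAJA::ReduceSum<RAJA::seq_reduce, double> sum(0.0);
   *   RAJA::forall<RAJA::seq_exec>(RAJA::RangeSegment(0, n),
   *                                [=](RAJA::Index_type i) { sum += a[i]; });
   *   double total = sum.get();  // final reduced value
   *
   * The loops below follow this pattern, swapping in the OpenMP, CUDA, or
   * HIP reduce/exec policies as appropriate.
   */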
  while (resI2 > tol * tol) {

    RAJA::kernel<jacobiSeqNestedPolicy>(
        RAJA::make_tuple(jacobiRange, jacobiRange),
        [=](RAJA::Index_type m, RAJA::Index_type n) {

          double x = gridx.o + m * gridx.h;
          double y = gridx.o + n * gridx.h;

          double f = gridx.h * gridx.h
                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

          int id = n * (N + 2) + m;
          I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2]
                          + Iold[id - 1] + Iold[id + 1]);
        });

    RAJA::ReduceSum<RAJA::seq_reduce, double> RAJA_resI2(0.0);
    RAJA::forall<RAJA::seq_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: Sequential - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#if defined(RAJA_ENABLE_OPENMP)
  printf("RAJA: OpenMP Policy - Nested ForallN \n");
  resI2 = 1;
  iteration = 0;
  memset(I, 0, NN * sizeof(double));
  memset(Iold, 0, NN * sizeof(double));
  /*
   * OpenMP parallel Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::omp_parallel_for_exec -
   * runs the iterations of the outer loop in parallel with an
   * OpenMP parallel for; the inner loop executes sequentially
   * within each thread
   *
   * Note that the OpenMP RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner.
   */
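
  /*
   * Roughly what the kernel policy below corresponds to, written as a plain
   * OpenMP loop nest (an illustrative sketch, not part of the example):
   *
   *   #pragma omp parallel for
   *   for (int n = 1; n <= N; ++n) {     // For<1, omp_parallel_for_exec>
   *     for (int m = 1; m <= N; ++m) {   // For<0, seq_exec>
   *       // ... lambda body (Lambda<0>) ...
   *     }
   *   }
   */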
  using jacobiOmpNestedPolicy = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::omp_parallel_for_exec,
        RAJA::statement::For<0, RAJA::seq_exec,
          RAJA::statement::Lambda<0>>>>;

  while (resI2 > tol * tol) {

    RAJA::kernel<jacobiOmpNestedPolicy>(
        RAJA::make_tuple(jacobiRange, jacobiRange),
        [=](RAJA::Index_type m, RAJA::Index_type n) {

          double x = gridx.o + m * gridx.h;
          double y = gridx.o + n * gridx.h;

          double f = gridx.h * gridx.h
                     * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));

          int id = n * (N + 2) + m;
          I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2]
                          + Iold[id - 1] + Iold[id + 1]);
        });

    RAJA::ReduceSum<RAJA::omp_reduce, double> RAJA_resI2(0.0);

    RAJA::forall<RAJA::omp_parallel_for_exec>(gridRange, [=](RAJA::Index_type k) {
      RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
      Iold[k] = I[k];
    });

    resI2 = RAJA_resI2;
    if (iteration > maxIter) {
      printf("Jacobi: OpenMP - Maxed out on iterations! \n");
      exit(-1);
    }
    iteration++;
  }
  computeErr(I, gridx);
  printf("No of iterations: %d \n \n", iteration);
#endif
#if defined(RAJA_ENABLE_CUDA)
  /*
   * CUDA Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::statement::CudaKernel, Tile with RAJA::cuda_block_{x,y}_loop,
   * and RAJA::cuda_thread_{x,y}_direct -
   * define the mapping of loop iterations to the GPU: 32 x 32 tiles of the
   * iteration space are assigned to CUDA thread blocks, and each entry of a
   * tile is handled by one thread
   *
   * Note that the CUDA RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner on the GPU.
   */
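
  /*
   * Roughly the raw CUDA indexing the tiled policy below produces (an
   * illustrative sketch under the 32 x 32 tiling assumption, not part of
   * the example): each thread handles one (m, n) pair,
   *
   *   m = 1 + blockIdx.x * 32 + threadIdx.x;  // Tile<0> + cuda_thread_x_direct
   *   n = 1 + blockIdx.y * 32 + threadIdx.y;  // Tile<1> + cuda_thread_y_direct
   *
   * with out-of-range (m, n) pairs masked off, and the block indices striding
   * over tiles if the launch has fewer blocks than tiles.
   */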
282: printf("RAJA: CUDA Policy - Nested ForallN \n");
284: using jacobiCUDANestedPolicy = RAJA::KernelPolicy<RAJA::statement::CudaKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::cuda_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::cuda_block_x_loop, RAJA::statement::For<1, RAJA::cuda_thread_y_direct, RAJA::statement::For<0, RAJA::cuda_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
286: resI2 = 1;
287: iteration = 0;
288: memset(I, 0, NN * sizeof(double));
289: memset(Iold, 0, NN * sizeof(double));
291: while (resI2 > tol * tol) {
292: //
293: // Jacobi Iteration
294: //
295: RAJA::kernel<jacobiCUDANestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
296: double x = gridx.o + m * gridx.h;
297: double y = gridx.o + n * gridx.h;
299: double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
301: int id = n * (N + 2) + m;
302: I[id] = 0.25 * (-f + Iold[id - N - 2] + Iold[id + N + 2] + Iold[id - 1] + Iold[id + 1]);
303: });
305: //
306: // Compute residual and update Iold
307: //
308: RAJA::ReduceSum<RAJA::cuda_reduce, double> RAJA_resI2(0.0);
309: RAJA::forall<RAJA::cuda_exec<CUDA_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
310: RAJA_resI2 += (I[k] - Iold[k]) * (I[k] - Iold[k]);
311: Iold[k] = I[k];
312: });
314: resI2 = RAJA_resI2;
316: if (iteration > maxIter) {
317: printf("RAJA: CUDA - Maxed out on iterations! \n");
318: exit(-1);
319: }
320: iteration++;
321: }
322: cudaDeviceSynchronize();
323: computeErr(I, gridx);
324: printf("No of iterations: %d \n \n", iteration);
325: #endif
#if defined(RAJA_ENABLE_HIP)
  /*
   * HIP Jacobi Iteration.
   *
   * ----[RAJA Policies]-----------
   * RAJA::statement::HipKernel, Tile with RAJA::hip_block_{x,y}_loop,
   * and RAJA::hip_thread_{x,y}_direct -
   * define the mapping of loop iterations to HIP thread blocks,
   * mirroring the CUDA policy above
   *
   * Note that the HIP RAJA ReduceSum object performs the reduction
   * operation for the residual in a thread-safe manner on the GPU.
   */
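
  /*
   * Unlike the CUDA variant above, this variant stages data in explicit
   * device buffers: d_I and d_Iold are allocated on the GPU, the zeroed host
   * arrays are copied in with hipMemcpy before the iteration, and the result
   * is copied back into I before computeErr runs on the host. (This describes
   * the code that follows; memoryManager::allocate_gpu is the example's own
   * helper for device allocation.)
   */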
339: printf("RAJA: HIP Policy - Nested ForallN \n");
341: using jacobiHIPNestedPolicy = RAJA::KernelPolicy<RAJA::statement::HipKernel<RAJA::statement::Tile<1, RAJA::tile_fixed<32>, RAJA::hip_block_y_loop, RAJA::statement::Tile<0, RAJA::tile_fixed<32>, RAJA::hip_block_x_loop, RAJA::statement::For<1, RAJA::hip_thread_y_direct, RAJA::statement::For<0, RAJA::hip_thread_x_direct, RAJA::statement::Lambda<0>>>>>>>;
343: resI2 = 1;
344: iteration = 0;
345: memset(I, 0, NN * sizeof(double));
346: memset(Iold, 0, NN * sizeof(double));
348: double *d_I = memoryManager::allocate_gpu<double>(NN);
349: double *d_Iold = memoryManager::allocate_gpu<double>(NN);
350: hipErrchk(hipMemcpy(d_I, I, NN * sizeof(double), hipMemcpyHostToDevice));
351: hipErrchk(hipMemcpy(d_Iold, Iold, NN * sizeof(double), hipMemcpyHostToDevice));
353: while (resI2 > tol * tol) {
354: //
355: // Jacobi Iteration
356: //
357: RAJA::kernel<jacobiHIPNestedPolicy>(RAJA::make_tuple(jacobiRange, jacobiRange), [=] RAJA_DEVICE(RAJA::Index_type m, RAJA::Index_type n) {
358: double x = gridx.o + m * gridx.h;
359: double y = gridx.o + n * gridx.h;
361: double f = gridx.h * gridx.h * (2 * x * (y - 1) * (y - 2 * x + x * y + 2) * exp(x - y));
363: int id = n * (N + 2) + m;
364: d_I[id] = 0.25 * (-f + d_Iold[id - N - 2] + d_Iold[id + N + 2] + d_Iold[id - 1] + d_Iold[id + 1]);
365: });
367: //
368: // Compute residual and update Iold
369: //
370: RAJA::ReduceSum<RAJA::hip_reduce, double> RAJA_resI2(0.0);
371: RAJA::forall<RAJA::hip_exec<HIP_BLOCK_SIZE>>(gridRange, [=] RAJA_DEVICE(RAJA::Index_type k) {
372: RAJA_resI2 += (d_I[k] - d_Iold[k]) * (d_I[k] - d_Iold[k]);
373: d_Iold[k] = d_I[k];
374: });
376: resI2 = RAJA_resI2;
378: if (iteration > maxIter) {
379: printf("RAJA: HIP - Maxed out on iterations! \n");
380: exit(-1);
381: }
382: iteration++;
383: }
384: hipDeviceSynchronize();
385: hipErrchk(hipMemcpy(I, d_I, NN * sizeof(double), hipMemcpyDeviceToHost));
386: computeErr(I, gridx);
387: printf("No of iterations: %d \n \n", iteration);
389: memoryManager::deallocate_gpu(d_I);
390: memoryManager::deallocate_gpu(d_Iold);
391: #endif
  memoryManager::deallocate(I);
  memoryManager::deallocate(Iold);

  return 0;
}
//
// Function for the analytic solution
//
double solution(double x, double y)
{
  return x * y * exp(x - y) * (1 - x) * (1 - y);
}
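
//
// Note (an observation, not part of the original example): the factors x, y,
// (1 - x), and (1 - y) make this solution vanish on the boundary of
// [0,1] x [0,1], which is why the boundary entries of I and Iold can simply
// be left at zero throughout the iteration above.
//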
//
// Error is computed via ||I_{approx}(:) - U_{analytic}(:)||_{inf}
//
void computeErr(double *I, grid_s grid)
{

  RAJA::RangeSegment gridRange(0, grid.n);
  RAJA::ReduceMax<RAJA::seq_reduce, double> tMax(-1.0);

  using jacobiSeqNestedPolicy = RAJA::KernelPolicy<
      RAJA::statement::For<1, RAJA::seq_exec,
        RAJA::statement::For<0, RAJA::seq_exec,
          RAJA::statement::Lambda<0>>>>;

  RAJA::kernel<jacobiSeqNestedPolicy>(
      RAJA::make_tuple(gridRange, gridRange),
      [=](RAJA::Index_type ty, RAJA::Index_type tx) {

        int id = tx + grid.n * ty;
        double x = grid.o + tx * grid.h;
        double y = grid.o + ty * grid.h;
        double myErr = std::abs(I[id] - solution(x, y));
        tMax.max(myErr);
      });

  double maxErr = tMax;
  printf("Max error = %lg, h = %f \n", maxErr, grid.h);
}
/*TEST

   test:
      requires: raja !cuda

   test:
      suffix: 2
      requires: raja cuda

TEST*/