From 93efce868ee99bee831014ff0e8f7dac56ef1613 Mon Sep 17 00:00:00 2001 From: julianlitz Date: Thu, 21 May 2026 22:48:32 +0200 Subject: [PATCH 1/4] Use Kokkos::Parallel in applyExtrapolation --- include/GMGPolar/gmgpolar.h | 33 +++--- include/GMGPolar/solver.h | 203 ++++++++++++++++++++---------------- 2 files changed, 132 insertions(+), 104 deletions(-) diff --git a/include/GMGPolar/gmgpolar.h b/include/GMGPolar/gmgpolar.h index 875ad3bc..7f720548 100644 --- a/include/GMGPolar/gmgpolar.h +++ b/include/GMGPolar/gmgpolar.h @@ -103,7 +103,7 @@ class GMGPolar : public IGMGPolar /* ------------------------------------------------------------------------- */ /* Chooses if full grid smoothing is active on level 0 for extrapolation > 0 */ - bool full_grid_smoothing_ = false; + bool full_grid_smoothing_; /* -------------------------------------------------- */ /* Vectors for PCG (Preconditioned Conjugate Gradient) @@ -121,6 +121,11 @@ class GMGPolar : public IGMGPolar HostAllocatableVector pcg_solution_; // x (solution) HostAllocatableVector pcg_search_direction_; // p (search direction) + /* ---------------------------------------------------------------------------------------------- */ + /* Store analytical solution values on host to avoid repeated computation during error evaluation */ + HostAllocatableVector analytical_solution_host_; + std::vector> exact_errors_; + /* -------------------- */ /* Convergence criteria */ int number_of_iterations_; @@ -128,25 +133,31 @@ class GMGPolar : public IGMGPolar double mean_residual_reduction_factor_; bool converged(double current_residual_norm, double first_residual_norm); +public: // Public due to cuda restrictions /* ---------------------------------------------------- */ /* Compute exact error if an exact solution is provided */ // The results are stored as a pair: (weighted L2 error, infinity error). - std::vector> exact_errors_; - std::pair computeExactError(Level& level, - HostConstVector solution, HostVector error, - HostConstVector exact_solution); + std::pair evaluateExactError(const PolarGrid& grid, ConstHostVector discrete_solution, + HostConstVector analytical_solution_host, + HostVector error); + void computeAnalyticalSolutionOnHost(const PolarGrid& grid, HostConstVector analytical_solution_host, + const ExactSolution& exact_solution); /* --------------- */ /* Setup Functions */ - int chooseNumberOfLevels(const PolarGrid& finest_grid); - // Public due to cuda restrictions -public: template void build_rhs_f(const Level& level, HostVector rhs_f, const BoundaryConditions& boundary_conditions, const SourceTerm& source_term); void discretize_rhs_f(const Level& level, HostVector rhs_f); + /* --------------- */ + /* Solve Functions */ + void applyExtrapolation(int current_level, HostVector fine_values, HostConstVector coarse_values); + private: + /* --------------- */ + /* Setup Functions */ + int chooseNumberOfLevels(const PolarGrid& finest_grid); bool checkUniformRefinement(const PolarGrid& grid, double tolerance) const; /* --------------- */ @@ -169,15 +180,13 @@ class GMGPolar : public IGMGPolar int iterations); void applyExtrapolatedMultigridIterations(Level& level, MultigridCycleType cycle, int iterations); - // Compute the extrapolated values: u_ex = 4/3 u_fine - 1/3 u_coarse - void applyExtrapolation(int current_level, HostVector fine_values, HostConstVector coarse_values); /* ----------------- */ /* Print information */ void printSettings(const PolarGrid& finest_grid, const PolarGrid& coarsest_grid) const; - void printIterationHeader(const ExactSolution* exact_solution); + void printIterationHeader(bool is_exact_solution_provided); void printIterationInfo(int iteration, double current_residual_norm, double current_relative_residual_norm, - const ExactSolution* exact_solution); + bool is_exact_solution_provided); /* ------------------- */ /* Multigrid Functions */ diff --git a/include/GMGPolar/solver.h b/include/GMGPolar/solver.h index efb1aac3..b749fa35 100644 --- a/include/GMGPolar/solver.h +++ b/include/GMGPolar/solver.h @@ -95,32 +95,22 @@ void GMGPolar::solve(const BoundaryC /* ---------------------------------------------- */ LIKWID_STOP("Solver"); auto start_check_exact_error = std::chrono::high_resolution_clock::now(); - HostAllocatableVector exact_sol; + if (exact_solution_) { - exact_sol = HostVector("exact_sol", level.solution().size()); - // fill exact solution on host to avoid repeat same computation - const PolarGrid& grid = level.grid(); - - Kokkos::parallel_for("fill exact sol outer loop on r", - Kokkos::MDRangePolicy>( - {0, 0}, {grid.numberSmootherCircles(), grid.ntheta()}), - [&](const int i_r, const int i_theta) { - double r = grid.radius(i_r); - double theta = grid.theta(i_theta); - const int grid_idx = grid.index(i_r, i_theta); - exact_sol(grid_idx) = exact_solution_->exact_solution(r, theta); - }); - Kokkos::parallel_for("fill exact sol outer loop on theta", - Kokkos::MDRangePolicy>( - {0, grid.numberSmootherCircles()}, {grid.ntheta(), grid.nr()}), - [&](const int i_theta, const int i_r) { - double r = grid.radius(i_r); - double theta = grid.theta(i_theta); - const int grid_idx = grid.index(i_r, i_theta); - exact_sol(grid_idx) = exact_solution_->exact_solution(r, theta); - }); - - evaluateExactError(level, exact_sol); + /* Fill analytical solution on the host. */ + // Note: We must compute the analytical solution values on the host because the + // ExactSolution object is not designed to be used on the device. + // As it is planned that the solution and error vector will use device memory during GPU porting, + // we need to transfer the exact solution values from host to device using the Kokkos::deep_copy operation. + if (std::ssize(analytical_solution_host_) != level.grid().numberOfNodes()) { + analytical_solution_host_ = HostVector("Analytical Solution", level.solution().size()); + } + computeAnalyticalSolutionOnHost(level.grid(), analytical_solution_host_, *exact_solution_); + + // Evaluate the error of the initial approximation against the exact solution, if provided. + // We use level.residual() as a temporary vector to store the error values. + exact_errors_.push_back( + evaluateExactError(level.grid(), level.solution(), analytical_solution_host_, level.residual())); } auto end_check_exact_error = std::chrono::high_resolution_clock::now(); t_check_exact_error_ += std::chrono::duration(end_check_exact_error - start_check_exact_error).count(); @@ -193,7 +183,7 @@ void GMGPolar::solve(const BoundaryC if (paraview_) { writeToVTK("output_solution", level, level.solution()); if (exact_solution_) { - computeExactError(level, level.solution(), level.residual(), exact_sol); + evaluateExactError(level.grid(), level.solution(), analytical_solution_host_, level.residual()); writeToVTK("output_error", level, level.residual()); } } @@ -268,8 +258,10 @@ void GMGPolar::solveMultigrid(double LIKWID_STOP("Solver"); auto start_check_exact_error = std::chrono::high_resolution_clock::now(); - if (exact_solution_) - evaluateExactError(level, exact_solution); + if (exact_solution_) { + exact_errors_.push_back( + evaluateExactError(level.grid(), level.solution(), analytical_solution_host_, level.residual())); + } auto end_check_exact_error = std::chrono::high_resolution_clock::now(); t_check_exact_error_ += std::chrono::duration(end_check_exact_error - start_check_exact_error).count(); @@ -379,8 +371,10 @@ void GMGPolar::solvePCG(double& init LIKWID_STOP("Solver"); auto start_check_exact_error = std::chrono::high_resolution_clock::now(); - if (exact_solution_) - exact_errors_.push_back(computeExactError(level, pcg_solution_, level.residual(), exact_solution)); + if (exact_solution_) { + exact_errors_.push_back( + evaluateExactError(level.grid(), pcg_solution_, analytical_solution_host_, level.residual())); + } auto end_check_exact_error = std::chrono::high_resolution_clock::now(); t_check_exact_error_ += std::chrono::duration(end_check_exact_error - start_check_exact_error).count(); @@ -536,46 +530,44 @@ void GMGPolar::applyExtrapolation(in assert(std::ssize(fine_values) == fineGrid.numberOfNodes()); assert(std::ssize(coarse_values) == coarseGrid.numberOfNodes()); -#pragma omp parallel - { -/* Circluar Indexing Section */ -/* For loop matches circular access pattern */ -#pragma omp for nowait - for (int i_r = 0; i_r < fineGrid.numberSmootherCircles(); i_r++) { - int i_r_coarse = i_r / 2; - for (int i_theta = 0; i_theta < fineGrid.ntheta(); i_theta++) { - int i_theta_coarse = i_theta / 2; - - if (i_r & 1 || i_theta & 1) { - fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; - } - else { - int fine_idx = fineGrid.index(i_r, i_theta); - int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); - fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; - } + /* We split the loops into two regions to better respect the */ + /* access patterns of the smoother and improve cache locality. */ + + /* Circular Indexing Section */ + /* For loop matches circular access pattern */ + Kokkos::parallel_for( + "Extrapolation: Apply Extrapolation (Circular)", + Kokkos::MDRangePolicy>( + {0, 0}, {fineGrid.numberSmootherCircles(), fineGrid.ntheta()}), + KOKKOS_LAMBDA(const int i_r, const int i_theta) { + const int fine_idx = fineGrid.index(i_r, i_theta); + if (i_r & 1 || i_theta & 1) { + fine_values[fine_idx] *= 4.0 / 3.0; } - } - -/* Radial Indexing Section */ -/* For loop matches radial access pattern */ -#pragma omp for nowait - for (int i_theta = 0; i_theta < fineGrid.ntheta(); i_theta++) { - int i_theta_coarse = i_theta / 2; - for (int i_r = fineGrid.numberSmootherCircles(); i_r < fineGrid.nr(); i_r++) { - int i_r_coarse = i_r / 2; - - if (i_r & 1 || i_theta & 1) { - fine_values[fineGrid.index(i_r, i_theta)] *= 4.0 / 3.0; - } - else { - int fine_idx = fineGrid.index(i_r, i_theta); - int coarse_idx = coarseGrid.index(i_r_coarse, i_theta_coarse); - fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; - } + else { + const int coarse_idx = coarseGrid.index(i_r / 2, i_theta / 2); + fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; } - } - } + }); + + /* Radial Indexing Section */ + /* For loop matches radial access pattern */ + Kokkos::parallel_for( + "Extrapolation: Apply Extrapolation (Radial)", + Kokkos::MDRangePolicy>({0, fineGrid.numberSmootherCircles()}, + {fineGrid.ntheta(), fineGrid.nr()}), + KOKKOS_LAMBDA(const int i_theta, const int i_r) { + const int fine_idx = fineGrid.index(i_r, i_theta); + if (i_r & 1 || i_theta & 1) { + fine_values[fine_idx] *= 4.0 / 3.0; + } + else { + const int coarse_idx = coarseGrid.index(i_r / 2, i_theta / 2); + fine_values[fine_idx] = (4.0 * fine_values[fine_idx] - coarse_values[coarse_idx]) / 3.0; + } + }); + + Kokkos::fence(); } // ============================================================================= @@ -598,33 +590,60 @@ void GMGPolar::initRhsHierarchy(Host // ============================================================================= template -void GMGPolar::evaluateExactError( - Level& level, HostConstVector exact_solution) +std::pair GMGPolar::evaluateExactError( + const PolarGrid& grid, ConstHostVector discrete_solution, HostConstVector analytical_solution_host, + HostVector error) { + // Transfer the exact solution values from host to device memory for error computation. + Kokkos::deep_copy(error, analytical_solution_host); + + // Compute the error as the difference between the exact and numerical solutions. + subtract(error, discrete_solution); + // Compute the weighted L2 norm and infinity norm of the error between the numerical and exact solution. // The results are stored as a pair: (weighted L2 error, infinity error). - exact_errors_.push_back(computeExactError(level, level.solution(), level.residual(), exact_solution)); + const double weighted_euclidean_error = l2_norm(ConstHostVector(error)) / std::sqrt(grid.numberOfNodes()); + const double infinity_error = infinity_norm(ConstHostVector(error)); + + return std::make_pair(weighted_euclidean_error, infinity_error); } template -std::pair GMGPolar::computeExactError( - Level& level, HostConstVector solution, - HostVector error, HostConstVector exact_solution) +void GMGPolar::computeAnalyticalSolutionOnHost( + const PolarGrid& grid, HostConstVector analytical_solution_host, const ExactSolution& exact_solution) { - const PolarGrid& grid = level.grid(); - - assert(solution.size() == error.size()); - assert(std::ssize(solution) == grid.numberOfNodes()); - - Kokkos::deep_copy(error, solution); - // Compute the error as the difference between exact and numerical solution - subtract(error, exact_solution); - - HostConstVector c_error = error; - double weighted_euclidean_error = l2_norm(c_error) / std::sqrt(grid.numberOfNodes()); - double infinity_error = infinity_norm(c_error); - - return std::make_pair(weighted_euclidean_error, infinity_error); + assert(std::ssize(analytical_solution_host) == grid.numberOfNodes()); + + /* We split the loops into two regions to better respect the */ + /* access patterns of the smoother and improve cache locality. */ + + // The For loop matches circular access pattern */ + Kokkos::parallel_for( + "Analytical Solution: Compute Values on Host (Circular)", + Kokkos::MDRangePolicy>( // Host parallel loop + {0, 0}, // Starting index + {grid.numberSmootherCircles(), grid.ntheta()}), // Ending index + [&](const int i_r, const int i_theta) { + const int index = grid.index(i_r, i_theta); + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + analytical_solution_host[index] = exact_solution.exact_solution(radius, theta); + }); + + /* For loop matches radial access pattern */ + Kokkos::parallel_for( + "Analytical Solution: Compute Values on Host (Radial)", + Kokkos::MDRangePolicy>( // Host parallel loop + {0, grid.numberSmootherCircles()}, // Starting index + {grid.ntheta(), grid.nr()}), // Ending index + [&](const int i_theta, const int i_r) { + const int index = grid.index(i_r, i_theta); + const double radius = grid.radius(i_r); + const double theta = grid.theta(i_theta); + analytical_solution_host[index] = exact_solution.exact_solution(radius, theta); + }); + + /* Kokkos::fence() is not needed here since everything is computed on the host */ } // ============================================================================= @@ -653,7 +672,7 @@ bool GMGPolar::converged(double resi // ============================================================================= template -void GMGPolar::printIterationHeader(const ExactSolution* exact_solution) +void GMGPolar::printIterationHeader(bool is_exact_solution_provided) { if (verbose_ <= 0) return; @@ -671,7 +690,7 @@ void GMGPolar::printIterationHeader( else std::cout << std::setw(15 + table_spacing) << "||r_k||/||rhs||"; } - if (exact_solution != nullptr) { + if (is_exact_solution_provided) { std::cout << std::setw(12 + table_spacing) << "||u-u_k||_l2"; std::cout << std::setw(13 + table_spacing) << "||u-u_k||_inf"; } @@ -683,7 +702,7 @@ template ::printIterationInfo(int iteration, double current_residual_norm, double current_relative_residual_norm, - const ExactSolution* exact_solution) + bool is_exact_solution_provided) { if (verbose_ <= 0) return; @@ -695,7 +714,7 @@ void GMGPolar::printIterationInfo(in std::cout << std::setw(9 + table_spacing + 2) << current_residual_norm; std::cout << std::setw(15 + table_spacing) << current_relative_residual_norm; } - if (exact_solution != nullptr) { + if (is_exact_solution_provided) { std::cout << std::setw(12 + table_spacing) << exact_errors_.back().first; std::cout << std::setw(13 + table_spacing) << exact_errors_.back().second; } From dcd067276fcb62af2ae8e09e0f6db04e1ac25b49 Mon Sep 17 00:00:00 2001 From: julianlitz Date: Thu, 21 May 2026 22:54:24 +0200 Subject: [PATCH 2/4] switcheroo --- include/GMGPolar/gmgpolar.h | 2 +- include/GMGPolar/solver.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/GMGPolar/gmgpolar.h b/include/GMGPolar/gmgpolar.h index 7f720548..1cd0f121 100644 --- a/include/GMGPolar/gmgpolar.h +++ b/include/GMGPolar/gmgpolar.h @@ -137,7 +137,7 @@ class GMGPolar : public IGMGPolar /* ---------------------------------------------------- */ /* Compute exact error if an exact solution is provided */ // The results are stored as a pair: (weighted L2 error, infinity error). - std::pair evaluateExactError(const PolarGrid& grid, ConstHostVector discrete_solution, + std::pair evaluateExactError(const PolarGrid& grid, HostConstVector discrete_solution, HostConstVector analytical_solution_host, HostVector error); void computeAnalyticalSolutionOnHost(const PolarGrid& grid, HostConstVector analytical_solution_host, diff --git a/include/GMGPolar/solver.h b/include/GMGPolar/solver.h index b749fa35..7cd6aa96 100644 --- a/include/GMGPolar/solver.h +++ b/include/GMGPolar/solver.h @@ -591,7 +591,7 @@ void GMGPolar::initRhsHierarchy(Host template std::pair GMGPolar::evaluateExactError( - const PolarGrid& grid, ConstHostVector discrete_solution, HostConstVector analytical_solution_host, + const PolarGrid& grid, HostConstVector discrete_solution, HostConstVector analytical_solution_host, HostVector error) { // Transfer the exact solution values from host to device memory for error computation. @@ -602,8 +602,8 @@ std::pair GMGPolar:: // Compute the weighted L2 norm and infinity norm of the error between the numerical and exact solution. // The results are stored as a pair: (weighted L2 error, infinity error). - const double weighted_euclidean_error = l2_norm(ConstHostVector(error)) / std::sqrt(grid.numberOfNodes()); - const double infinity_error = infinity_norm(ConstHostVector(error)); + const double weighted_euclidean_error = l2_norm(HostConstVector(error)) / std::sqrt(grid.numberOfNodes()); + const double infinity_error = infinity_norm(HostConstVector(error)); return std::make_pair(weighted_euclidean_error, infinity_error); } From 53685a9c599e0851aed7a8b40d86ad896953b431 Mon Sep 17 00:00:00 2001 From: julianlitz Date: Thu, 21 May 2026 22:59:54 +0200 Subject: [PATCH 3/4] improve --- include/GMGPolar/gmgpolar.h | 5 ++--- include/GMGPolar/solver.h | 10 ++++------ 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/GMGPolar/gmgpolar.h b/include/GMGPolar/gmgpolar.h index 1cd0f121..fc50785f 100644 --- a/include/GMGPolar/gmgpolar.h +++ b/include/GMGPolar/gmgpolar.h @@ -164,9 +164,8 @@ class GMGPolar : public IGMGPolar /* Solve Functions */ void fullMultigridApproximation(MultigridCycleType FMG_cycle, int FMG_iterations); void solveMultigrid(double& initial_residual_norm, double& current_residual_norm, - double& current_relative_residual_norm, HostConstVector exact_solution); - void solvePCG(double& initial_residual_norm, double& current_residual_norm, double& current_relative_residual_norm, - HostConstVector exact_solution); + double& current_relative_residual_norm); + void solvePCG(double& initial_residual_norm, double& current_residual_norm, double& current_relative_residual_norm); double residualNorm(const ResidualNormType& norm_type, const Level& level, HostConstVector residual) const; diff --git a/include/GMGPolar/solver.h b/include/GMGPolar/solver.h index 7cd6aa96..e7d4ddde 100644 --- a/include/GMGPolar/solver.h +++ b/include/GMGPolar/solver.h @@ -134,7 +134,7 @@ void GMGPolar::solve(const BoundaryC if (!PCG_) { // Solve A*x = b directly using multigrid cycles (V/W/F-cycle) // until convergence or max_iterations_ is reached. - solveMultigrid(initial_residual_norm, current_residual_norm, current_relative_residual_norm, exact_sol); + solveMultigrid(initial_residual_norm, current_residual_norm, current_relative_residual_norm); } else { // Solve A*x = b using Preconditioned Conjugate Gradient (PCG), @@ -142,7 +142,7 @@ void GMGPolar::solve(const BoundaryC // cycle approximates the action of A^{-1} at each PCG iteration). auto start_conjugate_gradient = std::chrono::high_resolution_clock::now(); - solvePCG(initial_residual_norm, current_residual_norm, current_relative_residual_norm, exact_sol); + solvePCG(initial_residual_norm, current_residual_norm, current_relative_residual_norm); auto end_conjugate_gradient = std::chrono::high_resolution_clock::now(); t_conjugate_gradient_ += @@ -228,8 +228,7 @@ void GMGPolar::fullMultigridApproxim template void GMGPolar::solveMultigrid(double& initial_residual_norm, double& current_residual_norm, - double& current_relative_residual_norm, - HostConstVector exact_solution) + double& current_relative_residual_norm) { Level& level = levels_[0]; @@ -304,8 +303,7 @@ void GMGPolar::solveMultigrid(double template void GMGPolar::solvePCG(double& initial_residual_norm, double& current_residual_norm, - double& current_relative_residual_norm, - HostConstVector exact_solution) + double& current_relative_residual_norm) { Level& level = levels_[0]; From dafcd2d96eb894926e664eaf0314a3d45e05c1d7 Mon Sep 17 00:00:00 2001 From: julianlitz Date: Thu, 21 May 2026 23:06:23 +0200 Subject: [PATCH 4/4] resolve const error --- include/GMGPolar/gmgpolar.h | 2 +- include/GMGPolar/solver.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/GMGPolar/gmgpolar.h b/include/GMGPolar/gmgpolar.h index fc50785f..48bf3fbd 100644 --- a/include/GMGPolar/gmgpolar.h +++ b/include/GMGPolar/gmgpolar.h @@ -140,7 +140,7 @@ class GMGPolar : public IGMGPolar std::pair evaluateExactError(const PolarGrid& grid, HostConstVector discrete_solution, HostConstVector analytical_solution_host, HostVector error); - void computeAnalyticalSolutionOnHost(const PolarGrid& grid, HostConstVector analytical_solution_host, + void computeAnalyticalSolutionOnHost(const PolarGrid& grid, HostVector analytical_solution_host, const ExactSolution& exact_solution); /* --------------- */ diff --git a/include/GMGPolar/solver.h b/include/GMGPolar/solver.h index e7d4ddde..c0659f07 100644 --- a/include/GMGPolar/solver.h +++ b/include/GMGPolar/solver.h @@ -593,7 +593,7 @@ std::pair GMGPolar:: HostVector error) { // Transfer the exact solution values from host to device memory for error computation. - Kokkos::deep_copy(error, analytical_solution_host); + Kokkos::deep_copy(error, analytical_solution_host_); // Compute the error as the difference between the exact and numerical solutions. subtract(error, discrete_solution); @@ -608,7 +608,7 @@ std::pair GMGPolar:: template void GMGPolar::computeAnalyticalSolutionOnHost( - const PolarGrid& grid, HostConstVector analytical_solution_host, const ExactSolution& exact_solution) + const PolarGrid& grid, HostVector analytical_solution_host, const ExactSolution& exact_solution) { assert(std::ssize(analytical_solution_host) == grid.numberOfNodes());