From 94a7ade2ed4cd162e4d514a70ed8258bb4f4d928 Mon Sep 17 00:00:00 2001 From: Rajesh Gandham Date: Thu, 28 May 2026 10:28:32 -0400 Subject: [PATCH] Remove pinned host memory from barrier solver Replace all pinned_dense_vector_t members in iteration_data_t with plain dense_vector_t, eliminating CPU<->GPU synchronization overhead from page-locked memory allocation. Removes 169 net lines. Vectors removed (pinned -> plain or deleted entirely): - 10 direction vectors (dw_aff, dx_aff, dy_aff, dv_aff, dz_aff and their corrector counterparts) - 5 RHS vectors (primal_rhs, bound_rhs, dual_rhs, complementarity_xz_rhs, complementarity_wv_rhs) - 5 residual vectors (primal_residual, bound_residual, dual_residual, complementarity_xz_residual, complementarity_wv_residual) - diag, inv_diag, inv_sqrt_diag (CPU-only, converted to dense_vector_t) - c, b (constants, converted; permanent d_b_ added to avoid per-iteration device_copy in compute_primal_dual_objective) - restrict_u_ (converted; permanent d_restrict_u_ added, copied once) - w, x, y, v, z, upper_bounds (state vectors, converted) Also removes the CPU compute_residuals function entirely (replaced by gpu_compute_residuals path) and simplifies gpu_compute_search_direction signature by removing unused pinned vector parameters. Validated on 179 benchmark problems (portfolio/maros/qplib): identical results vs baseline under --cudss-deterministic true. Co-Authored-By: Claude Sonnet 4.6 --- cpp/src/barrier/barrier.cu | 451 ++++++++++++------------------------ cpp/src/barrier/barrier.hpp | 15 +- 2 files changed, 148 insertions(+), 318 deletions(-) diff --git a/cpp/src/barrier/barrier.cu b/cpp/src/barrier/barrier.cu index e7604fb60e..5506160f7d 100644 --- a/cpp/src/barrier/barrier.cu +++ b/cpp/src/barrier/barrier.cu @@ -129,27 +129,7 @@ class iteration_data_t { A(lp.A), Q(Qin), cusparse_Q_view_(lp.handle_ptr, Q), - primal_residual(lp.num_rows), - bound_residual(num_upper_bounds), - dual_residual(lp.num_cols), - complementarity_xz_residual(lp.num_cols), - complementarity_wv_residual(num_upper_bounds), cusparse_view_(lp.handle_ptr, lp.A), - primal_rhs(lp.num_rows), - bound_rhs(num_upper_bounds), - dual_rhs(lp.num_cols), - complementarity_xz_rhs(lp.num_cols), - complementarity_wv_rhs(num_upper_bounds), - dw_aff(num_upper_bounds), - dx_aff(lp.num_cols), - dy_aff(lp.num_rows), - dv_aff(num_upper_bounds), - dz_aff(lp.num_cols), - dw(num_upper_bounds), - dx(lp.num_cols), - dy(lp.num_rows), - dv(num_upper_bounds), - dz(lp.num_cols), cusparse_info(lp.handle_ptr), device_AD(lp.num_cols, lp.num_rows, 0, lp.handle_ptr->get_stream()), device_A(lp.num_cols, lp.num_rows, 0, lp.handle_ptr->get_stream()), @@ -185,6 +165,7 @@ class iteration_data_t { d_r1_(lp.num_cols, lp.handle_ptr->get_stream()), d_r1_prime_(lp.num_cols, lp.handle_ptr->get_stream()), d_c_(lp.num_cols, lp.handle_ptr->get_stream()), + d_b_(lp.num_rows, lp.handle_ptr->get_stream()), d_upper_(0, lp.handle_ptr->get_stream()), d_u_(lp.A.n, lp.handle_ptr->get_stream()), d_upper_bounds_(0, lp.handle_ptr->get_stream()), @@ -215,6 +196,7 @@ class iteration_data_t { d_Q_diag_(0, lp.handle_ptr->get_stream()), d_Qx_(Qin.m, lp.handle_ptr->get_stream()), restrict_u_(0), + d_restrict_u_(0, lp.handle_ptr->get_stream()), transform_reduce_helper_(lp.handle_ptr->get_stream()), sum_reduce_helper_(lp.handle_ptr->get_stream()), indefinite_Q(false), @@ -235,6 +217,8 @@ class iteration_data_t { settings.log.printf("Free variables (QP) : %d\n", n_free_vars); } + raft::copy(d_c_.data(), c.data(), c.size(), stream_view_); + raft::copy(d_b_.data(), b.data(), b.size(), stream_view_); bool has_Q = Q.x.size() > 0; indefinite_Q = false; if (has_Q) { @@ -1494,15 +1478,15 @@ class iteration_data_t { raft::handle_t const* handle_ptr; i_t n_upper_bounds; - pinned_dense_vector_t upper_bounds; - pinned_dense_vector_t c; - pinned_dense_vector_t b; + dense_vector_t upper_bounds; + dense_vector_t c; + dense_vector_t b; - pinned_dense_vector_t w; - pinned_dense_vector_t x; + dense_vector_t w; + dense_vector_t x; dense_vector_t y; - pinned_dense_vector_t v; - pinned_dense_vector_t z; + dense_vector_t v; + dense_vector_t z; dense_vector_t w_save; dense_vector_t x_save; @@ -1516,9 +1500,9 @@ class iteration_data_t { f_t dual_residual_norm_save; f_t complementarity_residual_norm_save; - pinned_dense_vector_t diag; - pinned_dense_vector_t inv_diag; - pinned_dense_vector_t inv_sqrt_diag; + dense_vector_t diag; + dense_vector_t inv_diag; + dense_vector_t inv_sqrt_diag; rmm::device_uvector d_original_A_values; @@ -1568,29 +1552,6 @@ class iteration_data_t { bool has_solve_info; i_t num_factorizations; - pinned_dense_vector_t primal_residual; - pinned_dense_vector_t bound_residual; - pinned_dense_vector_t dual_residual; - pinned_dense_vector_t complementarity_xz_residual; - pinned_dense_vector_t complementarity_wv_residual; - - pinned_dense_vector_t primal_rhs; - pinned_dense_vector_t bound_rhs; - pinned_dense_vector_t dual_rhs; - pinned_dense_vector_t complementarity_xz_rhs; - pinned_dense_vector_t complementarity_wv_rhs; - - pinned_dense_vector_t dw_aff; - pinned_dense_vector_t dx_aff; - pinned_dense_vector_t dy_aff; - pinned_dense_vector_t dv_aff; - pinned_dense_vector_t dz_aff; - - pinned_dense_vector_t dw; - pinned_dense_vector_t dx; - pinned_dense_vector_t dy; - pinned_dense_vector_t dv; - pinned_dense_vector_t dz; cusparse_info_t cusparse_info; cusparse_view_t cusparse_view_; detail::cusparse_dn_vec_descr_wrapper_t cusparse_tmp4_; @@ -1624,6 +1585,7 @@ class iteration_data_t { rmm::device_uvector d_r1_; rmm::device_uvector d_r1_prime_; rmm::device_uvector d_c_; + rmm::device_uvector d_b_; rmm::device_uvector d_upper_; rmm::device_uvector d_u_; rmm::device_uvector d_upper_bounds_; @@ -1660,7 +1622,8 @@ class iteration_data_t { rmm::device_uvector d_Q_diag_; rmm::device_uvector d_Qx_; - pinned_dense_vector_t restrict_u_; + dense_vector_t restrict_u_; + rmm::device_uvector d_restrict_u_; transform_reduce_helper_t transform_reduce_helper_; sum_reduce_helper_t sum_reduce_helper_; @@ -1873,20 +1836,22 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } // Verify A*x = b - data.primal_residual = lp.rhs; - data.cusparse_view_.spmv(1.0, data.x, -1.0, data.primal_residual); + dense_vector_t primal_residual(lp.num_rows); + primal_residual = lp.rhs; + data.cusparse_view_.spmv(1.0, data.x, -1.0, primal_residual); data.handle_ptr->get_stream().synchronize(); #ifdef PRINT_INFO - settings.log.printf("||b - A * x||: %.16e\n", vector_norm2(data.primal_residual)); + settings.log.printf("||b - A * x||: %.16e\n", vector_norm2(primal_residual)); #endif if (data.n_upper_bounds > 0) { + dense_vector_t bound_residual(data.n_upper_bounds); for (i_t k = 0; k < data.n_upper_bounds; k++) { - i_t j = data.upper_bounds[k]; - data.bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j]; + i_t j = data.upper_bounds[k]; + bound_residual[k] = lp.upper[j] - data.w[k] - data.x[j]; } #ifdef PRINT_INFO - settings.log.printf("|| u - w - x||: %e\n", vector_norm2(data.bound_residual)); + settings.log.printf("|| u - w - x||: %e\n", vector_norm2(bound_residual)); #endif } @@ -1988,18 +1953,19 @@ int barrier_solver_t::initial_point(iteration_data_t& data) } // Verify A'*y + z - E*v - Q*x = c - data.z.pairwise_subtract(data.c, data.dual_residual); - if (data.Q.n > 0) { matrix_vector_multiply(data.Q, -1.0, data.x, 1.0, data.dual_residual); } - data.cusparse_view_.transpose_spmv(1.0, data.y, 1.0, data.dual_residual); + dense_vector_t dual_residual(lp.num_cols); + data.z.pairwise_subtract(data.c, dual_residual); + if (data.Q.n > 0) { matrix_vector_multiply(data.Q, -1.0, data.x, 1.0, dual_residual); } + data.cusparse_view_.transpose_spmv(1.0, data.y, 1.0, dual_residual); if (data.n_upper_bounds > 0) { for (i_t k = 0; k < data.n_upper_bounds; k++) { i_t j = data.upper_bounds[k]; - data.dual_residual[j] -= data.v[k]; + dual_residual[j] -= data.v[k]; } } #ifdef PRINT_INFO settings.log.printf("||A^T y + z - E*v - Q*x - c ||: %e\n", - vector_norm2(data.dual_residual)); + vector_norm2(dual_residual)); #endif // Make sure (w, x, v, z) > 0 if (data.n_free_vars > 0) { @@ -2036,24 +2002,10 @@ void barrier_solver_t::gpu_compute_residuals(const rmm::device_uvector { raft::common::nvtx::range fun_scope("Barrier: GPU compute_residuals"); - data.d_primal_residual_.resize(data.primal_residual.size(), stream_view_); + data.d_primal_residual_.resize(lp.num_rows, stream_view_); raft::copy(data.d_primal_residual_.data(), lp.rhs.data(), lp.rhs.size(), stream_view_); - data.d_dual_residual_.resize(data.dual_residual.size(), stream_view_); - raft::copy(data.d_dual_residual_.data(), - data.dual_residual.data(), - data.dual_residual.size(), - stream_view_); - data.d_upper_bounds_.resize(data.n_upper_bounds, stream_view_); - raft::copy( - data.d_upper_bounds_.data(), data.upper_bounds.data(), data.n_upper_bounds, stream_view_); - data.d_upper_.resize(lp.upper.size(), stream_view_); - raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream_view_); - data.d_bound_residual_.resize(data.bound_residual.size(), stream_view_); - raft::copy(data.d_bound_residual_.data(), - data.bound_residual.data(), - data.bound_residual.size(), - stream_view_); + data.d_dual_residual_.resize(lp.num_cols, stream_view_); // Compute primal_residual = b - A*x @@ -2120,69 +2072,6 @@ void barrier_solver_t::gpu_compute_residuals(const rmm::device_uvector cuda::std::multiplies<>{}, stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - raft::copy(data.complementarity_wv_residual.data(), - data.d_complementarity_wv_residual_.data(), - data.d_complementarity_wv_residual_.size(), - stream_view_); - raft::copy(data.complementarity_xz_residual.data(), - data.d_complementarity_xz_residual_.data(), - data.d_complementarity_xz_residual_.size(), - stream_view_); - raft::copy(data.dual_residual.data(), - data.d_dual_residual_.data(), - data.d_dual_residual_.size(), - stream_view_); - raft::copy(data.primal_residual.data(), - data.d_primal_residual_.data(), - data.d_primal_residual_.size(), - stream_view_); - raft::copy(data.bound_residual.data(), - data.d_bound_residual_.data(), - data.d_bound_residual_.size(), - stream_view_); - // Sync to make sure host data has been copied - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); -} - -template -template -void barrier_solver_t::compute_residuals(const dense_vector_t& w, - const dense_vector_t& x, - const dense_vector_t& y, - const dense_vector_t& v, - const dense_vector_t& z, - iteration_data_t& data) -{ - raft::common::nvtx::range fun_scope("Barrier: CPU compute_residuals"); - - // Compute primal_residual = b - A*x - data.primal_residual = lp.rhs; - data.cusparse_view_.spmv(-1.0, x, 1.0, data.primal_residual); - - // Compute bound_residual = E'*u - w - E'*x - if (data.n_upper_bounds > 0) { - for (i_t k = 0; k < data.n_upper_bounds; k++) { - i_t j = data.upper_bounds[k]; - data.bound_residual[k] = lp.upper[j] - w[k] - x[j]; - } - } - - // Compute dual_residual = c - A'*y - z + E*v + Q*x - data.c.pairwise_subtract(z, data.dual_residual); - data.cusparse_view_.transpose_spmv(-1.0, y, 1.0, data.dual_residual); - if (data.n_upper_bounds > 0) { - for (i_t k = 0; k < data.n_upper_bounds; k++) { - i_t j = data.upper_bounds[k]; - data.dual_residual[j] += v[k]; - } - } - if (data.Q.n > 0) { matrix_vector_multiply(data.Q, 1.0, x, 1.0, data.dual_residual); } - - // Compute complementarity_xz_residual = x.*z - x.pairwise_product(z, data.complementarity_xz_residual); - - // Compute complementarity_wv_residual = w.*v - w.pairwise_product(v, data.complementarity_wv_residual); } template @@ -2253,11 +2142,6 @@ f_t barrier_solver_t::gpu_max_step_to_boundary(iteration_data_t i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t& data, - pinned_dense_vector_t& dw, - pinned_dense_vector_t& dx, - pinned_dense_vector_t& dy, - pinned_dense_vector_t& dv, - pinned_dense_vector_t& dz, f_t& max_residual) { raft::common::nvtx::range fun_scope("Barrier: compute_search_direction"); @@ -2268,40 +2152,16 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t d_primal_rhs_save(lp.num_rows, stream_view_); + { raft::common::nvtx::range fun_scope("Barrier: GPU compute H"); // tmp3 <- E * ((complementarity_wv_rhs .- v .* bound_rhs) ./ w) @@ -2492,6 +2355,9 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t d_augmented_rhs(lp.num_cols + lp.num_rows, stream_view_); raft::copy(d_augmented_rhs.data(), data.d_r1_.data(), lp.num_cols, stream_view_); - raft::copy( - d_augmented_rhs.data() + lp.num_cols, data.primal_rhs.data(), lp.num_rows, stream_view_); + raft::copy(d_augmented_rhs.data() + lp.num_cols, d_primal_rhs_save.data(), lp.num_rows, stream_view_); rmm::device_uvector d_augmented_soln(lp.num_cols + lp.num_rows, stream_view_); data.chol->solve(d_augmented_rhs, d_augmented_soln); struct op_t { @@ -2533,8 +2398,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t x_residual = data.primal_rhs; - data.cusparse_view_.spmv(1.0, dx, -1.0, x_residual); - const f_t x_residual_norm = vector_norm_inf(x_residual, stream_view_); - max_residual = std::max(max_residual, x_residual_norm); - if (x_residual_norm > 1e-2) { - settings.log.printf("|| A * dx - rp || = %.2e\n", x_residual_norm); - } - } - { raft::common::nvtx::range fun_scope("Barrier: dz formation GPU"); @@ -2803,7 +2647,6 @@ i_t barrier_solver_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::gpu_compute_search_direction(iteration_data_t::compute_affine_rhs(iteration_data_t& { raft::common::nvtx::range fun_scope("Barrier: compute_affine_rhs"); - data.primal_rhs = data.primal_residual; - data.bound_rhs = data.bound_residual; - data.dual_rhs = data.dual_residual; - + // D2D: RHS = residuals (all on device) + raft::copy(data.d_h_.data(), data.d_primal_residual_.data(), data.d_primal_residual_.size(), stream_view_); + raft::copy(data.d_dual_rhs_.data(), data.d_dual_residual_.data(), data.d_dual_residual_.size(), stream_view_); + data.d_bound_rhs_.resize(data.d_bound_residual_.size(), stream_view_); + raft::copy(data.d_bound_rhs_.data(), data.d_bound_residual_.data(), data.d_bound_residual_.size(), stream_view_); + data.d_dw_.resize(data.d_bound_residual_.size(), stream_view_); + raft::copy(data.d_dw_.data(), data.d_bound_residual_.data(), data.d_bound_residual_.size(), stream_view_); raft::copy(data.d_complementarity_xz_rhs_.data(), data.d_complementarity_xz_residual_.data(), data.d_complementarity_xz_residual_.size(), @@ -3025,16 +2869,6 @@ void barrier_solver_t::compute_target_mu( f_t complementarity_aff_sum = 0.0; // TMP no copy and data should always be on the GPU - data.d_dw_aff_.resize(data.dw_aff.size(), stream_view_); - data.d_dx_aff_.resize(data.dx_aff.size(), stream_view_); - data.d_dv_aff_.resize(data.dv_aff.size(), stream_view_); - data.d_dz_aff_.resize(data.dz_aff.size(), stream_view_); - - raft::copy(data.d_dw_aff_.data(), data.dw_aff.data(), data.dw_aff.size(), stream_view_); - raft::copy(data.d_dx_aff_.data(), data.dx_aff.data(), data.dx_aff.size(), stream_view_); - raft::copy(data.d_dv_aff_.data(), data.dv_aff.data(), data.dv_aff.size(), stream_view_); - raft::copy(data.d_dz_aff_.data(), data.dz_aff.data(), data.dz_aff.size(), stream_view_); - f_t step_primal_aff = std::min(gpu_max_step_to_boundary(data, data.d_w_, data.d_dw_aff_), gpu_max_step_to_boundary(data, data.d_x_, data.d_dx_aff_)); f_t step_dual_aff = std::min(gpu_max_step_to_boundary(data, data.d_v_, data.d_dv_aff_), @@ -3148,31 +2982,18 @@ void barrier_solver_t::compute_cc_rhs(iteration_data_t& data stream_view_.value()); RAFT_CHECK_CUDA(stream_view_); - // TMP should be CPU to 0 if CPU and GPU to 0 if GPU - data.primal_rhs.set_scalar(0.0); - data.bound_rhs.set_scalar(0.0); - data.dual_rhs.set_scalar(0.0); + // Zero the corrector RHS on device + thrust::fill(rmm::exec_policy(stream_view_), data.d_h_.begin(), data.d_h_.end(), f_t(0)); + thrust::fill(rmm::exec_policy(stream_view_), data.d_dual_rhs_.begin(), data.d_dual_rhs_.end(), f_t(0)); + thrust::fill(rmm::exec_policy(stream_view_), data.d_bound_rhs_.begin(), data.d_bound_rhs_.end(), f_t(0)); + thrust::fill(rmm::exec_policy(stream_view_), data.d_dw_.begin(), data.d_dw_.end(), f_t(0)); } template void barrier_solver_t::compute_final_direction(iteration_data_t& data) { raft::common::nvtx::range fun_scope("Barrier: compute_final_direction"); - // TODO Nicolas: Redundant copies - data.d_y_.resize(data.y.size(), stream_view_); - data.d_dy_aff_.resize(data.dy_aff.size(), stream_view_); - raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream_view_); - raft::copy(data.d_dy_aff_.data(), data.dy_aff.data(), data.dy_aff.size(), stream_view_); - -#ifdef FINITE_CHECK - for (i_t i = 0; i < (int)data.y.size(); i++) { - cuopt_assert(std::isfinite(data.y[i]), "data.d_y_[i] is not finite"); - } - for (i_t i = 0; i < (int)data.dy_aff.size(); i++) { - cuopt_assert(std::isfinite(data.dy_aff[i]), "data.dy_aff_[i] is not finite"); - } -#endif // dw = dw_aff + dw_cc // dx = dx_aff + dx_cc @@ -3293,15 +3114,9 @@ void barrier_solver_t::compute_next_iterate(iteration_data_t }); } - raft::copy(data.w.data(), data.d_w_.data(), data.d_w_.size(), stream_view_); - raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); - raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); - raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); - raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); - // Sync to make sure all host variable are done copying - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); } + template void barrier_solver_t::compute_residual_norms(iteration_data_t& data, f_t& primal_residual_norm, @@ -3345,8 +3160,6 @@ void barrier_solver_t::compute_primal_dual_objective(iteration_data_t< { raft::common::nvtx::range fun_scope("Barrier: compute_primal_dual_objective"); raft::copy(data.d_c_.data(), data.c.data(), data.c.size(), stream_view_); - auto d_b = device_copy(data.b, stream_view_); - auto d_restrict_u = device_copy(data.restrict_u_, stream_view_); rmm::device_scalar d_cx(stream_view_); rmm::device_scalar d_by(stream_view_); rmm::device_scalar d_uv(stream_view_); @@ -3360,16 +3173,16 @@ void barrier_solver_t::compute_primal_dual_objective(iteration_data_t< d_cx.data(), stream_view_)); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(lp.handle_ptr->get_cublas_handle(), - d_b.size(), - d_b.data(), + data.d_b_.size(), + data.d_b_.data(), 1, data.d_y_.data(), 1, d_by.data(), stream_view_)); RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(lp.handle_ptr->get_cublas_handle(), - d_restrict_u.size(), - d_restrict_u.data(), + data.d_restrict_u_.size(), + data.d_restrict_u_.data(), 1, data.d_v_.data(), 1, @@ -3506,12 +3319,17 @@ lp_status_t barrier_solver_t::check_for_suboptimal_solution( relative_dual_residual < settings.barrier_relaxed_optimality_tol && relative_complementarity_residual < settings.barrier_relaxed_complementarity_tol && primal_objective == primal_objective) { + raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); + raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); + raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); + raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); data.to_solution(lp, iter, primal_objective, compute_user_objective(lp, primal_objective), - vector_norm2(data.primal_residual), - vector_norm2(data.dual_residual), + primal_residual_norm, + dual_residual_norm, data.cusparse_view_, solution); settings.log.printf("\n"); @@ -3545,6 +3363,11 @@ lp_status_t barrier_solver_t::check_for_suboptimal_solution( data.relative_dual_residual_save < settings.barrier_relaxed_optimality_tol && data.relative_complementarity_residual_save < settings.barrier_relaxed_complementarity_tol) { settings.log.printf("Restoring previous solution\n"); + raft::copy(data.x.data(), data.d_x_.data(), data.d_x_.size(), stream_view_); + raft::copy(data.y.data(), data.d_y_.data(), data.d_y_.size(), stream_view_); + raft::copy(data.z.data(), data.d_z_.data(), data.d_z_.size(), stream_view_); + raft::copy(data.v.data(), data.d_v_.data(), data.d_v_.size(), stream_view_); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); data.to_solution(lp, iter, primal_objective_save, @@ -3651,17 +3474,37 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t>(data.w, data.x, data.y, data.v, data.z, data); - - f_t primal_residual_norm = - std::max(vector_norm_inf(data.primal_residual, stream_view_), - vector_norm_inf(data.bound_residual, stream_view_)); - f_t dual_residual_norm = vector_norm_inf(data.dual_residual, stream_view_); - f_t complementarity_residual_norm = - std::max(vector_norm_inf(data.complementarity_xz_residual, stream_view_), - vector_norm_inf(data.complementarity_wv_residual, stream_view_)); - f_t mu = (data.complementarity_xz_residual.sum() + data.complementarity_wv_residual.sum()) / - (static_cast(n) + static_cast(num_upper_bounds)); + // Upload initial point to device and compute initial residuals/norms on GPU + data.d_complementarity_wv_residual_.resize(data.n_upper_bounds, stream_view_); + data.d_complementarity_wv_rhs_.resize(data.n_upper_bounds, stream_view_); + data.d_x_.resize(data.x.size(), stream_view_); + raft::copy(data.d_x_.data(), data.x.data(), data.x.size(), stream_view_); + data.d_y_.resize(data.y.size(), stream_view_); + raft::copy(data.d_y_.data(), data.y.data(), data.y.size(), stream_view_); + data.d_z_.resize(data.z.size(), stream_view_); + raft::copy(data.d_z_.data(), data.z.data(), data.z.size(), stream_view_); + data.d_w_.resize(data.w.size(), stream_view_); + raft::copy(data.d_w_.data(), data.w.data(), data.w.size(), stream_view_); + data.d_v_.resize(data.v.size(), stream_view_); + raft::copy(data.d_v_.data(), data.v.data(), data.v.size(), stream_view_); + data.d_upper_bounds_.resize(data.upper_bounds.size(), stream_view_); + raft::copy(data.d_upper_bounds_.data(), data.upper_bounds.data(), data.upper_bounds.size(), stream_view_); + data.d_upper_.resize(lp.upper.size(), stream_view_); + raft::copy(data.d_upper_.data(), lp.upper.data(), lp.upper.size(), stream_view_); + data.d_bound_residual_.resize(data.n_upper_bounds, stream_view_); + + f_t primal_residual_norm, dual_residual_norm, complementarity_residual_norm; + gpu_compute_residual_norms(data.d_w_, + data.d_x_, + data.d_y_, + data.d_v_, + data.d_z_, + data, + primal_residual_norm, + dual_residual_norm, + complementarity_residual_norm); + f_t mu; + compute_mu(data, mu); f_t norm_b = vector_norm_inf(data.b, stream_view_); f_t norm_c = vector_norm_inf(data.c, stream_view_); @@ -3683,9 +3526,17 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t upper(lp.upper); data.gather_upper_bounds(upper, data.restrict_u_); + data.d_restrict_u_.resize(data.restrict_u_.size(), stream_view_); + raft::copy(data.d_restrict_u_.data(), data.restrict_u_.data(), data.restrict_u_.size(), stream_view_); f_t dual_objective = data.b.inner_product(data.y) - data.restrict_u_.inner_product(data.v) - quad_objective; + data.w_save = data.w; + data.x_save = data.x; + data.y_save = data.y; + data.v_save = data.v; + data.z_save = data.z; + i_t iter = 0; settings.log.printf("\n"); settings.log.printf( @@ -3706,35 +3557,6 @@ lp_status_t barrier_solver_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t::solve(f_t start_time, lp_solution_t(data.primal_residual), - vector_norm2(data.dual_residual), + primal_residual_norm, + dual_residual_norm, data.cusparse_view_, solution); return lp_status_t::ITERATION_LIMIT; diff --git a/cpp/src/barrier/barrier.hpp b/cpp/src/barrier/barrier.hpp index 4d25fa930e..ace639d30b 100644 --- a/cpp/src/barrier/barrier.hpp +++ b/cpp/src/barrier/barrier.hpp @@ -44,13 +44,6 @@ class barrier_solver_t { f_t& dual_residual_norm, f_t& complementarity_residual_norm); - template - void compute_residuals(const dense_vector_t& w, - const dense_vector_t& x, - const dense_vector_t& y, - const dense_vector_t& v, - const dense_vector_t& z, - iteration_data_t& data); void compute_primal_dual_step_length(iteration_data_t& data, f_t step_scale, f_t& step_primal, @@ -95,13 +88,7 @@ class barrier_solver_t { f_t gpu_max_step_to_boundary(iteration_data_t& data, const rmm::device_uvector& x, const rmm::device_uvector& dx); - i_t gpu_compute_search_direction(iteration_data_t& data, - pinned_dense_vector_t& dw, - pinned_dense_vector_t& dx, - pinned_dense_vector_t& dy, - pinned_dense_vector_t& dv, - pinned_dense_vector_t& dz, - f_t& max_residual); + i_t gpu_compute_search_direction(iteration_data_t& data, f_t& max_residual); private: lp_status_t check_for_suboptimal_solution(iteration_data_t& data,