From 6a23c3631317beb67d674bb464cfbfffb82d0258 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 2 Jun 2026 16:39:16 +0000 Subject: [PATCH] Unroll inner loop - 2 rows at a time. Up to 1.5X faster. --- kernel/riscv64/gemv_n_vector.c | 54 +++++++++++++++++++++++++++++----- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c index 48eb4e2d68..845e4de296 100644 --- a/kernel/riscv64/gemv_n_vector.c +++ b/kernel/riscv64/gemv_n_vector.c @@ -49,12 +49,33 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { if (n < 0) return(0); - FLOAT *a_ptr, *y_ptr, temp; + FLOAT *a_ptr, *y_ptr, *a2_ptr, temp, temp2; BLASLONG i, j, vl; - FLOAT_V_T va, vy; + FLOAT_V_T va, vy, va2; if (inc_y == 1) { - for (j = 0; j < n; j++) { + for (j = 0; j < (n >> 1); j++) { + temp = alpha * x[0]; + temp2 = alpha * x[inc_x]; + y_ptr = y; + a_ptr = a; + a2_ptr = a + lda; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLEV_FLOAT(y_ptr, vl); + va = VLEV_FLOAT(a_ptr, vl); + va2 = VLEV_FLOAT(a2_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + vy = VFMACCVF_FLOAT(vy, temp2, va2, vl); + VSEV_FLOAT(y_ptr, vy, vl); + y_ptr += vl; + a_ptr += vl; + a2_ptr += vl; + } + x += inc_x * 2; + a += lda * 2; + } + if (n & 1) { temp = alpha * x[0]; y_ptr = y; a_ptr = a; @@ -67,12 +88,31 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += vl; a_ptr += vl; } - x += inc_x; - a += lda; } } else { BLASLONG stride_y = inc_y * sizeof(FLOAT); - for (j = 0; j < n; j++) { + for (j = 0; j < (n >> 1); j++) { + temp = alpha * x[0]; + temp2 = alpha * x[inc_x]; + y_ptr = y; + a_ptr = a; + a2_ptr = a + lda; + for (i = m; i > 0; i -= vl) { + vl = VSETVL(i); + vy = VLSEV_FLOAT(y_ptr, stride_y, vl); + va = VLEV_FLOAT(a_ptr, vl); + va2 = VLEV_FLOAT(a2_ptr, vl); + vy = VFMACCVF_FLOAT(vy, temp, va, vl); + vy = VFMACCVF_FLOAT(vy, temp2, va2, vl); + VSSEV_FLOAT(y_ptr, stride_y, vy, vl); + y_ptr += vl * inc_y; + a_ptr += vl; + a2_ptr += vl; + } + x += inc_x * 2; + a += lda * 2; + } + if (n & 1) { temp = alpha * x[0]; y_ptr = y; a_ptr = a; @@ -85,8 +125,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO y_ptr += vl * inc_y; a_ptr += vl; } - x += inc_x; - a += lda; } } return(0);