From 3863a7778d4fc26653e721d2f5d7d86063a78194 Mon Sep 17 00:00:00 2001 From: Amrita H S Date: Wed, 3 Jun 2026 02:34:53 -0500 Subject: [PATCH] Power10: Replace vector pair loads with __builtin_vsx_lxvp Replace normal vector pair pointer dereferences with the optimized __builtin_vsx_lxvp builtin across DGEMM, ZGEMM, and DGEMV kernels. Also fix some indentation in dgemm_kernel_power10.c. This is done as part of the POWER code cleanup and is not expected to have any performance impact. Signed-off-by: Amrita H S --- kernel/power/dgemm_kernel_power10.c | 1494 +++++++++--------- kernel/power/dgemm_small_kernel_nn_power10.c | 4 +- kernel/power/dgemm_small_kernel_nt_power10.c | 6 +- kernel/power/dgemm_small_kernel_tt_power10.c | 6 +- kernel/power/dgemv_t_power10.c | 18 +- kernel/power/gemm_common.c | 8 +- kernel/power/zgemm_kernel_power10.c | 120 +- 7 files changed, 809 insertions(+), 847 deletions(-) diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index f5cc3dfede..6ec6ee51b2 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -38,875 +38,837 @@ typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); #endif #ifdef TRMMKERNEL -#define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[0] * alpha; \ - rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] = result[1] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] = result[3] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] = result[0] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] = result[1] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ - rowC[0] = result[2] * alpha; \ - rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] = result[3] * alpha; -#define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc
((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] = result[0] * alpha; \ - rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] = result[1] * alpha; +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] = result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] = result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] = result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] = result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] = result[1] * alpha; #else -#define SAVE_ACC(ACC, J) \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[0] * alpha; \ - rowC = (v4sf_t *) &CO[1*ldc+J]; \ - rowC[0] += result[1] * alpha; \ - rowC = (v4sf_t *) &CO[2*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[3*ldc+J]; \ - rowC[0] += result[3] * alpha; -#define SAVE_ACC1(ACC, J) \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[4* ldc+J]; \ - rowC[0] += result[0] * alpha; \ - rowC = (v4sf_t *) &CO[5*ldc+J]; \ - rowC[0] += result[1] * alpha; \ - rowC = (v4sf_t *) &CO[6*ldc+J]; \ - rowC[0] += result[2] * alpha; \ - rowC = (v4sf_t *) &CO[7*ldc+J]; \ - rowC[0] += result[3] * alpha; -#define SAVE2x4_ACC(ACC, J) \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - rowC = (v4sf_t *) &CO[0* ldc+J]; \ - rowC[0] += result[0] * 
alpha; \ - rowC = (v4sf_t *) &CO[1* ldc+J]; \ - rowC[0] += result[1] * alpha; +#define SAVE_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[2*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[3*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE_ACC1(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[4* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[5*ldc+J]; \ + rowC[0] += result[1] * alpha; \ + rowC = (v4sf_t *) &CO[6*ldc+J]; \ + rowC[0] += result[2] * alpha; \ + rowC = (v4sf_t *) &CO[7*ldc+J]; \ + rowC[0] += result[3] * alpha; +#define SAVE2x4_ACC(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + rowC = (v4sf_t *) &CO[0* ldc+J]; \ + rowC[0] += result[0] * alpha; \ + rowC = (v4sf_t *) &CO[1* ldc+J]; \ + rowC[0] += result[1] * alpha; #endif #define KERNEL(i) \ - rowA = (vec_t *)&AO[(i)<< 3];\ - rowB = *((__vector_pair *)((void *)&BO[(i) << 3]));\ - rowB1 = *((__vector_pair *)((void *)&BO[((i) << 3) + 4]));\ - __builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\ - __builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\ - __builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\ - __builtin_mma_xvf64gerpp(&acc3, rowB1, rowA[1]);\ - __builtin_mma_xvf64gerpp(&acc4, rowB, rowA[2]);\ - __builtin_mma_xvf64gerpp(&acc5, rowB1, rowA[2]);\ - __builtin_mma_xvf64gerpp(&acc6, rowB, rowA[3]);\ - __builtin_mma_xvf64gerpp(&acc7, rowB1, rowA[3]); + rowA = (vec_t *)&AO[(i)<< 3];\ + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(i) << 3])); \ + rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[((i) << 3) + 4])); \ + __builtin_mma_xvf64gerpp(&acc0, rowB, rowA[0]);\ + __builtin_mma_xvf64gerpp(&acc1, rowB1, rowA[0]);\ + __builtin_mma_xvf64gerpp(&acc2, rowB, rowA[1]);\ + 
__builtin_mma_xvf64gerpp(&acc3, rowB1, rowA[1]);\ + __builtin_mma_xvf64gerpp(&acc4, rowB, rowA[2]);\ + __builtin_mma_xvf64gerpp(&acc5, rowB1, rowA[2]);\ + __builtin_mma_xvf64gerpp(&acc6, rowB, rowA[3]);\ + __builtin_mma_xvf64gerpp(&acc7, rowB1, rowA[3]); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) #define REFRESH_TEMP_BK(x, y) \ - temp = k - off; + temp = k - off; #elif defined(LEFT) #define REFRESH_TEMP_BK(x, y) \ - temp = off + x; + temp = off + x; #else #define REFRESH_TEMP_BK(x, y) \ - temp = off + y; + temp = off + y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_POINTERS(x, y) \ - BO = B; \ - REFRESH_TEMP_BK(x, y) + BO = B; \ + REFRESH_TEMP_BK(x, y) #else #define REFRESH_POINTERS(x, y) \ - AO += off * x; \ - BO = B + off * y; \ - REFRESH_TEMP_BK(x, y) + AO += off * x; \ + BO = B + off * y; \ + REFRESH_TEMP_BK(x, y) #endif #ifdef LEFT #define REFRESH_OFF(x) \ - off += x; + off += x; #else #define REFRESH_OFF(x) #endif #ifdef LEFT #define UPDATE_TEMP(x, y) \ - temp -= x; + temp -= x; #else #define UPDATE_TEMP(x, y) \ - temp -= y; + temp -= y; #endif #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #define REFRESH_TMP_AFTER_SAVE(x, y) \ - temp = k - off; \ - UPDATE_TEMP(x, y) \ - AO += temp * x; \ - BO += temp * y; + temp = k - off; \ + UPDATE_TEMP(x, y) \ + AO += temp * x; \ + BO += temp * y; #else #define REFRESH_TMP_AFTER_SAVE(x, y) #endif #define REFRESH_AFTER_SAVE(x,y) \ - REFRESH_TMP_AFTER_SAVE(x, y) \ - REFRESH_OFF(x) + REFRESH_TMP_AFTER_SAVE(x, y) \ + REFRESH_OFF(x) /************************************************************************************* * GEMM Kernel *************************************************************************************/ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, - FLOAT * C, 
BLASLONG ldc + FLOAT * C, BLASLONG ldc #ifdef TRMMKERNEL - , BLASLONG offset + , BLASLONG offset #endif - ) +) { - BLASLONG i1; + BLASLONG i1; #if defined(TRMMKERNEL) - BLASLONG off; + BLASLONG off; #endif #if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#endif - v4sf_t valpha = { alpha, alpha }; - for (i1 = 0; i1 < (n >> 3); i1++) - { - BLASLONG j, temp; - FLOAT *CO; - FLOAT *AO; + off = -offset; +#endif + v4sf_t valpha = { alpha, alpha }; + for (i1 = 0; i1 < (n >> 3); i1++) { + BLASLONG j, temp; + FLOAT *CO; + FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - CO = C; - C += ldc << 3; - AO = A; - PREFETCH1 (A, 128); - PREFETCH1 (A, 256); - for (j = 0; j < (m >> 3); j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 8); + off = offset; +#endif + CO = C; + C += ldc << 3; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 8); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; - BLASLONG l = 1; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB, rowB1; - rowB = *((__vector_pair *)((void *)&BO[0])); - rowB1 = *((__vector_pair *)((void *)&BO[4])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); - for (l = 1; l + 15 < temp; l += 16) - { - KERNEL (l); - KERNEL (l+1); - KERNEL (l+2); - KERNEL (l+3); - KERNEL (l+4); - KERNEL (l+5); - KERNEL (l+6); - KERNEL (l+7); - KERNEL (l+8); - KERNEL (l+9); - KERNEL (l+10); - KERNEL (l+11); - KERNEL (l+12); - KERNEL (l+13); - KERNEL (l+14); - KERNEL 
(l+15); - } - if ((temp - l) & 8) - { - KERNEL(l); - KERNEL(l+1); - KERNEL(l+2); - KERNEL(l+3); - KERNEL(l+4); - KERNEL(l+5); - KERNEL(l+6); - KERNEL(l+7); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; + BLASLONG l = 1; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); + for (l = 1; l + 15 < temp; l += 16) { + KERNEL (l); + KERNEL (l+1); + KERNEL (l+2); + KERNEL (l+3); + KERNEL (l+4); + KERNEL (l+5); + KERNEL (l+6); + KERNEL (l+7); + KERNEL (l+8); + KERNEL (l+9); + KERNEL (l+10); + KERNEL (l+11); + KERNEL (l+12); + KERNEL (l+13); + KERNEL (l+14); + KERNEL (l+15); + } + if ((temp - l) & 8) { + KERNEL(l); + KERNEL(l+1); + KERNEL(l+2); + KERNEL(l+3); + KERNEL(l+4); + KERNEL(l+5); + KERNEL(l+6); + KERNEL(l+7); l += 8; - } - if ((temp - l) & 4) - { - KERNEL(l); - KERNEL(l+1); - KERNEL(l+2); - KERNEL(l+3); - l += 4; - } - if ((temp - l) & 2) - { - KERNEL(l); - KERNEL(l+1); - l += 2; - } - if ((temp - l) & 1) - { - KERNEL(l); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC1 (&acc1, 0); - SAVE_ACC (&acc2, 2); - SAVE_ACC1 (&acc3, 2); - SAVE_ACC (&acc4, 4); - SAVE_ACC1 (&acc5, 4); - SAVE_ACC (&acc6, 6); - SAVE_ACC1 (&acc7, 6); - CO += 8; - AO += temp << 3; - BO += temp << 3; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 8) -#endif - } - if (m & 4) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 8); + } + if ((temp - l) & 4) { + KERNEL(l); + 
KERNEL(l+1); + KERNEL(l+2); + KERNEL(l+3); + l += 4; + } + if ((temp - l) & 2) { + KERNEL(l); + KERNEL(l+1); + l += 2; + } + if ((temp - l) & 1) { + KERNEL(l); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + SAVE_ACC (&acc4, 4); + SAVE_ACC1 (&acc5, 4); + SAVE_ACC (&acc6, 6); + SAVE_ACC1 (&acc7, 6); + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + if (m & 4) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3; - BLASLONG l = 0; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB, rowB1; - rowB = *((__vector_pair *)((void *)&BO[0])); - rowB1 = *((__vector_pair *)((void *)&BO[4])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 2]; - rowB = *((__vector_pair *)((void *)&BO[l << 3])); - rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC1 (&acc1, 0); - SAVE_ACC (&acc2, 2); - SAVE_ACC1 (&acc3, 2); - CO += 4; - AO += temp << 2; - BO += temp << 3; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 8) -#endif - } - if (m & 2) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 8); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + rowB1 = __builtin_vsx_lxvp(0, 
(__vector_pair *)((void *)&BO[4])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + for (l = 1; l < temp; l++) { + rowA = (vec_t *) & AO[l << 2]; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3])); + rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4])); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + if (m & 2) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1; - BLASLONG l = 0; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB, rowB1; - rowB = *((__vector_pair *)((void *)&BO[0])); - rowB1 = *((__vector_pair *)((void *)&BO[4])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 1]; - rowB = *((__vector_pair *)((void *)&BO[l << 3])); - rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC1 (&acc1, 0); - CO += 2; - AO += temp << 1; - BO += temp << 3; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 8) -#endif - } - if (m & 1) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 8); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + 
__vector_pair rowB, rowB1; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[4])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + for (l = 1; l < temp; l++) { + rowA = (vec_t *) & AO[l << 1]; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 3])); + rowB1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[(l << 3) + 4])); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 2; + AO += temp << 1; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 8) +#endif + } + if (m & 1) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowA = { AO[l], AO[l] }; - v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; - v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; - v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; - v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; - t += rowA * rowB; - t1 += rowA * rowB1; - t2 += rowA * rowB2; - t3 += rowA * rowB3; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; -#if defined(TRMMKERNEL) - CO[0 * ldc] = t[0]; - CO[1 * ldc] = t[1]; - CO[2 * ldc] = t1[0]; - CO[3 * ldc] = t1[1]; - CO[4 * ldc] = t2[0]; - CO[5 * ldc] = t2[1]; - CO[6 * ldc] = t3[0]; - CO[7 * ldc] = t3[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; + v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; + v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) 
+ 5] }; + v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA * rowB1; + t2 += rowA * rowB2; + t3 += rowA * rowB3; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; + CO[4 * ldc] = t2[0]; + CO[5 * ldc] = t2[1]; + CO[6 * ldc] = t3[0]; + CO[7 * ldc] = t3[1]; #else - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t1[0]; - CO[3 * ldc] += t1[1]; - CO[4 * ldc] += t2[0]; - CO[5 * ldc] += t2[1]; - CO[6 * ldc] += t3[0]; - CO[7 * ldc] += t3[1]; -#endif - CO += 1; - AO += temp; - BO += temp << 3; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 8) -#endif - } + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; + CO[4 * ldc] += t2[0]; + CO[5 * ldc] += t2[1]; + CO[6 * ldc] += t3[0]; + CO[7 * ldc] += t3[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 8; // number of values in A + off += 8; #endif - B += k << 3; + B += k << 3; } - if (n & 4) - { - BLASLONG j, temp; - FLOAT *CO; - FLOAT *AO; + if (n & 4) { + BLASLONG j, temp; + FLOAT *CO; + FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - CO = C; - C += ldc << 2; - AO = A; - PREFETCH1 (A, 128); - PREFETCH1 (A, 256); - for (j = 0; j < (m >> 3); j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 4); + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 4); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3; - BLASLONG l = 0; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; - rowB = *((__vector_pair *)((void 
*)&BO[0])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 3]; - rowB = *((__vector_pair *)((void *)&BO[l << 2])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc2, 4); - SAVE_ACC (&acc1, 2); - SAVE_ACC (&acc3, 6); - CO += 8; - AO += temp << 3; - BO += temp << 2; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 4) -#endif - } - if (m & 4) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 4); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { + rowA = (vec_t *) & AO[l << 3]; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2])); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc2, 4); + SAVE_ACC (&acc1, 2); + SAVE_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 4) +#endif + } + if (m & 4) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 4); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1; - 
BLASLONG l = 0; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; - rowB = *((__vector_pair *)((void *)&BO[0])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 2]; - rowB = *((__vector_pair *)((void *)&BO[l << 2])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - } - SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc1, 2); - CO += 4; - AO += temp << 2; - BO += temp << 2; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 4) -#endif - } - if (m & 2) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 4); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { + rowA = (vec_t *) & AO[l << 2]; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2])); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 4) +#endif + } + if (m & 2) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 4); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0; - BLASLONG l = 0; - vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; - rowB = *((__vector_pair *)((void *)&BO[0])); - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - for (l = 1; l < temp; l++) - { - rowA = (vec_t *) & AO[l << 1]; - rowB = *((__vector_pair *)((void *)&BO[l << 2])); - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - } - SAVE_ACC (&acc0, 0); - CO += 2; - AO += temp << 1; - BO += 
temp << 2; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 4) -#endif - } - if (m & 1) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 4); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[0])); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { + rowA = (vec_t *) & AO[l << 1]; + rowB = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&BO[l << 2])); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + SAVE_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 2; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 4) +#endif + } + if (m & 1) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 4); #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowA = { AO[l], AO[l] }; - v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; - v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; - t += rowA * rowB; - t1 += rowA * rowB1; - } - t = t * valpha; - t1 = t1 * valpha; -#if defined(TRMMKERNEL) - CO[0 * ldc] = t[0]; - CO[1 * ldc] = t[1]; - CO[2 * ldc] = t1[0]; - CO[3 * ldc] = t1[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; + v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA * rowB1; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; #else - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; - CO[2 * ldc] += t1[0]; - CO[3 * ldc] += t1[1]; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; #endif - CO += 1; - 
AO += temp; - BO += temp << 2; + CO += 1; + AO += temp; + BO += temp << 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 4) + REFRESH_AFTER_SAVE (1, 4) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; // number of values in A + off += 4; #endif - B += k << 2; + B += k << 2; } - if (n & 2) - { - BLASLONG j, temp; + if (n & 2) { + BLASLONG j, temp; #if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - FLOAT *CO; - FLOAT *AO; - CO = C; - C += ldc << 1; - AO = A; - for (j = 0; j < (m >> 3); j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 2); + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc << 1; + AO = A; + for (j = 0; j < (m >> 3); j++) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 2); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3; - BLASLONG l = 0; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - for (l = 1; l < temp; l++) - { - rb = (vec_t *) & BO[l << 1]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - rowA = (vec_t *) & AO[l << 3]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 2); - SAVE2x4_ACC (&acc2, 4); - SAVE2x4_ACC (&acc3, 6); - CO += 8; - AO += temp << 3; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 2) -#endif - } - if (m & 4) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 2); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad 
acc0, acc1, acc2, acc3; + BLASLONG l = 0; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); + for (l = 1; l < temp; l++) { + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + rowA = (vec_t *) & AO[l << 3]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + SAVE2x4_ACC (&acc2, 4); + SAVE2x4_ACC (&acc3, 6); + CO += 8; + AO += temp << 3; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 2) +#endif + } + if (m & 4) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 2); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1; - BLASLONG l = 0; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - for (l = 1; l < temp; l++) - { - rb = (vec_t *) & BO[l << 1]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - rowA = (vec_t *) & AO[l << 2]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 2); - CO += 4; - AO += temp << 2; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 2) -#endif - } - if (m & 2) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 2); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + 
BLASLONG l = 0; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); + for (l = 1; l < temp; l++) { + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + rowA = (vec_t *) & AO[l << 2]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); + } + SAVE2x4_ACC (&acc0, 0); + SAVE2x4_ACC (&acc1, 2); + CO += 4; + AO += temp << 2; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 2) +#endif + } + if (m & 2) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 2); #else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0; - BLASLONG l = 0; - __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - for (l = 1; l < temp; l++) - { - rb = (vec_t *) & BO[l << 1]; - __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); - rowA = (vec_t *) & AO[l << 1]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - } - SAVE2x4_ACC (&acc0, 0); - CO += 2; - AO += temp << 1; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 2) -#endif - } - if (m & 1) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 2); + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0; + BLASLONG l = 0; + __vector_pair rowB; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + vec_t *rowA = (vec_t *) & AO[0]; + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + for (l = 1; l < temp; l++) { + rb = (vec_t *) & BO[l << 1]; + __builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]); + rowA = (vec_t *) & AO[l << 1]; + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + } + 
SAVE2x4_ACC (&acc0, 0); + CO += 2; + AO += temp << 1; + BO += temp << 1; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (2, 2) +#endif + } + if (m & 1) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 2); #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowA = { AO[l], AO[l] }; - v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; - t += rowA * rowB; - } - t = t * valpha; -#if defined(TRMMKERNEL) - CO[0 * ldc] = t[0]; - CO[1 * ldc] = t[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; #else - CO[0 * ldc] += t[0]; - CO[1 * ldc] += t[1]; + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; #endif - CO += 1; - AO += temp; - BO += temp << 1; + CO += 1; + AO += temp; + BO += temp << 1; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 2) + REFRESH_AFTER_SAVE (1, 2) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; // number of values in A + off += 2; #endif - B += k << 1; + B += k << 1; } - if (n & 1) - { - BLASLONG i, temp; + if (n & 1) { + BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - FLOAT *CO; - FLOAT *AO; - CO = C; - C += ldc; - AO = A; - for (i = 0; i < (m >> 3); i++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (8, 1) + off = offset; +#endif + FLOAT *CO; + FLOAT *AO; + CO = C; + C += ldc; + AO = A; + for (i = 0; i < (m >> 3); i++) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (8, 1) #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; - v4sf_t rowA1 = { 
AO[(l << 3) + 2], AO[(l << 3) + 3] }; - v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; - v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; - CO[4] = t2[0]; - CO[5] = t2[1]; - CO[6] = t3[0]; - CO[7] = t3[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t t3 = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; + v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; + v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; + v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + t2 += rowA2 * rowB; + t3 += rowA3 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; + CO[4] = t2[0]; + CO[5] = t2[1]; + CO[6] = t3[0]; + CO[7] = t3[1]; #else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; - CO[4] += t2[0]; - CO[5] += t2[1]; - CO[6] += t3[0]; - CO[7] += t3[1]; -#endif - AO += temp << 3; - BO += temp; - CO += 8; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (8, 1) -#endif - } - if (m & 4) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (4, 1) + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; + CO[4] += t2[0]; + CO[5] += t2[1]; + CO[6] += t3[0]; + CO[7] += t3[1]; +#endif + AO += temp << 3; + BO += temp; + CO += 8; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 1) +#endif + } + if (m & 4) { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 1) #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - 
v4sf_t t1 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; - v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; + v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; + t += rowA * rowB; + t1 += rowA1 * rowB; + } + t = t * valpha; + t1 = t1 * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; + CO[2] = t1[0]; + CO[3] = t1[1]; #else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; + CO[0] += t[0]; + CO[1] += t[1]; + CO[2] += t1[0]; + CO[3] += t1[1]; #endif - AO += temp << 2; - BO += temp; - CO += 4; + AO += temp << 2; + BO += temp; + CO += 4; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (4, 1) + REFRESH_AFTER_SAVE (4, 1) #endif - } - if (m & 2) - { - FLOAT *BO; + } + if (m & 2) { + FLOAT *BO; #if defined(TRMMKERNEL) - REFRESH_POINTERS (2, 1) + REFRESH_POINTERS (2, 1) #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; - t += rowA * rowB; - } - t = t * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + for (l = 0; l < temp; l++) { + v4sf_t rowB = { BO[l], BO[l] }; + v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; + t += rowA * rowB; + } + t = t * valpha; +#if defined(TRMMKERNEL) + CO[0] = t[0]; + CO[1] = t[1]; #else - CO[0] += t[0]; - CO[1] += t[1]; + CO[0] += t[0]; + CO[1] += t[1]; #endif - AO += temp << 1; - BO += temp; - CO += 2; + AO += temp 
<< 1; + BO += temp; + CO += 2; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (2, 1) + REFRESH_AFTER_SAVE (2, 1) #endif - } - if (m & 1) - { - FLOAT *BO; + } + if (m & 1) { + FLOAT *BO; #if defined(TRMMKERNEL) - REFRESH_POINTERS (1, 1) + REFRESH_POINTERS (1, 1) #else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - FLOAT t = 0; - for (l = 0; l < temp; l++) - { - t += AO[l] * BO[l]; - } - AO += temp; - BO += temp; -#if defined(TRMMKERNEL) - CO[0] = t * alpha; + BO = B; + temp = k; +#endif + BLASLONG l = 0; + FLOAT t = 0; + for (l = 0; l < temp; l++) { + t += AO[l] * BO[l]; + } + AO += temp; + BO += temp; +#if defined(TRMMKERNEL) + CO[0] = t * alpha; #else - CO[0] += t * alpha; + CO[0] += t * alpha; #endif - CO += 1; + CO += 1; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (1, 1) + REFRESH_AFTER_SAVE (1, 1) #endif - } + } #if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; // number of values in A + off += 1; #endif - B += k; + B += k; } - return 0; + return 0; } + +// Made with Bob diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c index 73f6d5b994..59bee5fe5d 100644 --- a/kernel/power/dgemm_small_kernel_nn_power10.c +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -314,8 +314,8 @@ typedef __vector unsigned char vec_t; *((__vector_pair *)(void *)(packB+(k*8)+4+offset)) = pb1; #define LOAD_PACKED_B(pb0, pb1, offset) \ - pb0 = *((__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ - pb1 = *((__vector_pair *)((void *)(packB+(k*8)+4+offset))); + pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+0+offset))); \ + pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)(packB+(k*8)+4+offset))); #ifdef B0 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, IFLOAT * A, BLASLONG lda, FLOAT alpha, IFLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc) diff --git a/kernel/power/dgemm_small_kernel_nt_power10.c b/kernel/power/dgemm_small_kernel_nt_power10.c index 7cc8c9f6c6..009f2dd6cd 100644 --- 
a/kernel/power/dgemm_small_kernel_nt_power10.c +++ b/kernel/power/dgemm_small_kernel_nt_power10.c @@ -144,11 +144,11 @@ typedef __vector unsigned char vec_t; #define LOAD_A_1x1(K, M) ra0 = vec_splats(A[K*lda+M]); #define LOAD_BP_1x8(K, N) \ - pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ - pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4])); #define LOAD_BP_1x4(K, N) \ - pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); #define LOAD_BP_1x2(K, N) \ t0 = vec_xl(0, B+(K*ldb)+N); \ diff --git a/kernel/power/dgemm_small_kernel_tt_power10.c b/kernel/power/dgemm_small_kernel_tt_power10.c index b47b6201f4..62e3b8b678 100644 --- a/kernel/power/dgemm_small_kernel_tt_power10.c +++ b/kernel/power/dgemm_small_kernel_tt_power10.c @@ -207,11 +207,11 @@ typedef __vector unsigned char vec_t; #define LOAD_A_1x1(M, K) ra0 = vec_splats(A[(M)*lda+K]); #define LOAD_BP_1x8(K, N) \ - pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ - pb1 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+4])); + pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); \ + pb1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+4])); #define LOAD_BP_1x4(K, N) \ - pb0 = *((__vector_pair *)((void *)&B[((K)*ldb)+N+0])); + pb0 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&B[((K)*ldb)+N+0])); #define LOAD_BP_1x2(K, N) \ t0 = vec_xl(0, B+((K)*ldb)+N); \ diff --git a/kernel/power/dgemv_t_power10.c b/kernel/power/dgemv_t_power10.c index b2aff60887..9aaeec902f 100644 --- a/kernel/power/dgemv_t_power10.c +++ b/kernel/power/dgemv_t_power10.c @@ -61,37 +61,37 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA a6 = a5 + lda; a7 = a6 + lda; for (i = 0; i < n/2; i += 2) { - vp = *((__vector_pair *)((void 
*)&a0[i*2])); - vx = *((__vector_pair *)((void *)&x[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a0[i*2])); + vx = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&x[i*2])); __builtin_vsx_disassemble_pair (res, &vx); __builtin_vsx_disassemble_pair (res1, &vp); temp0 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp0); temp0 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp0); - vp = *((__vector_pair *)((void *)&a1[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a1[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp1 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp1); temp1 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp1); - vp = *((__vector_pair *)((void *)&a2[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a2[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp2 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp2); temp2 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp2); - vp = *((__vector_pair *)((void *)&a3[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a3[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp3 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp3); temp3 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp3); - vp = *((__vector_pair *)((void *)&a4[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a4[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp4 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp4); temp4 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp4); - vp = *((__vector_pair *)((void *)&a5[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a5[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp5 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp5); temp5 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp5); - vp 
= *((__vector_pair *)((void *)&a6[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a6[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp6 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp6); temp6 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp6); - vp = *((__vector_pair *)((void *)&a7[i*2])); + vp = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&a7[i*2])); __builtin_vsx_disassemble_pair (res1, &vp); temp7 = vec_madd ((__vector double)res[0], (__vector double)res1[0], temp7); temp7 = vec_madd ((__vector double)res[1], (__vector double)res1[1], temp7); diff --git a/kernel/power/gemm_common.c b/kernel/power/gemm_common.c index 0aaeddf0b6..4f3a769fa2 100644 --- a/kernel/power/gemm_common.c +++ b/kernel/power/gemm_common.c @@ -46,11 +46,11 @@ FORCEINLINE void vec_load_pair(vec_f32 *dst, vec_f32 *src) { #ifdef USE_VECTOR_PAIRS __vector_pair vy0p; -#ifdef __clang__ +//#ifdef __clang__ vy0p = __builtin_vsx_lxvp(0L, (const __vector_pair *)(src)); -#else - vy0p = *(__vector_pair *)((void *)src); -#endif +//#else +// vy0p = *(__vector_pair *)((void *)src); +//#endif __builtin_vsx_disassemble_pair((void *)(dst), &vy0p); #else dst[0] = src[0]; diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c index e4e609067c..8091418683 100644 --- a/kernel/power/zgemm_kernel_power10.c +++ b/kernel/power/zgemm_kernel_power10.c @@ -316,10 +316,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void 
*)&AO[(l<<4)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); @@ -406,10 +406,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~1)); l+=2) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+12])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; @@ -425,8 +425,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~1)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); @@ -454,10 +454,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~3)); l+=4) { - __vector_pair rowA1 = *((__vector_pair 
*)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+12])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4]; @@ -477,7 +477,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~3)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); @@ -503,10 +503,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~3)); l+=4) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+6])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; vec_t rowB3 = 
*(vec_t *) & BO[(l<<2)+4]; @@ -526,7 +526,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~3)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1])); vec_t rowB1 = *(vec_t *) & BO[l<<2]; vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); @@ -564,14 +564,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~1)); l+=2) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+16])); + __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+20])); + __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+24])); + __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+28])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); @@ 
-585,10 +585,10 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~1)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<4])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<4)+12])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); @@ -615,14 +615,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~3)); l+=4) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+12])); + __vector_pair rowA5 = 
__builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+16])); + __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+20])); + __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+24])); + __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+28])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; @@ -638,8 +638,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~3)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<3])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<3)+4])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1); @@ -662,14 +662,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~7)); l+=8) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+4])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void 
*)&AO[(l<<2)+8])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+12])); + __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+16])); + __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+20])); + __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+24])); + __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<2)+28])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; @@ -689,7 +689,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~7)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<2])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); } @@ -713,14 +713,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * SET_ACC_ZERO() for (l = 0; l < (temp & (~7)); l+=8) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); - __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2])); - __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4])); - __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6])); - __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8])); - __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10])); - __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12])); - __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA2 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+2])); + __vector_pair rowA3 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+4])); + __vector_pair rowA4 = __builtin_vsx_lxvp(0, (__vector_pair 
*)((void *)&AO[(l<<1)+6])); + __vector_pair rowA5 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+8])); + __vector_pair rowA6 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+10])); + __vector_pair rowA7 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+12])); + __vector_pair rowA8 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[(l<<1)+14])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2]; vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4]; @@ -740,7 +740,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, FLOAT * } for (l = (temp & (~7)); l < temp; ++l) { - __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1])); + __vector_pair rowA1 = __builtin_vsx_lxvp(0, (__vector_pair *)((void *)&AO[l<<1])); vec_t rowB1 = *(vec_t *) & BO[l<<1]; __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1); }