From cc0ba35b51d7642d485bedb4cc1ca4f2b9ae27f1 Mon Sep 17 00:00:00 2001 From: Your Name Date: Tue, 2 Jun 2026 19:08:19 -0400 Subject: [PATCH 1/4] test: cover fp16 bf16 WMMA outputs --- ci/regression.sh.in | 6 ++++++ tests/regression/sgemm_tcu/main.cpp | 16 +++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 8be9598723..908e36d5e3 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -492,11 +492,17 @@ tensor() CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_BHF" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu + make -C tests/regression/sgemm_tcu clean && CONFIGS="-DNUM_THREADS=8 -DITYPE=fp16 -DOTYPE=fp16" make -C tests/regression/sgemm_tcu + CONFIGS="-DNUM_THREADS=8 -DEXT_TCU_ENABLE -DTCU_BHF" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu + make -C tests/regression/sgemm_tcu clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=bf16 -DOTYPE=fp32" make -C tests/regression/sgemm_tcu CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_BHF" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu + make -C tests/regression/sgemm_tcu clean && CONFIGS="-DNUM_THREADS=16 -DITYPE=bf16 -DOTYPE=bf16" make -C tests/regression/sgemm_tcu + CONFIGS="-DNUM_THREADS=16 -DEXT_TCU_ENABLE -DTCU_BHF" ./ci/blackbox.sh --driver=rtlsim --app=sgemm_tcu + echo "tensor tests done!" } diff --git a/tests/regression/sgemm_tcu/main.cpp b/tests/regression/sgemm_tcu/main.cpp index 132e4127d0..03a8a16aad 100644 --- a/tests/regression/sgemm_tcu/main.cpp +++ b/tests/regression/sgemm_tcu/main.cpp @@ -199,6 +199,10 @@ class Comparator { } }; +static int32_t ulp_distance16(uint16_t a, uint16_t b) { + return std::abs(static_cast(a) - static_cast(b)); +} + template <> class Comparator { public: @@ -207,9 +211,10 @@ class Comparator { return rv_ftoh_s(bit_cast(fvalue), 0, nullptr); } static bool compare(uint16_t a, uint16_t b, int index, int errors) { - if (a != b) { + auto d = ulp_distance16(a, b); + if (d > FLOAT_ULP) { if (errors < MAX_ERRORS) { - printf("*** error: [%d] expected=0x%x, actual=0x%x\n", index, b, a); + printf("*** error: [%d] expected=0x%x, actual=0x%x, ulp=%d\n", index, b, a, d); } return false; } @@ -225,9 +230,10 @@ class Comparator { return rv_ftob_s(bit_cast(fvalue), 0, nullptr); } static bool compare(uint16_t a, uint16_t b, int index, int errors) { - if (a != b) { + auto d = ulp_distance16(a, b); + if (d > FLOAT_ULP) { if (errors < MAX_ERRORS) { - printf("*** error: [%d] expected=0x%x, actual=0x%x\n", index, b, a); + printf("*** error: [%d] expected=0x%x, actual=0x%x, ulp=%d\n", index, b, a, d); } return false; } @@ -686,4 +692,4 @@ int main(int argc, char *argv[]) { std::cout << "PASSED!" << std::endl; return 0; -} \ No newline at end of file +} From a60a013313d5e700046e84a19f99dff72e113aaf Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 3 Jun 2026 02:18:28 -0400 Subject: [PATCH 2/4] hw: fix WMMA fp16 bf16 outputs --- hw/rtl/tcu/VX_tcu_fedp_bhf.sv | 169 +++++++++++++++++++++++++++++++--- 1 file changed, 156 insertions(+), 13 deletions(-) diff --git a/hw/rtl/tcu/VX_tcu_fedp_bhf.sv b/hw/rtl/tcu/VX_tcu_fedp_bhf.sv index d11cfc0f07..7611aee8e1 100644 --- a/hw/rtl/tcu/VX_tcu_fedp_bhf.sv +++ b/hw/rtl/tcu/VX_tcu_fedp_bhf.sv @@ -12,8 +12,9 @@ // limitations under the License. `include "VX_define.vh" +`include "HardFloat_consts.vi" -module VX_tcu_fedp_bhf #( +module VX_tcu_fedp_bhf import VX_tcu_pkg::*; #( parameter LATENCY = 1, parameter N = 1 ) ( @@ -41,8 +42,9 @@ module VX_tcu_fedp_bhf #( localparam FMT_DELAY = FMUL_LATENCY + FRND_LATENCY; localparam C_DELAY = (FMUL_LATENCY + FRND_LATENCY) + 1 + FRED_LATENCY; - `UNUSED_VAR ({fmt_d, c_val}); - +`ifdef XLEN_64 + `UNUSED_VAR (c_val[63:32]); +`endif wire [2:0] frm = '0; // RNE rounding mode wire [TCK-1:0][15:0] a_row16; @@ -120,9 +122,9 @@ module VX_tcu_fedp_bhf #( logic [32:0] mult_result_mux; always_comb begin - case(fmt_s_delayed) - 3'd1: mult_result_mux = mult_result_fp16; - 3'd2: mult_result_mux = mult_result_bf16; + case (fmt_s_delayed) + TCU_FP16_ID: mult_result_mux = mult_result_fp16; + TCU_BF16_ID: mult_result_mux = mult_result_bf16; default: mult_result_mux = 'x; endcase end @@ -173,17 +175,73 @@ module VX_tcu_fedp_bhf #( // Accumulation input C recoding and delay handling - wire [32:0] c_rec, c_delayed; - wire [31:0] result; + wire [16:0] c_fp16_rec, c_bf16_rec; + wire [32:0] c_fp32_rec, c_fp16_to_fp32_rec, c_bf16_to_fp32_rec; + logic [32:0] c_rec; + wire [32:0] c_delayed; fNToRecFN #( .expWidth (8), .sigWidth (24) - ) conv_c ( + ) conv_c_fp32 ( .in (c_val[31:0]), - .out (c_rec) + .out (c_fp32_rec) + ); + + fNToRecFN #( + .expWidth (5), + .sigWidth (11) + ) conv_c_fp16 ( + .in (c_val[15:0]), + .out (c_fp16_rec) + ); + + // Match the BHF fadd/fmul HardFloat tininess policy. + wire control = `flControl_tininessAfterRounding; // IEEE 754-2008 + + recFNToRecFN #( + .inExpWidth (5), + .inSigWidth (11), + .outExpWidth (8), + .outSigWidth (24) + ) widen_c_fp16 ( + .control (control), + .in (c_fp16_rec), + .roundingMode (frm), + .out (c_fp16_to_fp32_rec), + `UNUSED_PIN (exceptionFlags) + ); + + fNToRecFN #( + .expWidth (8), + .sigWidth (8) + ) conv_c_bf16 ( + .in (c_val[15:0]), + .out (c_bf16_rec) ); + recFNToRecFN #( + .inExpWidth (8), + .inSigWidth (8), + .outExpWidth (8), + .outSigWidth (24) + ) widen_c_bf16 ( + .control (control), + .in (c_bf16_rec), + .roundingMode (frm), + .out (c_bf16_to_fp32_rec), + `UNUSED_PIN (exceptionFlags) + ); + + always_comb begin + case (fmt_d) + TCU_FP32_ID: c_rec = c_fp32_rec; + TCU_FP16_ID: c_rec = c_fp16_to_fp32_rec; + TCU_BF16_ID: c_rec = c_bf16_to_fp32_rec; + default: c_rec = 'x; + endcase + end + VX_pipe_register #( .DATAW (33), .DEPTH (C_DELAY) @@ -195,12 +253,28 @@ module VX_tcu_fedp_bhf #( .data_out(c_delayed) ); + wire [2:0] fmt_d_delayed; + + VX_pipe_register #( + .DATAW (3), + .DEPTH (TOTAL_LATENCY) + ) pipe_fmt_d ( + .clk (clk), + .reset (reset), + .enable (enable), + .data_in (fmt_d), + .data_out(fmt_d_delayed) + ); + // Final accumulation + + wire [32:0] result_rec; + VX_tcu_bhf_fadd #( .IN_EXPW (8), .IN_SIGW (23+1), .IN_REC (1), // input in recoded format - .OUT_REC (0), // output in IEEE format + .OUT_REC (1), // output in recoded format .ADD_LATENCY (FADD_LATENCY), .RND_LATENCY (FRND_LATENCY) ) final_add ( @@ -210,10 +284,79 @@ module VX_tcu_fedp_bhf #( .frm (frm), .a (red_in[LEVELS][0]), .b (c_delayed), - .y (result), + .y (result_rec), `UNUSED_PIN(fflags) ); - assign d_val = `XLEN'(result); + wire [31:0] result_fp32; + wire [16:0] result_fp16_rec, result_bf16_rec; + wire [15:0] result_fp16, result_bf16; + + recFNToFN #( + .expWidth (8), + .sigWidth (24) + ) to_fp32 ( + .in (result_rec), + .out (result_fp32) + ); + + recFNToRecFN #( + .inExpWidth (8), + .inSigWidth (24), + .outExpWidth (5), + .outSigWidth (11) + ) narrow_result_fp16 ( + .control (control), + .in (result_rec), + .roundingMode (frm), + .out (result_fp16_rec), + `UNUSED_PIN (exceptionFlags) + ); + + recFNToFN #( + .expWidth (5), + .sigWidth (11) + ) to_fp16 ( + .in (result_fp16_rec), + .out (result_fp16) + ); + + recFNToRecFN #( + .inExpWidth (8), + .inSigWidth (24), + .outExpWidth (8), + .outSigWidth (8) + ) narrow_result_bf16 ( + .control (control), + .in (result_rec), + .roundingMode (frm), + .out (result_bf16_rec), + `UNUSED_PIN (exceptionFlags) + ); + + recFNToFN #( + .expWidth (8), + .sigWidth (8) + ) to_bf16 ( + .in (result_bf16_rec), + .out (result_bf16) + ); + + logic [31:0] result; + + always_comb begin + case (fmt_d_delayed) + TCU_FP32_ID: result = result_fp32; + TCU_FP16_ID: result = {16'b0, result_fp16}; + TCU_BF16_ID: result = {16'b0, result_bf16}; + default: result = 'x; + endcase + end + +`ifdef XLEN_64 + assign d_val = {32'hffffffff, result}; +`else + assign d_val = result; +`endif endmodule From a543094211641724f606c3bf4a5122890ee2c14c Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 3 Jun 2026 01:52:23 -0400 Subject: [PATCH 3/4] ci: add TCU BHF synthesis coverage --- ci/regression.sh.in | 3 +++ hw/rtl/tcu/VX_tcu_top.sv | 2 +- hw/syn/yosys/Makefile | 13 +++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 908e36d5e3..74b51213db 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -452,6 +452,9 @@ synthesis() PREFIX=build_base make -C hw/syn/yosys clean PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis + PREFIX=build_tcu_bhf make -C hw/syn/yosys clean + PREFIX=build_tcu_bhf CONFIGS="-DEXT_TCU_ENABLE -DTCU_BHF -DNUM_WARPS=2 -DNUM_THREADS=8" make -C hw/syn/yosys synthesis + echo "synthesis tests done!" } diff --git a/hw/rtl/tcu/VX_tcu_top.sv b/hw/rtl/tcu/VX_tcu_top.sv index 6d3aff0587..c379be4806 100644 --- a/hw/rtl/tcu/VX_tcu_top.sv +++ b/hw/rtl/tcu/VX_tcu_top.sv @@ -33,7 +33,7 @@ module VX_tcu_top import VX_gpu_pkg::*, VX_tcu_pkg::*; #( ); VX_execute_if #( .data_t (tcu_exe_t) - ) VX_execute_if(); + ) execute_if(); VX_result_if #( .data_t (tcu_res_t) diff --git a/hw/syn/yosys/Makefile b/hw/syn/yosys/Makefile index 6935d6dc61..bad108aaeb 100644 --- a/hw/syn/yosys/Makefile +++ b/hw/syn/yosys/Makefile @@ -50,6 +50,19 @@ endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache RTL_INCLUDE += $(FPU_INCLUDE) +# Add TCU extension sources +ifneq (,$(findstring -DEXT_TCU_ENABLE, $(CONFIGS))) + RTL_INCLUDE += -I$(RTL_DIR)/tcu +ifneq (,$(findstring -DTCU_DRL, $(CONFIGS))) + RTL_INCLUDE += -I$(RTL_DIR)/tcu/drl +endif +ifneq (,$(findstring -DTCU_BHF, $(CONFIGS))) + RTL_INCLUDE += -I$(RTL_DIR)/tcu/bhf +endif + RTL_INCLUDE += -J$(THIRD_PARTY_DIR)/hardfloat/source/RISCV + RTL_INCLUDE += -I$(THIRD_PARTY_DIR)/hardfloat/source +endif + # Debugging ifdef DEBUG CFLAGS += $(DBG_TRACE_FLAGS) From ce6d361699e733c664fdda0bcda4cb8d58b84cbd Mon Sep 17 00:00:00 2001 From: Your Name Date: Wed, 3 Jun 2026 03:38:55 -0400 Subject: [PATCH 4/4] ci: fix TCU BHF synthesis coverage --- ci/regression.sh.in | 2 +- hw/rtl/tcu/VX_tcu_uops.sv | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 74b51213db..49acaf9000 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -453,7 +453,7 @@ synthesis() PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis PREFIX=build_tcu_bhf make -C hw/syn/yosys clean - PREFIX=build_tcu_bhf CONFIGS="-DEXT_TCU_ENABLE -DTCU_BHF -DNUM_WARPS=2 -DNUM_THREADS=8" make -C hw/syn/yosys synthesis + PREFIX=build_tcu_bhf CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DEXT_TCU_ENABLE -DTCU_BHF -DNUM_WARPS=2 -DNUM_THREADS=8" make -C hw/syn/yosys synthesis echo "synthesis tests done!" } diff --git a/hw/rtl/tcu/VX_tcu_uops.sv b/hw/rtl/tcu/VX_tcu_uops.sv index 101c3d8495..bb0d7fcb90 100644 --- a/hw/rtl/tcu/VX_tcu_uops.sv +++ b/hw/rtl/tcu/VX_tcu_uops.sv @@ -84,6 +84,7 @@ module VX_tcu_uops import assign ibuf_out.PC = ibuf_in.PC; assign ibuf_out.ex_type = ibuf_in.ex_type; assign ibuf_out.op_type = ibuf_in.op_type; + assign ibuf_out.op_args.tcu.__padding = '0; assign ibuf_out.op_args.tcu.fmt_s = ibuf_in.op_args.tcu.fmt_s; assign ibuf_out.op_args.tcu.fmt_d = ibuf_in.op_args.tcu.fmt_d; assign ibuf_out.op_args.tcu.step_m = 4'(m_index); @@ -99,6 +100,7 @@ module VX_tcu_uops import `UNUSED_VAR (ibuf_in.rs1) `UNUSED_VAR (ibuf_in.rs2) `UNUSED_VAR (ibuf_in.rs3) + `UNUSED_VAR (ibuf_in.op_args.tcu.__padding) reg busy;