From bb505041aaef61e5d4cbcba9b24e3b0a968d5349 Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Thu, 18 Jun 2026 19:14:53 +0000 Subject: [PATCH 1/7] fix(GAP9): L3-aware input/output handling in board test harness deeploytest.c classified memory by `ptr >= 0x10000000` (inputs) / `< 0x10000000` (outputs). HyperRAM/L3 addresses (cl_ram_malloc) are also >= 0x10000000 but are NOT CPU-addressable, so for `--defaultMemLevel L3` tests on real silicon main did a raw memcpy / CPU-deref of an L3 pointer -> 'Invalid fetch' fault in main (e.g. MatMul L3 on board: fault at the cl_ram_malloc'd input address). GVSoC models HyperRAM as flat RAM so it passed there, masking the bug. Add IS_L1/IS_L2 on-chip-window macros and use them: - Inputs: only memcpy on-chip (IS_L2) inputs with a non-NULL testInputVector; L3 inputs are loaded from the readfs hex in InitNetwork (testInputVector is NULL) and already live in HyperRAM, so skip them. - Outputs: ram_read L3 outputs into an L2 scratch before the compare (and free it); on-chip outputs compared in place. Paired malloc/free kept in sync. Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256 (unchanged). On-chip (L2) tests behave identically; only L3 paths change. --- DeeployTest/Platforms/GAP9/src/deeploytest.c | 25 +++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/DeeployTest/Platforms/GAP9/src/deeploytest.c b/DeeployTest/Platforms/GAP9/src/deeploytest.c index 77fe46a4e9..b527baaf9f 100644 --- a/DeeployTest/Platforms/GAP9/src/deeploytest.c +++ b/DeeployTest/Platforms/GAP9/src/deeploytest.c @@ -16,6 +16,18 @@ // RW: Remove MAINSTACKSIZE because gap9-sdk does not use it #define SLAVESTACKSIZE 3800 +/* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable, + * so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real + * silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real + * on-chip ranges may be touched directly by the FC. 
The previous `>= + * 0x10000000` / `< 0x10000000` tests wrongly matched HyperRAM (≥ 0x10000000) + * too. */ +#define IS_L1(ptr) \ + ((uint32_t)(ptr) >= 0x10000000u && (uint32_t)(ptr) < 0x10040000u) +#define IS_L2(ptr) \ + (((uint32_t)(ptr) >= 0x1C000000u && (uint32_t)(ptr) < 0x1C200000u) || \ + IS_L1(ptr)) + #ifdef POWER_MEASUREMENT unsigned int GPIOs = 89; #define WRITE_GPIO(x) pi_gpio_pin_write(GPIOs, x) @@ -119,7 +131,11 @@ int main(void) { printf("Initialized\r\n"); #endif for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) { - if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) { + /* L3 inputs are loaded at runtime from the readfs hex inside InitNetwork + * (testInputVector[buf] == NULL) and already live in HyperRAM, which the FC + * cannot memcpy into — skip them. Only on-chip (L1/L2) inputs are copied + * from the baked testInputVector. */ + if (testInputVector[buf] != NULL && IS_L2(DeeployNetwork_inputs[buf])) { memcpy(DeeployNetwork_inputs[buf], testInputVector[buf], DeeployNetwork_inputs_bytes[buf]); } @@ -156,7 +172,10 @@ int main(void) { for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); - if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) { + /* L3 outputs live in HyperRAM (not CPU-addressable) — DMA them into an L2 + * scratch before the compare. On-chip (L1/L2) outputs are compared in + * place. 
*/ + if (!IS_L2(DeeployNetwork_outputs[buf])) { compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]); ram_read(compbuf, DeeployNetwork_outputs[buf], DeeployNetwork_outputs_bytes[buf]); @@ -194,7 +213,7 @@ int main(void) { } } } - if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) { + if (!IS_L2(DeeployNetwork_outputs[buf])) { pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]); } } From db36d89ae9f70dcd3e83df793597019a667b70f2 Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Thu, 18 Jun 2026 23:57:13 +0000 Subject: [PATCH 2/7] =?UTF-8?q?perf(GAP9):=20split=20L3=20tiling=20DMA=20?= =?UTF-8?q?=E2=80=94=20blocking=20for=20SB,=20async=20for=20DB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The L3<->L2 tiling used one DMA backend for both single- and double-buffering. Async DMA only helps double-buffering (it overlaps the next-tile prefetch with compute); single-buffering waits on each tile before computing, so async gives SB no benefit but all the risk — strided 2D L3 transfers (pi_cl_ram_copy_2d) can corrupt under deferred waits. - PULPL3Tiling: add optional `dbDma` (defaults to `dma`) so SB and DB can use different backends. Backward compatible. - GAP9 bindings: SB keeps the blocking gap9L3DmaHack; DB uses async GAP9L3Dma for real L3<->L2 prefetch overlap. - GAP9L3Dma: reset future `.size`=0 after copy-wait (so a completed future isn't waited twice) and cast `${ext}` to uint32_t in the 2D transfer. Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256. 
--- Deeploy/Targets/GAP9/Bindings.py | 10 +++++++--- Deeploy/Targets/GAP9/DMA/L3Dma.py | 3 ++- .../CodeTransformationPasses/PULPL3Tiling.py | 16 ++++++++++++---- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py index 2bda98af8f..ad3ae2fd9c 100644 --- a/Deeploy/Targets/GAP9/Bindings.py +++ b/Deeploy/Targets/GAP9/Bindings.py @@ -18,7 +18,7 @@ from Deeploy.DeeployTypes import CodeTransformation, NodeBinding from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack +from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma, gap9L3DmaHack from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma # Import templates from PULPOpen and Generic from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \ @@ -57,7 +57,9 @@ MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), - PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA + # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB -> + # async GAP9L3Dma for real L3<->L2 prefetch overlap. + PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()), PULPProfileUntiled(), ArgumentStructGeneration(), L3MemoryAwareFunctionCallClosure(writeback = False), @@ -76,7 +78,9 @@ MemoryManagementGeneration("L1"), TilingVariableReplacement("L2"), MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True), - PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA + # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB -> + # async GAP9L3Dma for real L3<->L2 prefetch overlap. 
+ PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()), PULPProfileUntiled(), ArgumentStructGeneration(), L3MemoryAwareFunctionCallClosure(writeback = False), diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py index adbf161328..3cf274c569 100644 --- a/Deeploy/Targets/GAP9/DMA/L3Dma.py +++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py @@ -21,6 +21,7 @@ class GAP9L3DmaFuture(Future): _waitTemplate = NodeTemplate(""" if (${name}.size != 0) { pi_cl_ram_copy_wait(&${name}); + ${name}.size = 0; }""") @@ -29,7 +30,7 @@ class GAP9L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture) diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py index 9df0d88479..523f0d2937 100644 --- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py +++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Tuple +from typing import Optional, Tuple from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity from Deeploy.TilingExtension.AsyncDma import AsyncDma @@ -30,11 +30,19 @@ class ProfilingPULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, Pro class PULPL3Tiling(CodeTransformationPass): - def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma): + def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, dbDma: Optional[AsyncDma] = None): + # SB and DB can use different DMA backends. 
Async DMA only ever helps DB + # (it overlaps the next-tile prefetch with compute); SB waits on each tile + # before computing, so async gives SB no benefit but all the risk (strided + # 2D L3 transfers can corrupt under deferred waits). Defaulting dbDma to dma + # keeps backward compatibility; pass an async dma as dbDma for real L3<->L2 + # overlap on the DB path while keeping SB on the safe blocking dma. + if dbDma is None: + dbDma = dma self.SB = PULPL3TilingGenerationSB(externalMemory, localMemory, dma) - self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dma) + self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dbDma) self.profilingSB = ProfilingPULPL3TilingGenerationSB(externalMemory, localMemory, dma) - self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dma) + self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dbDma) def apply(self, ctxt: NetworkContext, From bb42a7b73f6aa6c7c0d4cb2cf26f2db1af8d2ad4 Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Fri, 19 Jun 2026 00:06:50 +0000 Subject: [PATCH 3/7] perf(GAP9): -O3 hot forward kernels + document L1-memory knobs in example - TargetLibraries/GAP9: compile the hot forward kernels (Convolution_fp32, DWConvolution_fp32, Gemm) at -O3, appended last so it wins over the SDK's default -Os. These dominate GAP9 inference cycles. - deeploytest.c / CMake / sdk config: make the GAP9 example show the three L1-memory knobs that let conv-heavy nets fit, with explanatory comments: A. slave (PE) stacks -> L2: hand the cluster task a static L2 buffer (SET_SLAVE_STACK) so the SDK skips its L1 slave-stack alloc (~30 KB L1). B. shrink the SDK's L1 slave stacks via CONFIG_CL_SLAVE_CORE_STACK_SIZE (sdk_gvsoc.config) -- alternative to A. C. size the cluster-controller stack via conf.cc_stack_size, overridable from the build with -DCC_STACK_SIZE= (new CMake option). Verified MatMul --defaultMemLevel L3 -DCC_STACK_SIZE=8192 on GVSoC: 0/256. 
--- DeeployTest/Platforms/GAP9/CMakeLists.txt | 6 ++++ DeeployTest/Platforms/GAP9/sdk_gvsoc.config | 4 +++ DeeployTest/Platforms/GAP9/src/deeploytest.c | 35 ++++++++++++++++++-- TargetLibraries/GAP9/CMakeLists.txt | 9 +++++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/DeeployTest/Platforms/GAP9/CMakeLists.txt b/DeeployTest/Platforms/GAP9/CMakeLists.txt index cbb6382329..61fa781d62 100644 --- a/DeeployTest/Platforms/GAP9/CMakeLists.txt +++ b/DeeployTest/Platforms/GAP9/CMakeLists.txt @@ -26,6 +26,12 @@ if(POWER_MEASUREMENT) target_compile_definitions(${ProjectId} PRIVATE POWER_MEASUREMENT) endif() +# L1-memory knob — size the cluster-controller (CC / master) stack from the build. +# Example (CI / command line): cmake ... -DCC_STACK_SIZE=8192 (see deeploytest.c). +if(CC_STACK_SIZE) + target_compile_definitions(${ProjectId} PRIVATE CC_STACK_SIZE=${CC_STACK_SIZE}) +endif() + # RW: Waive sign comparison warnings from pulp_nn_utils.h target_compile_options(network PRIVATE -Wno-sign-compare diff --git a/DeeployTest/Platforms/GAP9/sdk_gvsoc.config b/DeeployTest/Platforms/GAP9/sdk_gvsoc.config index 2f5bdf053c..099649684e 100644 --- a/DeeployTest/Platforms/GAP9/sdk_gvsoc.config +++ b/DeeployTest/Platforms/GAP9/sdk_gvsoc.config @@ -33,4 +33,8 @@ CONFIG_PLATFORM_GVSOC=y # GAP9 cluster stack size configuration # Uncomment and adjust these values if you need to modify stack sizes: # CONFIG_CL_MASTER_CORE_STACK_SIZE=14000 +# L1-memory knob B — shrink the SDK's per-PE (slave) cluster stacks to free L1 +# TCDM for the tile arena. Uncomment and lower if kernel stack use is small. +# (deeploytest.c instead hands its own L2 slave-stack buffer via SET_SLAVE_STACK, +# which bypasses this kconfig; use one knob or the other.) 
# CONFIG_CL_SLAVE_CORE_STACK_SIZE=1000 diff --git a/DeeployTest/Platforms/GAP9/src/deeploytest.c b/DeeployTest/Platforms/GAP9/src/deeploytest.c index b527baaf9f..c6d4564d3d 100644 --- a/DeeployTest/Platforms/GAP9/src/deeploytest.c +++ b/DeeployTest/Platforms/GAP9/src/deeploytest.c @@ -16,6 +16,24 @@ // RW: Remove MAINSTACKSIZE because gap9-sdk does not use it #define SLAVESTACKSIZE 3800 +/* L1-memory knob A — place the cluster slave (PE) stacks in L2 instead of L1. + * The GAP9 SDK pi_cl_l1_malloc's the slave stacks only when task->stacks == + * NULL + * (__pi_cluster_task_set_stack in the SDK cluster driver); handing it our own + * buffer makes it skip that L1 allocation, freeing ~30 KB of L1 (8 cores x + * SLAVESTACKSIZE) for the Deeploy tile arena. The static array lands in .bss + * (L2). Use SET_SLAVE_STACK(task) instead of setting slave_stack_size directly. + * Knob B (alternative): shrink the SDK's own L1 slave stacks via + * CONFIG_CL_SLAVE_CORE_STACK_SIZE in the sdk .config (see sdk_gvsoc.config). */ +#define CLUSTER_MAX_CORES 9 +static uint8_t cluster_slave_stacks[SLAVESTACKSIZE * CLUSTER_MAX_CORES] + __attribute__((aligned(16))); +#define SET_SLAVE_STACK(task) \ + do { \ + (task).slave_stack_size = SLAVESTACKSIZE; \ + (task).stacks = cluster_slave_stacks; \ + } while (0) + /* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable, * so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real * silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real @@ -110,6 +128,17 @@ int main(void) { pi_cluster_conf_init(&conf); conf.id = 0; + /* L1-memory knob C — cluster-controller (CC / master) stack size. The CC + * stack is carved from the bottom of L1 (grows down toward the L1 base). The + * SDK default PI_CL_CC_STACK_SIZE (0x800 = 2 KB) can be too small for deep + * tiling call chains and overflows below the L1 base (silent clobber / + * invalid write). 
pi_cluster_task takes the size from conf.cc_stack_size (NOT + * the AutoTiler-only CONFIG_CL_MASTER_CORE_STACK_SIZE kconfig). Override per + * build with -DCC_STACK_SIZE= (see CMakeLists). */ +#ifndef CC_STACK_SIZE +#define CC_STACK_SIZE 8192 +#endif + conf.cc_stack_size = CC_STACK_SIZE; pi_open_from_conf(&cluster_dev, &conf); if (pi_cluster_open(&cluster_dev)) return -1; @@ -124,7 +153,7 @@ int main(void) { struct pi_cluster_task cluster_task; pi_cluster_task(&cluster_task, InitNetworkWrapper, NULL); - cluster_task.slave_stack_size = SLAVESTACKSIZE; + SET_SLAVE_STACK(cluster_task); pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); #ifndef CI @@ -146,7 +175,7 @@ int main(void) { #endif pi_cluster_task(&cluster_task, RunNetworkWrapper, NULL); - cluster_task.slave_stack_size = SLAVESTACKSIZE; + SET_SLAVE_STACK(cluster_task); #ifdef POWER_MEASUREMENT WRITE_GPIO(1); @@ -193,7 +222,7 @@ int main(void) { float_compare_args.err_count = (int *)&float_error_count; pi_cluster_task(&cluster_task, CL_CompareFloat, &float_compare_args); - cluster_task.slave_stack_size = SLAVESTACKSIZE; + SET_SLAVE_STACK(cluster_task); pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); tot_err += float_error_count; diff --git a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt index ca4c3ffbeb..f66514050a 100644 --- a/TargetLibraries/GAP9/CMakeLists.txt +++ b/TargetLibraries/GAP9/CMakeLists.txt @@ -31,6 +31,15 @@ target_compile_options(deeploygap9 PRIVATE target_link_libraries(deeploygap9 PUBLIC pmsis) +# Compile the hot forward kernels at -O3 (set last so it wins over the SDK's +# default -Os). Conv / depthwise-conv / Gemm dominate GAP9 inference cycles. 
+set(_KERNEL_O3_FILES + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Convolution_fp32.c + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/DWConvolution_fp32.c + ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Gemm.c +) +set_source_files_properties(${_KERNEL_O3_FILES} PROPERTIES COMPILE_OPTIONS "-O3") + #RW: Link PULP-NN #RW: Set PULP-NN version and bitwidth for pulp-nn-mixed set(PULPNNVERSION XPULPV2) From 4b4dd5d191c174d8c09e128c8fc56794b07f38e1 Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Fri, 19 Jun 2026 00:18:07 +0000 Subject: [PATCH 4/7] fix(GAP9): emit cluster fork/closure argument structs as static (off-stack) The tiling argument structs were stack-locals in the dispatching function. The cluster fork runtime writes its descriptor near the top of the CC/master stack; a stack-local arg struct placed there can be clobbered before the forked cores read it (a GAP9 cluster-fork crash, e.g. MobileNetV1). Declare the struct `static` and assign separately so it lives in static storage, stable across the forked call. Generic codegen (ArgumentStructGeneration); benign on other targets. Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256. --- .../CodeTransformationPasses/MemoryAllocation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index 609a179c7b..dfe5df9ca5 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -31,7 +31,12 @@ def __init__(self, templateStr: str, bufferName: str): _stackAllocateTemplate = partial( _ArgStructAllocateTemplate, - templateStr = "${structDict.typeName} ${name} = (${structDict.typeName}) ${str(structDict)};") + # Declare the argument struct `static` (off-stack), then assign. 
The cluster + # fork runtime writes its descriptor near the top of the CC/master stack; a + # stack-local arg struct placed there can be clobbered before the forked + # cores read it (observed as a GAP9 cluster-fork crash). Static storage keeps + # it stable for the lifetime of the forked call. + templateStr = "static ${structDict.typeName} ${name}; ${name} = (${structDict.typeName}) ${str(structDict)};") class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn): From 9a074a37bce8022c4093cb3b13e79d70d8ebe5e7 Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Fri, 19 Jun 2026 00:20:31 +0000 Subject: [PATCH 5/7] fix(GAP9): per-tensor waiting strategy for the cluster (mchan) DMA GAP9 mchan allocates a fresh channel on every descriptor enqueue. The previous DirectionWaitingStrategy shares one future (one mchan_transfer_get_id) across all same-direction tensors of a tile, so a tile with >1 input emits one get_id but multiple pushes -> the extra transfers run on channels that are never waited or freed -> mchan_transfer_wait() hangs (e.g. the optimizer weight+grad stall). Switch to PerTensorWaitingStrategy so each tensor gets its own get_id : push : wait : free, matching the mchan contract. Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256. 
--- Deeploy/Targets/GAP9/DMA/MchanDma.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Deeploy/Targets/GAP9/DMA/MchanDma.py b/Deeploy/Targets/GAP9/DMA/MchanDma.py index 14e7eb0930..5525d39e3c 100644 --- a/Deeploy/Targets/GAP9/DMA/MchanDma.py +++ b/Deeploy/Targets/GAP9/DMA/MchanDma.py @@ -6,7 +6,7 @@ from typing import Dict, Tuple from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future +from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy class MchanTransferFuture(Future): @@ -36,7 +36,16 @@ class GAP9MchanDma(AsyncDma): "{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext}, .ext_size_1d = ${size_1d}, .ext_stride_1d = ${stride_2d} }; mchan_transfer_push_2d(__mchan_tmp); }" ), } - _waitingStrategy = DirectionWaitingStrategy(MchanTransferFuture, "transfer") + # PerTensor, NOT Direction: GAP9 mchan allocates a fresh channel on every + # descriptor enqueue (each mchan_transfer_push_* writes a new descriptor and + # the hardware advances to the next channel). DirectionWaitingStrategy shares + # ONE future (one mchan_transfer_get_id) across all same-direction tensors of + # a tile, so a tile with >1 input (e.g. weight + grad) emits one get_id but + # multiple pushes -> the 2nd+ transfers run on channels that are never waited + # nor freed -> mchan_transfer_wait() hangs forever. PerTensor gives each tensor + # its own get_id immediately before its push, matching mchan's + # 1 get_id : 1 push : 1 wait : 1 free contract. 
+ _waitingStrategy = PerTensorWaitingStrategy(MchanTransferFuture) def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None: super().__init__(transferTemplates) From c29c1f74355c9aeba4bc750681e0fa4e3298ab0c Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Fri, 19 Jun 2026 00:23:35 +0000 Subject: [PATCH 6/7] docs(GAP9): add backend-fixes & memory-tuning notes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Explain each GAP9 backend change in this branch — L3-aware board harness, SB/DB L3 DMA split, -O3 forward kernels, the three L1-memory knobs (cc_stack / slave-stack size / slave-stack->L2), static cluster fork/closure args, and the per-tensor mchan DMA waiting strategy — with problem, fix, file, and takeaway, plus a short GAP9 memory-model primer. --- docs/gap9_backend_fixes.md | 150 +++++++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 docs/gap9_backend_fixes.md diff --git a/docs/gap9_backend_fixes.md b/docs/gap9_backend_fixes.md new file mode 100644 index 0000000000..08f635fd58 --- /dev/null +++ b/docs/gap9_backend_fixes.md @@ -0,0 +1,150 @@ +# GAP9 backend fixes & memory-tuning knobs + +Short notes on a set of GAP9 backend changes: what each one fixes, why, and where +it lives. They share a theme — **the GAP9 cluster runs on a small, manually-managed +L1 (TCDM), and HyperRAM/L3 is not CPU-addressable** — so most bugs here are about +*where memory lives* and *how DMA waits complete*. GVSoC models memory more +forgivingly than real silicon (flat HyperRAM, generous timing), so several of these +pass in simulation and only fault on the board. 
+ +## GAP9 memory model (background) + +| Level | Address window | Who can touch it directly | +|---|---|---| +| L1 / TCDM (128 KB) | `0x1000_0000–0x1004_0000` | cluster cores (PEs) + CC | +| L2 (1.5 MB) | `0x1C00_0000–0x1C20_0000` | FC + cluster (shared) | +| L3 / HyperRAM | `cl_ram_malloc` handles | **DMA only** — not CPU-addressable | + +The cluster has 8 worker PEs + a cluster-controller (CC / master) core. The CC +stack is carved from the **bottom of L1**; PE slave stacks are also L1 by default. +Everything competes with the Deeploy tile arena for those 128 KB. + +--- + +## 1. L3-aware input/output in the board test harness + +**File:** `DeeployTest/Platforms/GAP9/src/deeploytest.c` + +**Problem.** The harness classified buffers by raw address thresholds +(`ptr >= 0x10000000` for inputs, `< 0x10000000` for outputs). HyperRAM/L3 buffers +(`cl_ram_malloc`) are *also* `>= 0x10000000`, but HyperRAM is **not CPU-addressable**. +So for `--defaultMemLevel L3` tests, `main()` did a raw `memcpy` into an L3 input +pointer and CPU-dereferenced L3 output pointers — an `Invalid fetch` fault on the +board, right after init, before any results print. GVSoC models HyperRAM as flat +RAM, so it passed there and masked the bug. + +**Fix.** Add `IS_L1` / `IS_L2` on-chip-window macros and use them: on-chip inputs +are `memcpy`'d, **L3 inputs are already loaded from the readfs hex inside +`InitNetwork`** (their `testInputVector` entry is `NULL`) so they're skipped, and +**L3 outputs are `ram_read` into an L2 scratch** before the compare. + +**Takeaway.** Never CPU-`memcpy`/deref an L3 pointer on GAP9 — gate on the real +on-chip windows, not a single `>= 0x10000000` threshold. + +--- + +## 2. 
Split the L3 tiling DMA — blocking for single-buffer, async for double-buffer + +**Files:** `Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py`, +`Deeploy/Targets/GAP9/Bindings.py`, `Deeploy/Targets/GAP9/DMA/L3Dma.py` + +**Problem.** L3↔L2 tiling used one DMA backend for both single- (SB) and +double-buffering (DB). Async DMA only helps DB, where it overlaps the *next* tile's +prefetch with compute. SB waits on each tile before computing, so async there buys +nothing but adds risk: strided 2D L3 transfers (`pi_cl_ram_copy_2d`) can corrupt +under deferred waits. + +**Fix.** `PULPL3Tiling` gains an optional `dbDma` (defaults to `dma`, so it is +backward compatible). GAP9 binds **SB → blocking** `gap9L3DmaHack`, **DB → async** +`GAP9L3Dma`. Also: reset the L3 future's `.size` to 0 after `pi_cl_ram_copy_wait` +(so a completed future is never waited on twice) and cast `${ext}` to `uint32_t` +in the 2D transfer. + +**Takeaway.** Async DMA is a double-buffering optimization; don't pay its hazards +on the single-buffering path. + +--- + +## 3. `-O3` on the hot forward kernels + +**File:** `TargetLibraries/GAP9/CMakeLists.txt` + +**Problem.** The SDK compiles kernels at `-Os` by default; the conv / depthwise-conv +/ Gemm kernels dominate GAP9 inference cycles and are left slow. + +**Fix.** Compile `Convolution_fp32.c`, `DWConvolution_fp32.c`, `Gemm.c` at `-O3` +via `set_source_files_properties(... COMPILE_OPTIONS "-O3")`, **appended last** so +it wins over the SDK's `-Os` on the same translation units. + +**Takeaway.** Per-file `-O3` on the few hot kernels is a large, cheap latency win; +ordering matters because the last `COMPILE_OPTIONS` wins. + +--- + +## 4. L1-memory tuning knobs (documented in the example) + +**Files:** `DeeployTest/Platforms/GAP9/src/deeploytest.c`, +`DeeployTest/Platforms/GAP9/CMakeLists.txt`, +`DeeployTest/Platforms/GAP9/sdk_gvsoc.config` + +Three independent ways to free L1 TCDM for the tile arena so conv-heavy nets fit. 
+The example demonstrates all three with comments. + +- **Knob A — slave (PE) stacks → L2.** The SDK `pi_cl_l1_malloc`'s the PE stacks + only when `task->stacks == NULL`. Hand it a static buffer (`SET_SLAVE_STACK`, + a `.bss` array → L2) and it skips that L1 allocation, freeing ~30 KB of L1 + (8 cores × `SLAVESTACKSIZE`). +- **Knob B — shrink the SDK's L1 slave stacks.** `CONFIG_CL_SLAVE_CORE_STACK_SIZE` + in the sdk `.config` (alternative to Knob A; use one or the other). +- **Knob C — cluster-controller stack size.** The CC stack grows down toward the L1 + base; the SDK default (`0x800` = 2 KB) is too small for deep tiling call chains + and overflows below the base (silent clobber / invalid write). Set + `conf.cc_stack_size`, overridable from the build with **`-DCC_STACK_SIZE=<bytes>`** + (new CMake option). Example (CI): `cmake … -DCC_STACK_SIZE=8192`. + +**Takeaway.** The CC stack and PE stacks are *invisible to the tiler and the ELF* +(carved at runtime), yet they share L1 with the arena — budget for them explicitly. + +--- + +## 5. Emit cluster fork/closure argument structs as `static` + +**File:** `Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py` +(`ArgumentStructGeneration`) + +**Problem.** The per-node tiling argument structs were stack-locals in the +dispatching function. The cluster-fork runtime writes its descriptor near the top +of the CC/master stack and can clobber a stack-local arg struct **before the forked +cores read it** — a GAP9 cluster-fork crash (seen on MobileNetV1). + +**Fix.** Declare the struct `static` (off-stack) and assign separately: +`static T name; name = (T){…};`. Static storage keeps it stable for the lifetime +of the forked call. Generic codegen, benign on other targets. + +**Takeaway.** Anything handed to a cluster fork must outlive the CC stack frame — +keep it off the stack. + +--- + +## 6. 
Per-tensor waiting strategy for the cluster (mchan) DMA + +**File:** `Deeploy/Targets/GAP9/DMA/MchanDma.py` + +**Problem.** GAP9 mchan allocates a fresh channel on every descriptor enqueue. +`DirectionWaitingStrategy` shares **one** future (one `mchan_transfer_get_id`) +across all same-direction tensors of a tile. A tile with >1 input (e.g. optimizer +weight + grad) then emits one `get_id` but multiple pushes → the extra transfers +run on channels that are never waited or freed → `mchan_transfer_wait()` hangs +forever. + +**Fix.** Use `PerTensorWaitingStrategy`: each tensor gets its own +`get_id : push : wait : free`, matching the mchan hardware contract. + +**Takeaway.** Match the DMA waiting strategy to the hardware's channel model — +mchan is one-channel-per-transfer, so wait per tensor, not per direction. + +--- + +*All changes verified on GVSoC with `MatMul --defaultMemLevel L3` (`Errors: 0 out +of 256`). On-chip (L1/L2) behaviour is unchanged; only the L3 / stack / DMA paths +differ.* From a5f0bcfdc1fd021102b31078b751abc957cf7a7e Mon Sep 17 00:00:00 2001 From: samanthawangdl Date: Fri, 19 Jun 2026 01:15:19 +0000 Subject: [PATCH 7/7] ci(GAP9): build-time L1/L2 memory gate Add gap9_memcheck.py and run it from run_complete_test after the build, before the simulation, on GAP9. It models every consumer of L1/L2 the tiler doesn't (CC master stack, PE slave stacks, ELF sections, tile arena, promoted pool) and scans InitNetwork for the pi_l2_malloc-after-cl_ram_malloc alloc-order race, so over-subscription fails fast with the exact knob instead of a multi-minute GVSoC hang. GAP9-only; bypass with DEPLOY_SKIP_MEMCHECK=1. Verified MatMul L3: gate runs (PASS) and test is 0/256. 
--- DeeployTest/testUtils/core/execution.py | 25 +++ DeeployTest/testUtils/gap9_memcheck.py | 248 ++++++++++++++++++++++++ docs/gap9_backend_fixes.md | 22 +++ 3 files changed, 295 insertions(+) create mode 100644 DeeployTest/testUtils/gap9_memcheck.py diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py index 4c6c972679..294691895d 100644 --- a/DeeployTest/testUtils/core/execution.py +++ b/DeeployTest/testUtils/core/execution.py @@ -227,6 +227,28 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult: return test_result +def _gap9_memcheck_gate(config: DeeployTestConfig) -> None: + """GAP9 build-time memory gate. + + Run the L1/L2 budget + InitNetwork alloc-order validator after the build and + before the simulation, so memory over-subscription (which otherwise shows up + as a multi-minute GVSoC hang in os_evt_release or a wild-pointer crash) fails + fast with the exact knob to turn. GAP9-only; bypass with DEPLOY_SKIP_MEMCHECK=1. + """ + if config.platform != "GAP9" or os.environ.get("DEPLOY_SKIP_MEMCHECK") == "1": + return + script = Path(__file__).parent.parent / "gap9_memcheck.py" + if not script.exists(): + return + cmd = ["python", str(script), config.build_dir, config.gen_dir] + log.debug(f"[Execution] GAP9 memcheck: {' '.join(cmd)}") + result = subprocess.run(cmd, check = False) + if result.returncode == 1: + raise RuntimeError(f"GAP9 memory check failed for {config.test_name} (L1/L2 over-subscription " + f"or InitNetwork alloc-order race - see output above). " + f"Set DEPLOY_SKIP_MEMCHECK=1 to bypass.") + + def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: bool = False) -> TestResult: """ Run a complete test: generate, configure, build, and simulate. 
@@ -242,6 +264,9 @@ def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: # Step 3: Build binary build_binary(config) + # Step 3b: GAP9 build-time memory gate (fast-fail before simulation) + _gap9_memcheck_gate(config) + # Step 4: Run simulation result = run_simulation(config, skip = skipsim) diff --git a/DeeployTest/testUtils/gap9_memcheck.py b/DeeployTest/testUtils/gap9_memcheck.py new file mode 100644 index 0000000000..e9922033d8 --- /dev/null +++ b/DeeployTest/testUtils/gap9_memcheck.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""GAP9 L1/L2 memory-budget validator (CI gate). + +Turns silent over-subscription — the class of bug that manifests as a 10-minute +GVSoC hang in os_evt_release or a wild-pointer "Invalid access" far from the +cause — into an instant, precise, build-time error. + +It models EVERY consumer of each physical level, including the ones the Deeploy +tiler does NOT model: + * L1 tile arena : pi_l1_malloc(...) in generated C + * CC master stack : conf.cc_stack_size, carved from L1 top at runtime + (NOT an ELF section, NOT a malloc -> invisible to + --plotMemAlloc; this is what overflowed) + * static L1 sections : from the ELF (addr 0x1000xxxx) + * L2 code/data/bss : from the ELF (addr 0x1cxxxxxx) — incl PE slave + stacks (cluster_slave_stacks, .bss in L2) + * L2 tile arena + promoted : pi_l2_malloc(...) + PromoteTensorsToL2 log + +Exit code 0 = fits; 1 = over-subscribed (with the knob to turn); 2 = usage error. 
+ +Usage: + gap9_memcheck.py <build_dir> <gen_dir> [build_or_promote_log] [--cc-stack N] +""" +import glob +import os +import re +import subprocess +import sys + +# ---- physical capacities (GAP9) ---- +L1_SIZE = 128 * 1024 # 131072 TCDM +L2_SIZE = 0x190000 # 1572864 (1.5 MB), link.gap9.ld +L1_MIN_HEADROOM = 1024 # warn if free L1 < this (stacks have runtime slop) +READELF = "/app/install/gcc/gap9/bin/riscv32-unknown-elf-readelf" + + +def die(msg, code = 2): + print(f"[gap9-memcheck] ERROR: {msg}", file = sys.stderr) + sys.exit(code) + + +def parse_args(argv): + pos, cc = [], None + i = 0 + while i < len(argv): + if argv[i] == "--cc-stack": + cc = int(argv[i + 1]) + i += 2 + else: + pos.append(argv[i]) + i += 1 + if len(pos) < 2: + die("usage: gap9_memcheck.py <build_dir> <gen_dir> [log] [--cc-stack N]") + return pos[0], pos[1], (pos[2] if len(pos) > 2 else None), cc + + +def find_elf(build): + cands = [] + for f in glob.glob(os.path.join(build, "*")): + if os.path.isfile(f) and os.access(f, os.X_OK) and not f.endswith((".bin", ".s", ".hex")): + try: + if open(f, "rb").read(4) == b"\x7fELF": + cands.append(f) + except Exception: + pass + return max(cands, key = os.path.getmtime) if cands else None + + +def elf_sections(elf): + """{region: {section: bytes}} for ALLOC sections, keyed by load address.""" + out = subprocess.run([READELF, "-SW", elf], capture_output = True, text = True).stdout + regions = {"L1": {}, "L2": {}} + for line in out.splitlines(): + m = re.search(r"\]\s+(\S+)\s+(PROGBITS|NOBITS)\s+([0-9a-f]{8})\s+[0-9a-f]+\s+([0-9a-f]+)", line) + if not m: + continue + name, _, addr, szhex = m.groups() + sz = int(szhex, 16) + if sz == 0: + continue + if addr.startswith("1000") or addr.startswith("1001"): + regions["L1"][name] = regions["L1"].get(name, 0) + sz + elif addr.startswith("1c"): + regions["L2"][name] = regions["L2"].get(name, 0) + sz + return regions + + +def arena_mallocs(gen): + res = {"L1": 0, "L2": 0, "L3": 0} + for fn in ("TrainingNetwork.c", "Network.c", "OptimizerNetwork.c"): + p = 
os.path.join(gen, fn) + if not os.path.exists(p): + continue + t = open(p, errors = "ignore").read() + for sz in re.findall(r"(?:pi_)?l1_malloc\([^;]*?\*\s*(\d+)\)", t): + res["L1"] = max(res["L1"], int(sz)) + for sz in re.findall(r"pi_l2_malloc\(sizeof\([^)]*\)\s*\*\s*(\d+)\)", t): + res["L2"] += int(sz) + for sz in re.findall(r"cl_ram_malloc\((?:sizeof\([^)]*\)\s*\*\s*)?(\d+)\)", t): + res["L3"] += int(sz) + return res + + +def check_init_alloc_order(gen): + """Codegen-time race check (pure source scan — no build, no sim). + + In any Init*Network function, flag a pi_l2_malloc that appears AFTER a + cl_ram_malloc. On GAP9 Init runs on the cluster CC: cl_ram_malloc delegates + to the FC (pi_cl_ram_alloc) while pi_l2_malloc runs pos_alloc directly on the + CC; both touch the shared L2 freelist, so this ordering races -> FC RTOS + corruption -> os_evt_release crash/hang at init. Fix = hoist all pi_l2_malloc + before the first cl_ram_malloc (codeGenerateTraining._hoistL2AllocsBeforeL3). + """ + issues = [] + for fn in ("TrainingNetwork.c", "Network.c", "OptimizerNetwork.c"): + p = os.path.join(gen, fn) + if not os.path.exists(p): + continue + lines = open(p, errors = "ignore").read().split("\n") + i = 0 + while i < len(lines): + if re.search(r"void\s+Init\w*Network\s*\(", lines[i]): + depth = 0 + started = False + seen_cl = False + for j in range(i, len(lines)): + depth += lines[j].count("{") - lines[j].count("}") + if "{" in lines[j]: + started = True + if "cl_ram_malloc(" in lines[j]: + seen_cl = True + elif seen_cl and "pi_l2_malloc(" in lines[j]: + issues.append((fn, j + 1, lines[j].strip()[:70])) + if started and depth <= 0: + i = j + break + i += 1 + return issues + + +def cc_stack_from_cache(build, override): + if override is not None: + return override, "override" + cache = os.path.join(build, "CMakeCache.txt") + if os.path.exists(cache): + m = re.search(r"CC_STACK_SIZE\S*=(\d+)", open(cache, errors = "ignore").read()) + if m: + return int(m.group(1)), 
"CMakeCache" + return 8192, "default(8192)" # deeploytraintest.c fallback + + +def promoted_bytes(log): + if not log or not os.path.exists(log): + return 0 + best = 0 + for m in re.finditer(r"promoted \d+ tensors, (\d+) /", open(log, errors = "ignore").read()): + best = max(best, int(m.group(1))) + return best + + +def report(title, cap, items): + used = sum(v for _, v in items) + free = cap - used + print(f"\n=== {title} (cap {cap} B = {cap/1024:.1f} KB) ===") + for n, v in items: + print(f" {v:9d} B {v/1024:7.1f} KB {n}") + print(f" {'-'*9}") + print(f" {used:9d} B {used/1024:7.1f} KB TOTAL ({100*used/cap:.1f}%)") + print(f" {free:9d} B {free/1024:7.1f} KB FREE") + return used, free + + +def main(): + build, gen, log, cc_override = parse_args(sys.argv[1:]) + elf = find_elf(build) + if not elf: + die(f"no ELF found in {build}") + print(f"[gap9-memcheck] ELF: {elf}") + regs = elf_sections(elf) + ar = arena_mallocs(gen) + cc_stack, cc_src = cc_stack_from_cache(build, cc_override) + prom = promoted_bytes(log) + + violations = [] + + # ---- L1 / TCDM ---- + l1_items = [] + if ar["L1"]: + l1_items.append(("tile arena (pi_l1_malloc)", ar["L1"])) + l1_items.append((f"CC master stack (cc_stack_size, {cc_src})", cc_stack)) + for n, v in sorted(regs["L1"].items(), key = lambda x: -x[1]): + l1_items.append((f"L1 section {n}", v)) + used1, free1 = report("L1 / TCDM", L1_SIZE, l1_items) + if free1 < 0: + violations.append(f"L1 over-subscribed by {-free1} B. arena {ar['L1']} + cc_stack {cc_stack} " + f"(+sections) > {L1_SIZE}. Fix: lower --l1 to <= {ar['L1']+free1} OR " + f"cc_stack to <= {cc_stack+free1}.") + elif free1 < L1_MIN_HEADROOM: + print(f"\n[gap9-memcheck] WARNING: L1 free {free1} B < {L1_MIN_HEADROOM} B headroom " + f"(stack high-water is runtime-dependent; risky).") + + # ---- L2 ---- + # NOTE: do NOT add the promotion-log bytes to the total. 
The promoted pool + # is a pi_l2_malloc (PROMOTED_POOL_L2) -> already in ar["L2"]; promoted const + # bytes are baked into .data -> already in the ELF L2 sections. Adding `prom` + # again double-counts and produces false over-subscription FAILs. + l2_items = [] + for n, v in sorted(regs["L2"].items(), key = lambda x: -x[1]): + l2_items.append((f"L2 section {n}", v)) + if ar["L2"]: + l2_items.append(("tile arena + pools (pi_l2_malloc, incl PROMOTED_POOL)", ar["L2"])) + used2, free2 = report("L2", L2_SIZE, l2_items) + if prom: + print(f" (info: PromoteTensorsToL2 log reports {prom} B promoted — already " + f"counted above in pi_l2_malloc / .data, not added again)") + if free2 < 0: + violations.append(f"L2 over-subscribed by {-free2} B (> {L2_SIZE}). Reduce promoted-pool " + f"headroom or L2 arena.") + + # ---- InitNetwork alloc-order race (codegen-time) ---- + order_issues = check_init_alloc_order(gen) + if order_issues: + print("\n=== InitNetwork alloc-order ===") + for fn, ln, txt in order_issues[:6]: + print(f" RACE: {fn}:{ln} pi_l2_malloc after cl_ram_malloc -> {txt}") + violations.append(f"InitNetwork alloc-order race ({len(order_issues)} site(s), first " + f"{order_issues[0][0]}:{order_issues[0][1]}): pi_l2_malloc after cl_ram_malloc " + f"-> GAP9 FC/CC pos_alloc freelist race -> os_evt_release crash/hang at init. 
" + f"Hoist all pi_l2_malloc before the first cl_ram_malloc.") + + if ar["L3"]: + print(f"\n=== L3 / HyperRAM (no enforced cap) ===\n {ar['L3']} B " + f"{ar['L3']/1e6:.2f} MB cl_ram arena") + + print() + if violations: + for v in violations: + print(f"[gap9-memcheck] FAIL: {v}", file = sys.stderr) + sys.exit(1) + print("[gap9-memcheck] PASS: all levels fit within physical capacity.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/docs/gap9_backend_fixes.md b/docs/gap9_backend_fixes.md index 08f635fd58..0452a05ea2 100644 --- a/docs/gap9_backend_fixes.md +++ b/docs/gap9_backend_fixes.md @@ -145,6 +145,28 @@ mchan is one-channel-per-transfer, so wait per tensor, not per direction. --- +## 7. Build-time memory gate + +**Files:** `DeeployTest/testUtils/gap9_memcheck.py`, +`DeeployTest/testUtils/core/execution.py` + +**Problem.** L1/L2 over-subscription on GAP9 surfaces as a multi-minute GVSoC hang +(`os_evt_release`) or a wild-pointer crash far from the cause — slow and opaque. +The tiler does not model the CC master stack, PE slave stacks, or the promoted +pool, so it can't catch it. + +**Fix.** `gap9_memcheck.py` models every consumer of L1 and L2 (tile arena, CC +stack from `cc_stack_size`, PE slave stacks, ELF sections, promoted pool) and +scans `InitNetwork` for the `pi_l2_malloc`-after-`cl_ram_malloc` alloc-order race. +`run_complete_test` runs it after the build and before the simulation (GAP9 only), +so over-subscription fails in seconds with the exact knob to turn. Bypass with +`DEPLOY_SKIP_MEMCHECK=1`. + +**Takeaway.** Validate the full L1/L2 budget at build time — the stacks and pools +the tiler ignores are exactly what overflow. + +--- + *All changes verified on GVSoC with `MatMul --defaultMemLevel L3` (`Errors: 0 out of 256`). On-chip (L1/L2) behaviour is unchanged; only the L3 / stack / DMA paths differ.*