From bb505041aaef61e5d4cbcba9b24e3b0a968d5349 Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Thu, 18 Jun 2026 19:14:53 +0000
Subject: [PATCH 1/7] fix(GAP9): L3-aware input/output handling in board test
 harness

deeploytest.c classified memory by `ptr >= 0x10000000` (inputs) / `< 0x10000000`
(outputs). HyperRAM/L3 addresses (cl_ram_malloc) are also >= 0x10000000 but are
NOT CPU-addressable, so for `--defaultMemLevel L3` tests on real silicon main did
a raw memcpy / CPU-deref of an L3 pointer -> 'Invalid fetch' fault in main (e.g.
MatMul L3 on board: fault at the cl_ram_malloc'd input address). GVSoC models
HyperRAM as flat RAM so it passed there, masking the bug.

Add IS_L1/IS_L2 on-chip-window macros and use them:
- Inputs: only memcpy on-chip (IS_L2) inputs with a non-NULL testInputVector;
  L3 inputs are loaded from the readfs hex in InitNetwork (testInputVector is
  NULL) and already live in HyperRAM, so skip them.
- Outputs: ram_read L3 outputs into an L2 scratch before the compare (and free
  it); on-chip outputs compared in place. Paired malloc/free kept in sync.

Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256 (unchanged). On-chip (L2)
tests behave identically; only L3 paths change.
---
 DeeployTest/Platforms/GAP9/src/deeploytest.c | 25 +++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/DeeployTest/Platforms/GAP9/src/deeploytest.c b/DeeployTest/Platforms/GAP9/src/deeploytest.c
index 77fe46a4e9..b527baaf9f 100644
--- a/DeeployTest/Platforms/GAP9/src/deeploytest.c
+++ b/DeeployTest/Platforms/GAP9/src/deeploytest.c
@@ -16,6 +16,18 @@
 // RW: Remove MAINSTACKSIZE because gap9-sdk does not use it
 #define SLAVESTACKSIZE 3800
 
+/* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable,
+ * so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real
+ * silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real
+ * on-chip ranges may be touched directly by the FC. The previous `>=
+ * 0x10000000` / `< 0x10000000` tests misclassified HyperRAM pointers (also ≥
+ * 0x10000000) as on-chip. */
+#define IS_L1(ptr)                                                             \
+  ((uint32_t)(ptr) >= 0x10000000u && (uint32_t)(ptr) < 0x10040000u)
+#define IS_L2(ptr)                                                             \
+  (((uint32_t)(ptr) >= 0x1C000000u && (uint32_t)(ptr) < 0x1C200000u) ||        \
+   IS_L1(ptr))
+
 #ifdef POWER_MEASUREMENT
 unsigned int GPIOs = 89;
 #define WRITE_GPIO(x) pi_gpio_pin_write(GPIOs, x)
@@ -119,7 +131,11 @@ int main(void) {
   printf("Initialized\r\n");
 #endif
   for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
-    if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) {
+    /* L3 inputs are loaded at runtime from the readfs hex inside InitNetwork
+     * (testInputVector[buf] == NULL) and already live in HyperRAM, which the FC
+     * cannot memcpy into — skip them. Only on-chip (L1/L2) inputs are copied
+     * from the baked testInputVector. */
+    if (testInputVector[buf] != NULL && IS_L2(DeeployNetwork_inputs[buf])) {
       memcpy(DeeployNetwork_inputs[buf], testInputVector[buf],
              DeeployNetwork_inputs_bytes[buf]);
     }
@@ -156,7 +172,10 @@ int main(void) {
   for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
     tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE);
 
-    if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
+    /* L3 outputs live in HyperRAM (not CPU-addressable) — DMA them into an L2
+     * scratch before the compare. On-chip (L1/L2) outputs are compared in
+     * place. */
+    if (!IS_L2(DeeployNetwork_outputs[buf])) {
       compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]);
       ram_read(compbuf, DeeployNetwork_outputs[buf],
                DeeployNetwork_outputs_bytes[buf]);
@@ -194,7 +213,7 @@ int main(void) {
         }
       }
     }
-    if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
+    if (!IS_L2(DeeployNetwork_outputs[buf])) {
       pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]);
     }
   }

From db36d89ae9f70dcd3e83df793597019a667b70f2 Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Thu, 18 Jun 2026 23:57:13 +0000
Subject: [PATCH 2/7] =?UTF-8?q?perf(GAP9):=20split=20L3=20tiling=20DMA=20?=
 =?UTF-8?q?=E2=80=94=20blocking=20for=20SB,=20async=20for=20DB?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The L3<->L2 tiling used one DMA backend for both single- and double-buffering.
Async DMA only helps double-buffering (it overlaps the next-tile prefetch with
compute); single-buffering waits on each tile before computing, so async gives
SB no benefit but all the risk — strided 2D L3 transfers (pi_cl_ram_copy_2d) can
corrupt under deferred waits.

- PULPL3Tiling: add optional `dbDma` (defaults to `dma`) so SB and DB can use
  different backends. Backward compatible.
- GAP9 bindings: SB keeps the blocking gap9L3DmaHack; DB uses async GAP9L3Dma for
  real L3<->L2 prefetch overlap.
- GAP9L3Dma: reset future `.size`=0 after copy-wait (so a completed future isn't
  waited twice) and cast `${ext}` to uint32_t in the 2D transfer.

Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256.
---
 Deeploy/Targets/GAP9/Bindings.py                 | 10 +++++++---
 Deeploy/Targets/GAP9/DMA/L3Dma.py                |  3 ++-
 .../CodeTransformationPasses/PULPL3Tiling.py     | 16 ++++++++++++----
 3 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/Deeploy/Targets/GAP9/Bindings.py b/Deeploy/Targets/GAP9/Bindings.py
index 2bda98af8f..ad3ae2fd9c 100644
--- a/Deeploy/Targets/GAP9/Bindings.py
+++ b/Deeploy/Targets/GAP9/Bindings.py
@@ -18,7 +18,7 @@
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
-from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
+from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma, gap9L3DmaHack
 from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
 # Import templates from PULPOpen and Generic
 from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
@@ -57,7 +57,9 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
+    # async GAP9L3Dma for real L3<->L2 prefetch overlap.
+    PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -76,7 +78,9 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
+    # async GAP9L3Dma for real L3<->L2 prefetch overlap.
+    PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
diff --git a/Deeploy/Targets/GAP9/DMA/L3Dma.py b/Deeploy/Targets/GAP9/DMA/L3Dma.py
index adbf161328..3cf274c569 100644
--- a/Deeploy/Targets/GAP9/DMA/L3Dma.py
+++ b/Deeploy/Targets/GAP9/DMA/L3Dma.py
@@ -21,6 +21,7 @@ class GAP9L3DmaFuture(Future):
     _waitTemplate = NodeTemplate("""
     if (${name}.size != 0) {
         pi_cl_ram_copy_wait(&${name});
+        ${name}.size = 0;
     }""")
 
 
@@ -29,7 +30,7 @@ class GAP9L3Dma(AsyncDma):
     _transferTemplates = {
         2:
             NodeTemplate(
-                "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
+                "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
             )
     }
     _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
diff --git a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py
index 9df0d88479..523f0d2937 100644
--- a/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py
+++ b/Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Tuple
+from typing import Optional, Tuple
 
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 from Deeploy.TilingExtension.AsyncDma import AsyncDma
@@ -30,11 +30,19 @@ class ProfilingPULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, Pro
 
 class PULPL3Tiling(CodeTransformationPass):
 
-    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
+    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, dbDma: Optional[AsyncDma] = None):
+        # SB and DB can use different DMA backends. Async DMA only ever helps DB
+        # (it overlaps the next-tile prefetch with compute); SB waits on each tile
+        # before computing, so async gives SB no benefit but all the risk (strided
+        # 2D L3 transfers can corrupt under deferred waits). Defaulting dbDma to dma
+        # keeps backward compatibility; pass an async dma as dbDma for real L3<->L2
+        # overlap on the DB path while keeping SB on the safe blocking dma.
+        if dbDma is None:
+            dbDma = dma
         self.SB = PULPL3TilingGenerationSB(externalMemory, localMemory, dma)
-        self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dma)
+        self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)
         self.profilingSB = ProfilingPULPL3TilingGenerationSB(externalMemory, localMemory, dma)
-        self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dma)
+        self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)
 
     def apply(self,
               ctxt: NetworkContext,

From bb42a7b73f6aa6c7c0d4cb2cf26f2db1af8d2ad4 Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Fri, 19 Jun 2026 00:06:50 +0000
Subject: [PATCH 3/7] perf(GAP9): -O3 hot forward kernels + document L1-memory
 knobs in example

- TargetLibraries/GAP9: compile the hot forward kernels (Convolution_fp32,
  DWConvolution_fp32, Gemm) at -O3, appended last so it wins over the SDK's
  default -Os. These dominate GAP9 inference cycles.
- deeploytest.c / CMake / sdk config: make the GAP9 example show the three
  L1-memory knobs that let conv-heavy nets fit, with explanatory comments:
    A. slave (PE) stacks -> L2: hand the cluster task a static L2 buffer
       (SET_SLAVE_STACK) so the SDK skips its L1 slave-stack alloc (~30 KB L1).
    B. shrink the SDK's L1 slave stacks via CONFIG_CL_SLAVE_CORE_STACK_SIZE
       (sdk_gvsoc.config) -- alternative to A.
    C. size the cluster-controller stack via conf.cc_stack_size, overridable
       from the build with -DCC_STACK_SIZE=<bytes> (new CMake option).

Verified MatMul --defaultMemLevel L3 -DCC_STACK_SIZE=8192 on GVSoC: 0/256.
---
 DeeployTest/Platforms/GAP9/CMakeLists.txt    |  6 ++++
 DeeployTest/Platforms/GAP9/sdk_gvsoc.config  |  4 +++
 DeeployTest/Platforms/GAP9/src/deeploytest.c | 35 ++++++++++++++++++--
 TargetLibraries/GAP9/CMakeLists.txt          |  9 +++++
 4 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/DeeployTest/Platforms/GAP9/CMakeLists.txt b/DeeployTest/Platforms/GAP9/CMakeLists.txt
index cbb6382329..61fa781d62 100644
--- a/DeeployTest/Platforms/GAP9/CMakeLists.txt
+++ b/DeeployTest/Platforms/GAP9/CMakeLists.txt
@@ -26,6 +26,12 @@ if(POWER_MEASUREMENT)
   target_compile_definitions(${ProjectId} PRIVATE POWER_MEASUREMENT)
 endif()
 
+# L1-memory knob — size the cluster-controller (CC / master) stack from the build.
+# Example (CI / command line): cmake ... -DCC_STACK_SIZE=8192  (see deeploytest.c).
+if(CC_STACK_SIZE)
+  target_compile_definitions(${ProjectId} PRIVATE CC_STACK_SIZE=${CC_STACK_SIZE})
+endif()
+
 # RW: Waive sign comparison warnings from pulp_nn_utils.h
 target_compile_options(network PRIVATE
     -Wno-sign-compare
diff --git a/DeeployTest/Platforms/GAP9/sdk_gvsoc.config b/DeeployTest/Platforms/GAP9/sdk_gvsoc.config
index 2f5bdf053c..099649684e 100644
--- a/DeeployTest/Platforms/GAP9/sdk_gvsoc.config
+++ b/DeeployTest/Platforms/GAP9/sdk_gvsoc.config
@@ -33,4 +33,8 @@ CONFIG_PLATFORM_GVSOC=y
 # GAP9 cluster stack size configuration
 # Uncomment and adjust these values if you need to modify stack sizes:
 # CONFIG_CL_MASTER_CORE_STACK_SIZE=14000
+# L1-memory knob B — shrink the SDK's per-PE (slave) cluster stacks to free L1
+# TCDM for the tile arena. Uncomment and lower if kernel stack use is small.
+# (deeploytest.c instead hands its own L2 slave-stack buffer via SET_SLAVE_STACK,
+#  which bypasses this kconfig; use one knob or the other.)
 # CONFIG_CL_SLAVE_CORE_STACK_SIZE=1000
diff --git a/DeeployTest/Platforms/GAP9/src/deeploytest.c b/DeeployTest/Platforms/GAP9/src/deeploytest.c
index b527baaf9f..c6d4564d3d 100644
--- a/DeeployTest/Platforms/GAP9/src/deeploytest.c
+++ b/DeeployTest/Platforms/GAP9/src/deeploytest.c
@@ -16,6 +16,24 @@
 // RW: Remove MAINSTACKSIZE because gap9-sdk does not use it
 #define SLAVESTACKSIZE 3800
 
+/* L1-memory knob A — place the cluster slave (PE) stacks in L2 instead of L1.
+ * The GAP9 SDK pi_cl_l1_malloc's the slave stacks only when
+ * task->stacks == NULL (__pi_cluster_task_set_stack in the SDK cluster
+ * driver); handing it our own buffer makes it skip that L1 allocation,
+ * freeing ~30 KB of L1 (8 cores x SLAVESTACKSIZE) for the Deeploy tile
+ * arena. The static array lands in .bss (L2). Use SET_SLAVE_STACK(task)
+ * instead of setting slave_stack_size directly.
+ * Knob B (alternative): shrink the SDK's own L1 slave stacks via
+ * CONFIG_CL_SLAVE_CORE_STACK_SIZE in the sdk .config (see sdk_gvsoc.config). */
+#define CLUSTER_MAX_CORES 9
+static uint8_t cluster_slave_stacks[SLAVESTACKSIZE * CLUSTER_MAX_CORES]
+    __attribute__((aligned(16)));
+#define SET_SLAVE_STACK(task)                                                  \
+  do {                                                                         \
+    (task).slave_stack_size = SLAVESTACKSIZE;                                  \
+    (task).stacks = cluster_slave_stacks;                                      \
+  } while (0)
+
 /* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable,
  * so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real
  * silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real
@@ -110,6 +128,17 @@ int main(void) {
 
   pi_cluster_conf_init(&conf);
   conf.id = 0;
+  /* L1-memory knob C — cluster-controller (CC / master) stack size. The CC
+   * stack is carved from the bottom of L1 (grows down toward the L1 base). The
+   * SDK default PI_CL_CC_STACK_SIZE (0x800 = 2 KB) can be too small for deep
+   * tiling call chains and overflows below the L1 base (silent clobber /
+   * invalid write). pi_cluster_task takes the size from conf.cc_stack_size (NOT
+   * the AutoTiler-only CONFIG_CL_MASTER_CORE_STACK_SIZE kconfig). Override per
+   * build with -DCC_STACK_SIZE=<bytes> (see CMakeLists). */
+#ifndef CC_STACK_SIZE
+#define CC_STACK_SIZE 8192
+#endif
+  conf.cc_stack_size = CC_STACK_SIZE;
   pi_open_from_conf(&cluster_dev, &conf);
   if (pi_cluster_open(&cluster_dev))
     return -1;
@@ -124,7 +153,7 @@ int main(void) {
   struct pi_cluster_task cluster_task;
 
   pi_cluster_task(&cluster_task, InitNetworkWrapper, NULL);
-  cluster_task.slave_stack_size = SLAVESTACKSIZE;
+  SET_SLAVE_STACK(cluster_task);
   pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
 
 #ifndef CI
@@ -146,7 +175,7 @@ int main(void) {
 #endif
 
   pi_cluster_task(&cluster_task, RunNetworkWrapper, NULL);
-  cluster_task.slave_stack_size = SLAVESTACKSIZE;
+  SET_SLAVE_STACK(cluster_task);
 
 #ifdef POWER_MEASUREMENT
   WRITE_GPIO(1);
@@ -193,7 +222,7 @@ int main(void) {
       float_compare_args.err_count = (int *)&float_error_count;
 
       pi_cluster_task(&cluster_task, CL_CompareFloat, &float_compare_args);
-      cluster_task.slave_stack_size = SLAVESTACKSIZE;
+      SET_SLAVE_STACK(cluster_task);
       pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
 
       tot_err += float_error_count;
diff --git a/TargetLibraries/GAP9/CMakeLists.txt b/TargetLibraries/GAP9/CMakeLists.txt
index ca4c3ffbeb..f66514050a 100644
--- a/TargetLibraries/GAP9/CMakeLists.txt
+++ b/TargetLibraries/GAP9/CMakeLists.txt
@@ -31,6 +31,15 @@ target_compile_options(deeploygap9 PRIVATE
 
 target_link_libraries(deeploygap9 PUBLIC pmsis)
 
+# Compile the hot forward kernels at -O3 (set last so it wins over the SDK's
+# default -Os). Conv / depthwise-conv / Gemm dominate GAP9 inference cycles.
+set(_KERNEL_O3_FILES
+  ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Convolution_fp32.c
+  ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/DWConvolution_fp32.c
+  ${CMAKE_CURRENT_LIST_DIR}/../PULPOpen/src/Gemm.c
+)
+set_source_files_properties(${_KERNEL_O3_FILES} PROPERTIES COMPILE_OPTIONS "-O3")
+
 #RW: Link PULP-NN
 #RW: Set PULP-NN version and bitwidth for pulp-nn-mixed
 set(PULPNNVERSION XPULPV2)

From 4b4dd5d191c174d8c09e128c8fc56794b07f38e1 Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Fri, 19 Jun 2026 00:18:07 +0000
Subject: [PATCH 4/7] fix(GAP9): emit cluster fork/closure argument structs as
 static (off-stack)

The tiling argument structs were stack-locals in the dispatching function. The
cluster fork runtime writes its descriptor near the top of the CC/master stack;
a stack-local arg struct placed there can be clobbered before the forked cores
read it (a GAP9 cluster-fork crash, e.g. MobileNetV1). Declare the struct
`static` and assign separately so it lives in static storage, stable across the
forked call.

Generic codegen (ArgumentStructGeneration); benign on other targets. Verified
MatMul --defaultMemLevel L3 on GVSoC: 0/256.
---
 .../CodeTransformationPasses/MemoryAllocation.py           | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
index 609a179c7b..dfe5df9ca5 100644
--- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
+++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
@@ -31,7 +31,12 @@ def __init__(self, templateStr: str, bufferName: str):
 
 _stackAllocateTemplate = partial(
     _ArgStructAllocateTemplate,
-    templateStr = "${structDict.typeName} ${name} = (${structDict.typeName}) ${str(structDict)};")
+    # Declare the argument struct `static` (off-stack), then assign. The cluster
+    # fork runtime writes its descriptor near the top of the CC/master stack; a
+    # stack-local arg struct placed there can be clobbered before the forked
+    # cores read it (observed as a GAP9 cluster-fork crash). Static storage keeps
+    # it stable for the lifetime of the forked call.
+    templateStr = "static ${structDict.typeName} ${name}; ${name} = (${structDict.typeName}) ${str(structDict)};")
 
 
 class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):

From 9a074a37bce8022c4093cb3b13e79d70d8ebe5e7 Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Fri, 19 Jun 2026 00:20:31 +0000
Subject: [PATCH 5/7] fix(GAP9): per-tensor waiting strategy for the cluster
 (mchan) DMA

GAP9 mchan allocates a fresh channel on every descriptor enqueue. The previous
DirectionWaitingStrategy shares one future (one mchan_transfer_get_id) across all
same-direction tensors of a tile, so a tile with >1 input emits one get_id but
multiple pushes -> the extra transfers run on channels that are never waited or
freed -> mchan_transfer_wait() hangs (e.g. the optimizer weight+grad stall).
Switch to PerTensorWaitingStrategy so each tensor gets its own
get_id : push : wait : free, matching the mchan contract.

Verified MatMul --defaultMemLevel L3 on GVSoC: 0/256.
---
 Deeploy/Targets/GAP9/DMA/MchanDma.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Deeploy/Targets/GAP9/DMA/MchanDma.py b/Deeploy/Targets/GAP9/DMA/MchanDma.py
index 14e7eb0930..5525d39e3c 100644
--- a/Deeploy/Targets/GAP9/DMA/MchanDma.py
+++ b/Deeploy/Targets/GAP9/DMA/MchanDma.py
@@ -6,7 +6,7 @@
 from typing import Dict, Tuple
 
 from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
-from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future
+from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
 
 
 class MchanTransferFuture(Future):
@@ -36,7 +36,16 @@ class GAP9MchanDma(AsyncDma):
                 "{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext}, .ext_size_1d = ${size_1d}, .ext_stride_1d = ${stride_2d} }; mchan_transfer_push_2d(__mchan_tmp); }"
             ),
     }
-    _waitingStrategy = DirectionWaitingStrategy(MchanTransferFuture, "transfer")
+    # PerTensor, NOT Direction: GAP9 mchan allocates a fresh channel on every
+    # descriptor enqueue (each mchan_transfer_push_* writes a new descriptor and
+    # the hardware advances to the next channel). DirectionWaitingStrategy shares
+    # ONE future (one mchan_transfer_get_id) across all same-direction tensors of
+    # a tile, so a tile with >1 input (e.g. weight + grad) emits one get_id but
+    # multiple pushes -> the 2nd+ transfers run on channels that are never waited
+    # nor freed -> mchan_transfer_wait() hangs forever. PerTensor gives each tensor
+    # its own get_id immediately before its push, matching mchan's
+    # 1 get_id : 1 push : 1 wait : 1 free contract.
+    _waitingStrategy = PerTensorWaitingStrategy(MchanTransferFuture)
 
     def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None:
         super().__init__(transferTemplates)

From c29c1f74355c9aeba4bc750681e0fa4e3298ab0c Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Fri, 19 Jun 2026 00:23:35 +0000
Subject: [PATCH 6/7] docs(GAP9): add backend-fixes & memory-tuning notes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Explain each GAP9 backend change in this branch — L3-aware board harness, SB/DB
L3 DMA split, -O3 forward kernels, the three L1-memory knobs (cc_stack /
slave-stack size / slave-stack->L2), static cluster fork/closure args, and the
per-tensor mchan DMA waiting strategy — with problem, fix, file, and takeaway,
plus a short GAP9 memory-model primer.
---
 docs/gap9_backend_fixes.md | 150 +++++++++++++++++++++++++++++++++++++
 1 file changed, 150 insertions(+)
 create mode 100644 docs/gap9_backend_fixes.md

diff --git a/docs/gap9_backend_fixes.md b/docs/gap9_backend_fixes.md
new file mode 100644
index 0000000000..08f635fd58
--- /dev/null
+++ b/docs/gap9_backend_fixes.md
@@ -0,0 +1,150 @@
+# GAP9 backend fixes & memory-tuning knobs
+
+Short notes on a set of GAP9 backend changes: what each one fixes, why, and where
+it lives. They share a theme — **the GAP9 cluster runs on a small, manually-managed
+L1 (TCDM), and HyperRAM/L3 is not CPU-addressable** — so most bugs here are about
+*where memory lives* and *how DMA waits complete*. GVSoC models memory more
+forgivingly than real silicon (flat HyperRAM, generous timing), so several of these
+pass in simulation and only fault on the board.
+
+## GAP9 memory model (background)
+
+| Level | Address window | Who can touch it directly |
+|---|---|---|
+| L1 / TCDM (128 KB) | `0x1000_0000–0x1004_0000` | cluster cores (PEs) + CC |
+| L2 (1.5 MB) | `0x1C00_0000–0x1C20_0000` | FC + cluster (shared) |
+| L3 / HyperRAM | `cl_ram_malloc` handles | **DMA only** — not CPU-addressable |
+
+The cluster has 8 worker PEs + a cluster-controller (CC / master) core. The CC
+stack is carved from the **bottom of L1**; PE slave stacks are also L1 by default.
+Everything competes with the Deeploy tile arena for those 128 KB.
+
+---
+
+## 1. L3-aware input/output in the board test harness
+
+**File:** `DeeployTest/Platforms/GAP9/src/deeploytest.c`
+
+**Problem.** The harness classified buffers by raw address thresholds
+(`ptr >= 0x10000000` for inputs, `< 0x10000000` for outputs). HyperRAM/L3 buffers
+(`cl_ram_malloc`) are *also* `>= 0x10000000`, but HyperRAM is **not CPU-addressable**.
+So for `--defaultMemLevel L3` tests, `main()` did a raw `memcpy` into an L3 input
+pointer and CPU-dereferenced L3 output pointers — an `Invalid fetch` fault on the
+board, right after init, before any results print. GVSoC models HyperRAM as flat
+RAM, so it passed there and masked the bug.
+
+**Fix.** Add `IS_L1` / `IS_L2` on-chip-window macros and use them: on-chip inputs
+are `memcpy`'d, **L3 inputs are already loaded from the readfs hex inside
+`InitNetwork`** (their `testInputVector` entry is `NULL`) so they're skipped, and
+**L3 outputs are `ram_read` into an L2 scratch** before the compare.
+
+**Takeaway.** Never CPU-`memcpy`/deref an L3 pointer on GAP9 — gate on the real
+on-chip windows, not a single `>= 0x10000000` threshold.
+
+---
+
+## 2. Split the L3 tiling DMA — blocking for single-buffer, async for double-buffer
+
+**Files:** `Deeploy/Targets/PULPOpen/CodeTransformationPasses/PULPL3Tiling.py`,
+`Deeploy/Targets/GAP9/Bindings.py`, `Deeploy/Targets/GAP9/DMA/L3Dma.py`
+
+**Problem.** L3↔L2 tiling used one DMA backend for both single- (SB) and
+double-buffering (DB). Async DMA only helps DB, where it overlaps the *next* tile's
+prefetch with compute. SB waits on each tile before computing, so async there buys
+nothing but adds risk: strided 2D L3 transfers (`pi_cl_ram_copy_2d`) can corrupt
+under deferred waits.
+
+**Fix.** `PULPL3Tiling` gains an optional `dbDma` (defaults to `dma`, so it is
+backward compatible). GAP9 binds **SB → blocking** `gap9L3DmaHack`, **DB → async**
+`GAP9L3Dma`. Also: reset the L3 future's `.size` to 0 after `pi_cl_ram_copy_wait`
+(so a completed future is never waited on twice) and cast `${ext}` to `uint32_t`
+in the 2D transfer.
+
+**Takeaway.** Async DMA is a double-buffering optimization; don't pay its hazards
+on the single-buffering path.
+
+---
+
+## 3. `-O3` on the hot forward kernels
+
+**File:** `TargetLibraries/GAP9/CMakeLists.txt`
+
+**Problem.** The SDK compiles kernels at `-Os` by default; the conv / depthwise-conv
+/ Gemm kernels dominate GAP9 inference cycles and are left slow.
+
+**Fix.** Compile `Convolution_fp32.c`, `DWConvolution_fp32.c`, `Gemm.c` at `-O3`
+via `set_source_files_properties(... COMPILE_OPTIONS "-O3")`, **appended last** so
+it wins over the SDK's `-Os` on the same translation units.
+
+**Takeaway.** Per-file `-O3` on the few hot kernels is a large, cheap latency win;
+ordering matters because the last `COMPILE_OPTIONS` wins.
+
+---
+
+## 4. L1-memory tuning knobs (documented in the example)
+
+**Files:** `DeeployTest/Platforms/GAP9/src/deeploytest.c`,
+`DeeployTest/Platforms/GAP9/CMakeLists.txt`,
+`DeeployTest/Platforms/GAP9/sdk_gvsoc.config`
+
+Three independent ways to free L1 TCDM for the tile arena so conv-heavy nets fit.
+The example demonstrates all three with comments.
+
+- **Knob A — slave (PE) stacks → L2.** The SDK `pi_cl_l1_malloc`'s the PE stacks
+  only when `task->stacks == NULL`. Hand it a static buffer (`SET_SLAVE_STACK`,
+  a `.bss` array → L2) and it skips that L1 allocation, freeing ~30 KB of L1
+  (8 cores × `SLAVESTACKSIZE`).
+- **Knob B — shrink the SDK's L1 slave stacks.** `CONFIG_CL_SLAVE_CORE_STACK_SIZE`
+  in the sdk `.config` (alternative to Knob A; use one or the other).
+- **Knob C — cluster-controller stack size.** The CC stack grows down toward the
+  L1 base; the SDK default (`0x800` = 2 KB) is too small for deep tiling call
+  chains and overflows below the base (silent clobber / invalid write). Set
+  `conf.cc_stack_size`, overridable from the build with **`-DCC_STACK_SIZE=<bytes>`**
+  (new CMake option). Example (CI): `cmake … -DCC_STACK_SIZE=8192`.
+
+**Takeaway.** The CC stack and PE stacks are *invisible to the tiler and the ELF*
+(carved at runtime), yet they share L1 with the arena — budget for them explicitly.
+
+---
+
+## 5. Emit cluster fork/closure argument structs as `static`
+
+**File:** `Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py`
+(`ArgumentStructGeneration`)
+
+**Problem.** The per-node tiling argument structs were stack-locals in the
+dispatching function. The cluster-fork runtime writes its descriptor near the top
+of the CC/master stack and can clobber a stack-local arg struct **before the forked
+cores read it** — a GAP9 cluster-fork crash (seen on MobileNetV1).
+
+**Fix.** Declare the struct `static` (off-stack) and assign separately:
+`static T name; name = (T){…};`. Static storage keeps it stable for the lifetime
+of the forked call. Generic codegen, benign on other targets.
+
+**Takeaway.** Anything handed to a cluster fork must outlive the CC stack frame —
+keep it off the stack.
+
+---
+
+## 6. Per-tensor waiting strategy for the cluster (mchan) DMA
+
+**File:** `Deeploy/Targets/GAP9/DMA/MchanDma.py`
+
+**Problem.** GAP9 mchan allocates a fresh channel on every descriptor enqueue.
+`DirectionWaitingStrategy` shares **one** future (one `mchan_transfer_get_id`)
+across all same-direction tensors of a tile. A tile with >1 input (e.g. optimizer
+weight + grad) then emits one `get_id` but multiple pushes → the extra transfers
+run on channels that are never waited or freed → `mchan_transfer_wait()` hangs
+forever.
+
+**Fix.** Use `PerTensorWaitingStrategy`: each tensor gets its own
+`get_id : push : wait : free`, matching the mchan hardware contract.
+
+**Takeaway.** Match the DMA waiting strategy to the hardware's channel model —
+mchan is one-channel-per-transfer, so wait per tensor, not per direction.
+
+---
+
+*All changes verified on GVSoC with `MatMul --defaultMemLevel L3` (`Errors: 0 out
+of 256`). On-chip (L1/L2) behaviour is unchanged; only the L3 / stack / DMA paths
+differ.*

From a5f0bcfdc1fd021102b31078b751abc957cf7a7e Mon Sep 17 00:00:00 2001
From: samanthawangdl <samanthawangdl@gmail.com>
Date: Fri, 19 Jun 2026 01:15:19 +0000
Subject: [PATCH 7/7] ci(GAP9): build-time L1/L2 memory gate

Add gap9_memcheck.py and run it from run_complete_test after the build, before
the simulation, on GAP9. It models every consumer of L1/L2 the tiler doesn't
(CC master stack, PE slave stacks, ELF sections, tile arena, promoted pool) and
scans InitNetwork for the pi_l2_malloc-after-cl_ram_malloc alloc-order race, so
over-subscription fails fast with the exact knob instead of a multi-minute GVSoC
hang. GAP9-only; bypass with DEPLOY_SKIP_MEMCHECK=1. Verified MatMul L3: gate
runs (PASS) and test is 0/256.
---
 DeeployTest/testUtils/core/execution.py |  25 +++
 DeeployTest/testUtils/gap9_memcheck.py  | 248 ++++++++++++++++++++++++
 docs/gap9_backend_fixes.md              |  22 +++
 3 files changed, 295 insertions(+)
 create mode 100644 DeeployTest/testUtils/gap9_memcheck.py

diff --git a/DeeployTest/testUtils/core/execution.py b/DeeployTest/testUtils/core/execution.py
index 4c6c972679..294691895d 100644
--- a/DeeployTest/testUtils/core/execution.py
+++ b/DeeployTest/testUtils/core/execution.py
@@ -227,6 +227,28 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult:
     return test_result
 
 
+def _gap9_memcheck_gate(config: DeeployTestConfig) -> None:
+    """GAP9 build-time memory gate (no-op off GAP9, without the script, or if DEPLOY_SKIP_MEMCHECK=1).
+
+    Runs gap9_memcheck.py (L1/L2 budget + InitNetwork alloc-order validator) after the build
+    and before the simulation so over-subscription fails fast instead of hanging GVSoC.
+    Raises RuntimeError only when the checker exits with 1; other exit codes are ignored.
+    """
+    import sys  # local import: launch the checker with the interpreter that runs the tests
+
+    if config.platform != "GAP9" or os.environ.get("DEPLOY_SKIP_MEMCHECK") == "1":
+        return
+    script = Path(__file__).parent.parent / "gap9_memcheck.py"
+    if not script.exists():
+        return
+    cmd = [sys.executable, str(script), str(config.build_dir), str(config.gen_dir)]
+    log.debug(f"[Execution] GAP9 memcheck: {' '.join(cmd)}")
+    if subprocess.run(cmd, check = False).returncode == 1:
+        raise RuntimeError(f"GAP9 memory check failed for {config.test_name} (L1/L2 over-subscription "
+                           f"or InitNetwork alloc-order race - see output above). "
+                           f"Set DEPLOY_SKIP_MEMCHECK=1 to bypass.")
+
+
 def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: bool = False) -> TestResult:
     """
     Run a complete test: generate, configure, build, and simulate.
@@ -242,6 +264,9 @@ def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim:
     # Step 3: Build binary
     build_binary(config)
 
+    # Step 3b: GAP9 build-time memory gate (fast-fail before simulation)
+    _gap9_memcheck_gate(config)
+
     # Step 4: Run simulation
     result = run_simulation(config, skip = skipsim)
 
diff --git a/DeeployTest/testUtils/gap9_memcheck.py b/DeeployTest/testUtils/gap9_memcheck.py
new file mode 100644
index 0000000000..e9922033d8
--- /dev/null
+++ b/DeeployTest/testUtils/gap9_memcheck.py
@@ -0,0 +1,248 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+"""GAP9 L1/L2 memory-budget validator (CI gate).
+
+Turns silent over-subscription — the class of bug that manifests as a 10-minute
+GVSoC hang in os_evt_release or a wild-pointer "Invalid access" far from the
+cause — into an instant, precise, build-time error.
+
+It models EVERY consumer of each physical level, including the ones the Deeploy
+tiler does NOT model:
+  * L1 tile arena            : pi_l1_malloc(...) in generated C
+  * CC master stack          : conf.cc_stack_size, carved from L1 top at runtime
+                               (NOT an ELF section, NOT a malloc -> invisible to
+                               --plotMemAlloc; this is what overflowed)
+  * static L1 sections       : from the ELF (addr 0x1000xxxx)
+  * L2 code/data/bss         : from the ELF (addr 0x1cxxxxxx) — incl PE slave
+                               stacks (cluster_slave_stacks, .bss in L2)
+  * L2 tile arena + promoted : pi_l2_malloc(...) + PromoteTensorsToL2 log
+
+Exit code 0 = fits; 1 = over-subscribed or InitNetwork alloc-order race (with the knob to turn); 2 = usage error or no ELF found.
+
+Usage:
+  gap9_memcheck.py <build_dir> <gen_dir> [build_or_promote_log] [--cc-stack N]
+"""
+import glob
+import os
+import re
+import subprocess
+import sys
+
+# ---- physical capacities (GAP9) ----
+L1_SIZE = 128 * 1024  # 131072  TCDM
+L2_SIZE = 0x190000  # 1572864 (1.5 MB), link.gap9.ld
+L1_MIN_HEADROOM = 1024  # warn if free L1 < this (stacks have runtime slop)
+READELF = "/app/install/gcc/gap9/bin/riscv32-unknown-elf-readelf"
+
+
+def die(msg, code = 2):  # print a prefixed error to stderr and exit with `code` (default 2)
+    print(f"[gap9-memcheck] ERROR: {msg}", file = sys.stderr)
+    sys.exit(code)
+
+
+def parse_args(argv):
+    """Return (build_dir, gen_dir, log_or_None, cc_stack_or_None); exit 2 on malformed arguments."""
+    pos, cc, bad, args = [], None, False, iter(argv)
+    for arg in args:
+        if arg == "--cc-stack":
+            val = next(args, "")  # "" when --cc-stack is the last argument
+            bad = bad or not val.isdecimal()  # usage error (exit 2), not an IndexError/ValueError traceback
+            cc = int(val) if val.isdecimal() else cc
+        else:
+            pos.append(arg)
+    if bad or len(pos) < 2:
+        die("usage: gap9_memcheck.py <build_dir> <gen_dir> [log] [--cc-stack N]")
+    return pos[0], pos[1], (pos[2] if len(pos) > 2 else None), cc
+
+
+def find_elf(build):  # newest executable ELF directly inside `build` (non-recursive glob), or None
+    cands = []
+    for f in glob.glob(os.path.join(build, "*")):
+        if os.path.isfile(f) and os.access(f, os.X_OK) and not f.endswith((".bin", ".s", ".hex")):
+            try:
+                if open(f, "rb").read(4) == b"\x7fELF":  # ELF magic number
+                    cands.append(f)
+            except Exception:  # unreadable candidates are skipped
+                pass
+    return max(cands, key = os.path.getmtime) if cands else None  # most recently modified wins
+
+
+def elf_sections(elf):
+    """{"L1"|"L2": {section: bytes}} for non-empty PROGBITS/NOBITS sections, bucketed by address prefix."""
+    out = subprocess.run([READELF, "-SW", elf], capture_output = True, text = True).stdout
+    regions = {"L1": {}, "L2": {}}  # NOTE(review): readelf's exit status is not checked here
+    for line in out.splitlines():  # regex groups: name, type, 8-digit hex address, hex size (offset skipped)
+        m = re.search(r"\]\s+(\S+)\s+(PROGBITS|NOBITS)\s+([0-9a-f]{8})\s+[0-9a-f]+\s+([0-9a-f]+)", line)
+        if not m:
+            continue
+        name, _, addr, szhex = m.groups()
+        sz = int(szhex, 16)
+        if sz == 0:
+            continue
+        if addr.startswith("1000") or addr.startswith("1001"):  # 0x1000xxxx / 0x1001xxxx -> L1
+            regions["L1"][name] = regions["L1"].get(name, 0) + sz
+        elif addr.startswith("1c"):  # 0x1cxxxxxx -> L2; sections at any other address are ignored
+            regions["L2"][name] = regions["L2"].get(name, 0) + sz
+    return regions
+
+
+def arena_mallocs(gen):  # scan generated C in `gen` for malloc sizes: L1 = largest call, L2/L3 = sum of calls
+    res = {"L1": 0, "L2": 0, "L3": 0}
+    for fn in ("TrainingNetwork.c", "Network.c", "OptimizerNetwork.c"):
+        p = os.path.join(gen, fn)
+        if not os.path.exists(p):
+            continue  # missing generated files are skipped
+        t = open(p, errors = "ignore").read()
+        for sz in re.findall(r"(?:pi_)?l1_malloc\([^;]*?\*\s*(\d+)\)", t):
+            res["L1"] = max(res["L1"], int(sz))  # max, not sum, for L1
+        for sz in re.findall(r"pi_l2_malloc\(sizeof\([^)]*\)\s*\*\s*(\d+)\)", t):
+            res["L2"] += int(sz)  # NOTE(review): captures the multiplier after sizeof(..) *; assumes 1-byte elements - confirm
+        for sz in re.findall(r"cl_ram_malloc\((?:sizeof\([^)]*\)\s*\*\s*)?(\d+)\)", t):
+            res["L3"] += int(sz)
+    return res
+
+
+def check_init_alloc_order(gen):
+    """Codegen-time race check (pure source scan — no build, no sim).
+
+    In any Init*Network function, flag a pi_l2_malloc that appears AFTER a
+    cl_ram_malloc. On GAP9 Init runs on the cluster CC: cl_ram_malloc delegates
+    to the FC (pi_cl_ram_alloc) while pi_l2_malloc runs pos_alloc directly on the
+    CC; both touch the shared L2 freelist, so this ordering races -> FC RTOS
+    corruption -> os_evt_release crash/hang at init. Fix = hoist all pi_l2_malloc
+    before the first cl_ram_malloc (codeGenerateTraining._hoistL2AllocsBeforeL3).
+    """
+    issues = []  # list of (file name, 1-based line number, first 70 chars of the stripped line)
+    for fn in ("TrainingNetwork.c", "Network.c", "OptimizerNetwork.c"):
+        p = os.path.join(gen, fn)
+        if not os.path.exists(p):
+            continue
+        lines = open(p, errors = "ignore").read().split("\n")
+        i = 0
+        while i < len(lines):
+            if re.search(r"void\s+Init\w*Network\s*\(", lines[i]):
+                depth = 0
+                started = False  # becomes True once a line containing "{" has been seen
+                seen_cl = False  # becomes True after the first cl_ram_malloc in this scan
+                for j in range(i, len(lines)):  # scan forward from the signature, tracking brace depth
+                    depth += lines[j].count("{") - lines[j].count("}")
+                    if "{" in lines[j]:
+                        started = True
+                    if "cl_ram_malloc(" in lines[j]:
+                        seen_cl = True
+                    elif seen_cl and "pi_l2_malloc(" in lines[j]:  # elif: a line with both calls is not flagged
+                        issues.append((fn, j + 1, lines[j].strip()[:70]))
+                    if started and depth <= 0:  # braces balanced again -> end of the scanned body
+                        i = j  # resume the outer scan after this function
+                        break
+            i += 1
+    return issues
+
+
+def cc_stack_from_cache(build, override):  # -> (cc stack size, source label); override > CMakeCache > 8192
+    if override is not None:
+        return override, "override"
+    cache = os.path.join(build, "CMakeCache.txt")
+    if os.path.exists(cache):
+        m = re.search(r"CC_STACK_SIZE\S*=(\d+)", open(cache, errors = "ignore").read())  # first match wins
+        if m:
+            return int(m.group(1)), "CMakeCache"
+    return 8192, "default(8192)"  # deeploytraintest.c fallback
+
+
+def promoted_bytes(log):  # largest <n> over all "promoted N tensors, <n> /" matches in `log`; 0 if none
+    if not log or not os.path.exists(log):
+        return 0  # no log given or file missing
+    best = 0
+    for m in re.finditer(r"promoted \d+ tensors, (\d+) /", open(log, errors = "ignore").read()):
+        best = max(best, int(m.group(1)))
+    return best
+
+
+def report(title, cap, items):  # print a usage table for (name, bytes) `items` against `cap`; return (used, free)
+    used = sum(v for _, v in items)
+    free = cap - used  # negative when the items exceed the capacity
+    print(f"\n=== {title}  (cap {cap} B = {cap/1024:.1f} KB) ===")
+    for n, v in items:
+        print(f"    {v:9d} B  {v/1024:7.1f} KB  {n}")
+    print(f"    {'-'*9}")
+    print(f"    {used:9d} B  {used/1024:7.1f} KB  TOTAL  ({100*used/cap:.1f}%)")
+    print(f"    {free:9d} B  {free/1024:7.1f} KB  FREE")
+    return used, free
+
+
+def main():  # CLI entry point: print the L1/L2 budgets and alloc-order findings; exit 0 on PASS, 1 on any violation
+    build, gen, log, cc_override = parse_args(sys.argv[1:])
+    elf = find_elf(build)
+    if not elf:
+        die(f"no ELF found in {build}")  # exits with die()'s default code 2
+    print(f"[gap9-memcheck] ELF: {elf}")
+    regs = elf_sections(elf)
+    ar = arena_mallocs(gen)
+    cc_stack, cc_src = cc_stack_from_cache(build, cc_override)
+    prom = promoted_bytes(log)  # informational only; never added to a total (see the L2 note below)
+
+    violations = []  # human-readable failure messages; non-empty -> exit 1
+
+    # ---- L1 / TCDM ----
+    l1_items = []
+    if ar["L1"]:
+        l1_items.append(("tile arena  (pi_l1_malloc)", ar["L1"]))
+    l1_items.append((f"CC master stack  (cc_stack_size, {cc_src})", cc_stack))
+    for n, v in sorted(regs["L1"].items(), key = lambda x: -x[1]):
+        l1_items.append((f"L1 section {n}", v))
+    used1, free1 = report("L1 / TCDM", L1_SIZE, l1_items)
+    if free1 < 0:
+        violations.append(f"L1 over-subscribed by {-free1} B. arena {ar['L1']} + cc_stack {cc_stack} "
+                          f"(+sections) > {L1_SIZE}. Fix: lower --l1 to <= {ar['L1']+free1} OR "
+                          f"cc_stack to <= {cc_stack+free1}.")
+    elif free1 < L1_MIN_HEADROOM:
+        print(f"\n[gap9-memcheck] WARNING: L1 free {free1} B < {L1_MIN_HEADROOM} B headroom "
+              f"(stack high-water is runtime-dependent; risky).")
+
+    # ---- L2 ----
+    # NOTE: do NOT add the promotion-log bytes to the total. The promoted pool
+    # is a pi_l2_malloc (PROMOTED_POOL_L2) -> already in ar["L2"]; promoted const
+    # bytes are baked into .data -> already in the ELF L2 sections. Adding `prom`
+    # again double-counts and produces false over-subscription FAILs.
+    l2_items = []
+    for n, v in sorted(regs["L2"].items(), key = lambda x: -x[1]):
+        l2_items.append((f"L2 section {n}", v))
+    if ar["L2"]:
+        l2_items.append(("tile arena + pools (pi_l2_malloc, incl PROMOTED_POOL)", ar["L2"]))
+    used2, free2 = report("L2", L2_SIZE, l2_items)
+    if prom:
+        print(f"    (info: PromoteTensorsToL2 log reports {prom} B promoted — already "
+              f"counted above in pi_l2_malloc / .data, not added again)")
+    if free2 < 0:
+        violations.append(f"L2 over-subscribed by {-free2} B (> {L2_SIZE}). Reduce promoted-pool "
+                          f"headroom or L2 arena.")
+
+    # ---- InitNetwork alloc-order race (codegen-time) ----
+    order_issues = check_init_alloc_order(gen)
+    if order_issues:
+        print("\n=== InitNetwork alloc-order ===")
+        for fn, ln, txt in order_issues[:6]:  # only the first six sites are printed
+            print(f"    RACE: {fn}:{ln}  pi_l2_malloc after cl_ram_malloc -> {txt}")
+        violations.append(f"InitNetwork alloc-order race ({len(order_issues)} site(s), first "
+                          f"{order_issues[0][0]}:{order_issues[0][1]}): pi_l2_malloc after cl_ram_malloc "
+                          f"-> GAP9 FC/CC pos_alloc freelist race -> os_evt_release crash/hang at init. "
+                          f"Hoist all pi_l2_malloc before the first cl_ram_malloc.")
+
+    if ar["L3"]:  # L3 is reported for information only; it never produces a violation
+        print(f"\n=== L3 / HyperRAM (no enforced cap) ===\n    {ar['L3']} B  "
+              f"{ar['L3']/1e6:.2f} MB  cl_ram arena")
+
+    print()
+    if violations:
+        for v in violations:
+            print(f"[gap9-memcheck] FAIL: {v}", file = sys.stderr)
+        sys.exit(1)
+    print("[gap9-memcheck] PASS: all levels fit within physical capacity.")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/gap9_backend_fixes.md b/docs/gap9_backend_fixes.md
index 08f635fd58..0452a05ea2 100644
--- a/docs/gap9_backend_fixes.md
+++ b/docs/gap9_backend_fixes.md
@@ -145,6 +145,28 @@ mchan is one-channel-per-transfer, so wait per tensor, not per direction.
 
 ---
 
+## 7. Build-time memory gate
+
+**Files:** `DeeployTest/testUtils/gap9_memcheck.py`,
+`DeeployTest/testUtils/core/execution.py`
+
+**Problem.** L1/L2 over-subscription on GAP9 surfaces as a multi-minute GVSoC hang
+(`os_evt_release`) or a wild-pointer crash far from the cause — slow and opaque.
+The tiler does not model the CC master stack, PE slave stacks, or the promoted
+pool, so it cannot detect this over-subscription itself.
+
+**Fix.** `gap9_memcheck.py` models every consumer of L1 and L2 (tile arena, CC
+stack from `cc_stack_size`, PE slave stacks, ELF sections, promoted pool) and
+scans `InitNetwork` for the `pi_l2_malloc`-after-`cl_ram_malloc` alloc-order race.
+`run_complete_test` runs it after the build and before the simulation (GAP9 only),
+so over-subscription fails in seconds with the exact knob to turn. Bypass with
+`DEPLOY_SKIP_MEMCHECK=1`.
+
+**Takeaway.** Validate the full L1/L2 budget at build time — the stacks and pools
+the tiler ignores are exactly what overflow.
+
+---
+
 *All changes verified on GVSoC with `MatMul --defaultMemLevel L3` (`Errors: 0 out
 of 256`). On-chip (L1/L2) behaviour is unchanged; only the L3 / stack / DMA paths
 differ.*