pulp-platform · runwangdl · Jun 18, 2026 · Jun 18, 2026 · Jun 19, 2026 · Jun 19, 2026
@@ -31,7 +31,12 @@ def __init__(self, templateStr: str, bufferName: str):
 
 _stackAllocateTemplate = partial(
     _ArgStructAllocateTemplate,
-    templateStr = "${structDict.typeName} ${name} = (${structDict.typeName}) ${str(structDict)};")
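+    # Declare the argument struct `static` (off-stack), then assign it as a
+    # separate statement each time the generated code runs (a static initializer
+    # would have to be a compile-time constant). The cluster fork runtime writes
+    # its descriptor near the top of the CC/master stack; a stack-local arg
+    # struct placed there can be clobbered before the forked cores read it
+    # (observed as a GAP9 cluster-fork crash). Static storage keeps it stable for
+    # the lifetime of the forked call, at the cost of reentrancy: every execution
+    # of the generated code shares this one struct instance.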
+    templateStr = "static ${structDict.typeName} ${name}; ${name} = (${structDict.typeName}) ${str(structDict)};")
 
 
 class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):

@@ -18,7 +18,7 @@
 from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
 from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
-from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
+from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma, gap9L3DmaHack
 from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
 # Import templates from PULPOpen and Generic
 from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
@@ -57,7 +57,9 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
+    # async GAP9L3Dma for real L3<->L2 prefetch overlap.
+    PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),
@@ -76,7 +78,9 @@
     MemoryManagementGeneration("L1"),
     TilingVariableReplacement("L2"),
     MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
-    PULPL3Tiling("L3", "L2", gap9L3DmaHack),  # Use GAP9-specific L3 DMA
+    # SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
+    # async GAP9L3Dma for real L3<->L2 prefetch overlap.
+    PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
     PULPProfileUntiled(),
     ArgumentStructGeneration(),
     L3MemoryAwareFunctionCallClosure(writeback = False),

@@ -21,6 +21,7 @@ class GAP9L3DmaFuture(Future):
     _waitTemplate = NodeTemplate("""
     if (${name}.size != 0) {
         pi_cl_ram_copy_wait(&${name});
+        ${name}.size = 0;
     }""")
 
 
@@ -29,7 +30,7 @@ class GAP9L3Dma(AsyncDma):
     _transferTemplates = {
         2:
             NodeTemplate(
-                "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
+                "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
             )
     }
     _waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)

@@ -6,7 +6,7 @@
 from typing import Dict, Tuple
 
 from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
-from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future
+from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy
 
 
 class MchanTransferFuture(Future):
@@ -36,7 +36,16 @@ class GAP9MchanDma(AsyncDma):
                 "{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext}, .ext_size_1d = ${size_1d}, .ext_stride_1d = ${stride_2d} }; mchan_transfer_push_2d(__mchan_tmp); }"
             ),
     }
-    _waitingStrategy = DirectionWaitingStrategy(MchanTransferFuture, "transfer")
+    # PerTensor, NOT Direction: GAP9 mchan allocates a fresh channel on every
+    # descriptor enqueue (each mchan_transfer_push_* writes a new descriptor and
+    # the hardware advances to the next channel). DirectionWaitingStrategy shares
+    # ONE future (one mchan_transfer_get_id) across all same-direction tensors of
+    # a tile, so a tile with >1 input (e.g. weight + grad) emits one get_id but
+    # multiple pushes -> the 2nd and later transfers run on channels that are
+    # neither waited on nor freed -> mchan_transfer_wait() hangs forever.
+    # PerTensor gives each tensor its own get_id immediately before its push,
+    # matching mchan's 1 get_id : 1 push : 1 wait : 1 free contract.
+    _waitingStrategy = PerTensorWaitingStrategy(MchanTransferFuture)
 
     def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None:
         super().__init__(transferTemplates)

@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import Tuple
+from typing import Optional, Tuple
 
 from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
 from Deeploy.TilingExtension.AsyncDma import AsyncDma
@@ -30,11 +30,19 @@ class ProfilingPULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, Pro
 
 class PULPL3Tiling(CodeTransformationPass):
 
-    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
+    def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, dbDma: Optional[AsyncDma] = None):
+        # SB and DB can use different DMA backends. Async DMA only ever helps DB
+        # (it overlaps the next-tile prefetch with compute); SB waits on each tile
+        # before computing, so async gives SB no benefit but all the risk (strided
+        # 2D L3 transfers can corrupt under deferred waits). Defaulting dbDma to dma
+        # keeps backward compatibility; pass an async dma as dbDma for real L3<->L2
+        # overlap on the DB path while keeping SB on the safe blocking dma.
+        if dbDma is None:
+            dbDma = dma
         self.SB = PULPL3TilingGenerationSB(externalMemory, localMemory, dma)
-        self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dma)
+        self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)
         self.profilingSB = ProfilingPULPL3TilingGenerationSB(externalMemory, localMemory, dma)
-        self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dma)
+        self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)
 
     def apply(self,
               ctxt: NetworkContext,

@@ -26,6 +26,12 @@ if(POWER_MEASUREMENT)
   target_compile_definitions(${ProjectId} PRIVATE POWER_MEASUREMENT)
 endif()
 
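+# L1-memory knob C — size the cluster-controller (CC / master) stack from the build.
+# Example (CI / command line): cmake ... -DCC_STACK_SIZE=8192  (see deeploytest.c).
+# When CC_STACK_SIZE is unset (or 0) no definition is passed and deeploytest.c
+# falls back to its own built-in default.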
+if(CC_STACK_SIZE)
+  target_compile_definitions(${ProjectId} PRIVATE CC_STACK_SIZE=${CC_STACK_SIZE})
+endif()
+
 # RW: Waive sign comparison warnings from pulp_nn_utils.h
 target_compile_options(network PRIVATE
     -Wno-sign-compare

@@ -33,4 +33,8 @@ CONFIG_PLATFORM_GVSOC=y
 # GAP9 cluster stack size configuration
 # Uncomment and adjust these values if you need to modify stack sizes:
 # CONFIG_CL_MASTER_CORE_STACK_SIZE=14000
+# L1-memory knob B — shrink the SDK's per-PE (slave) cluster stacks to free L1
+# TCDM for the tile arena. Uncomment and lower if kernel stack use is small.
+# (deeploytest.c instead hands its own L2 slave-stack buffer via SET_SLAVE_STACK,
+#  which bypasses this kconfig; use one knob or the other.)
 # CONFIG_CL_SLAVE_CORE_STACK_SIZE=1000
@@ -16,6 +16,36 @@
 // RW: Remove MAINSTACKSIZE because gap9-sdk does not use it
 #define SLAVESTACKSIZE 3800
 
+/* L1-memory knob A — place the cluster slave (PE) stacks in L2 instead of L1.
+ * The GAP9 SDK pi_cl_l1_malloc's the slave stacks only when task->stacks ==
+ * NULL (__pi_cluster_task_set_stack in the SDK cluster driver); handing it our
+ * own buffer makes it skip that L1 allocation, freeing ~30 KB of L1 (8 cores x
+ * SLAVESTACKSIZE) for the Deeploy tile arena. The buffer below reserves
+ * CLUSTER_MAX_CORES (9) slots of SLAVESTACKSIZE bytes each. The static array
+ * lands in .bss (L2). Use SET_SLAVE_STACK(task) instead of setting
+ * slave_stack_size directly.
+ * Knob B (alternative): shrink the SDK's own L1 slave stacks via
+ * CONFIG_CL_SLAVE_CORE_STACK_SIZE in the sdk .config (see sdk_gvsoc.config). */
+#define CLUSTER_MAX_CORES 9
+static uint8_t cluster_slave_stacks[SLAVESTACKSIZE * CLUSTER_MAX_CORES]
+    __attribute__((aligned(16)));
+#define SET_SLAVE_STACK(task)                                                  \
+  do {                                                                         \
+    (task).slave_stack_size = SLAVESTACKSIZE;                                  \
+    (task).stacks = cluster_slave_stacks;                                      \
+  } while (0)
+
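+/* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable,
+ * so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real
+ * silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real
+ * on-chip ranges may be touched directly by the FC. The previous
+ * single-threshold tests (`>= 0x10000000` before copying inputs,
+ * `< 0x10000000` before DMA-reading outputs) could not tell HyperRAM addresses
+ * (also ≥ 0x10000000) apart from on-chip memory. Note that IS_L2 is also true
+ * for L1 addresses, i.e. it means "any CPU-addressable on-chip pointer". */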
+#define IS_L1(ptr)                                                             \
+  ((uint32_t)(ptr) >= 0x10000000u && (uint32_t)(ptr) < 0x10040000u)
+#define IS_L2(ptr)                                                             \
+  (((uint32_t)(ptr) >= 0x1C000000u && (uint32_t)(ptr) < 0x1C200000u) ||        \
+   IS_L1(ptr))
+
 #ifdef POWER_MEASUREMENT
 unsigned int GPIOs = 89;
 #define WRITE_GPIO(x) pi_gpio_pin_write(GPIOs, x)
@@ -98,6 +128,17 @@ int main(void) {
 
   pi_cluster_conf_init(&conf);
   conf.id = 0;
+  /* L1-memory knob C — cluster-controller (CC / master) stack size. The CC
+   * stack is carved from the bottom of L1 (grows down toward the L1 base). The
+   * SDK default PI_CL_CC_STACK_SIZE (0x800 = 2 KB) can be too small for deep
+   * tiling call chains and overflows below the L1 base (silent clobber /
+   * invalid write). pi_cluster_task takes the size from conf.cc_stack_size (NOT
+   * the AutoTiler-only CONFIG_CL_MASTER_CORE_STACK_SIZE kconfig). Override per
+   * build with -DCC_STACK_SIZE=<bytes> (see CMakeLists). */
+#ifndef CC_STACK_SIZE
+#define CC_STACK_SIZE 8192
+#endif
+  conf.cc_stack_size = CC_STACK_SIZE;
   pi_open_from_conf(&cluster_dev, &conf);
   if (pi_cluster_open(&cluster_dev))
     return -1;
@@ -112,14 +153,18 @@ int main(void) {
   struct pi_cluster_task cluster_task;
 
   pi_cluster_task(&cluster_task, InitNetworkWrapper, NULL);
-  cluster_task.slave_stack_size = SLAVESTACKSIZE;
+  SET_SLAVE_STACK(cluster_task);
   pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
 
 #ifndef CI
   printf("Initialized\r\n");
 #endif
   for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
-    if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) {
+    /* L3 inputs are loaded at runtime from the readfs hex inside InitNetwork
+     * (testInputVector[buf] == NULL) and already live in HyperRAM, which the FC
+     * cannot memcpy into — skip them. Only on-chip (L1/L2) inputs are copied
+     * from the baked testInputVector. */
+    if (testInputVector[buf] != NULL && IS_L2(DeeployNetwork_inputs[buf])) {
       memcpy(DeeployNetwork_inputs[buf], testInputVector[buf],
              DeeployNetwork_inputs_bytes[buf]);
     }
@@ -130,7 +175,7 @@ int main(void) {
 #endif
 
   pi_cluster_task(&cluster_task, RunNetworkWrapper, NULL);
-  cluster_task.slave_stack_size = SLAVESTACKSIZE;
+  SET_SLAVE_STACK(cluster_task);
 
 #ifdef POWER_MEASUREMENT
   WRITE_GPIO(1);
@@ -156,7 +201,10 @@ int main(void) {
   for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
     tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE);
 
-    if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
+    /* L3 outputs live in HyperRAM (not CPU-addressable) — DMA them into an L2
+     * scratch before the compare. On-chip (L1/L2) outputs are compared in
+     * place. */
+    if (!IS_L2(DeeployNetwork_outputs[buf])) {
       compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]);
       ram_read(compbuf, DeeployNetwork_outputs[buf],
                DeeployNetwork_outputs_bytes[buf]);
@@ -174,7 +222,7 @@ int main(void) {
       float_compare_args.err_count = (int *)&float_error_count;
 
       pi_cluster_task(&cluster_task, CL_CompareFloat, &float_compare_args);
-      cluster_task.slave_stack_size = SLAVESTACKSIZE;
+      SET_SLAVE_STACK(cluster_task);
       pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
 
       tot_err += float_error_count;
@@ -194,7 +242,7 @@ int main(void) {
         }
       }
     }
-    if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
+    if (!IS_L2(DeeployNetwork_outputs[buf])) {
       pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]);
     }
   }

@@ -227,6 +227,28 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult:
     return test_result
 
 
+def _gap9_memcheck_gate(config: DeeployTestConfig) -> None:
+    """GAP9 build-time memory gate.
+
+    Run the L1/L2 budget + InitNetwork alloc-order validator after the build and
+    before the simulation, so memory over-subscription (which otherwise shows up
+    as a multi-minute GVSoC hang in os_evt_release or a wild-pointer crash) fails
+    fast with the exact knob to turn. GAP9-only; bypass with DEPLOY_SKIP_MEMCHECK=1.
+
+    The gate is a no-op when the platform is not "GAP9", when the bypass
+    variable is set to "1", or when ``gap9_memcheck.py`` is not present two
+    directories above this file.
+
+    Args:
+        config: Test configuration; ``platform``, ``build_dir``, ``gen_dir`` and
+            ``test_name`` are read.
+
+    Raises:
+        RuntimeError: If the validator exits with status 1. Any other non-zero
+            status is only logged as a warning, so a validator that could not
+            run does not block the simulation.
+    """
+    # Local import keeps this helper self-contained.
+    import sys
+
+    if config.platform != "GAP9" or os.environ.get("DEPLOY_SKIP_MEMCHECK") == "1":
+        return
+    script = Path(__file__).parent.parent / "gap9_memcheck.py"
+    if not script.exists():
+        return
+    # Launch the validator with the interpreter that runs this harness: a bare
+    # "python" may be absent, or may resolve to a different environment.
+    interpreter = sys.executable or "python"
+    # str() on every element: the directories may be Path objects, which
+    # ' '.join() below would reject.
+    cmd = [interpreter, str(script), str(config.build_dir), str(config.gen_dir)]
+    log.debug(f"[Execution] GAP9 memcheck: {' '.join(cmd)}")
+    result = subprocess.run(cmd, check = False)
+    if result.returncode == 1:
+        raise RuntimeError(f"GAP9 memory check failed for {config.test_name} (L1/L2 over-subscription "
+                           f"or InitNetwork alloc-order race - see output above). "
+                           f"Set DEPLOY_SKIP_MEMCHECK=1 to bypass.")
+    if result.returncode != 0:
+        # Not a memory verdict; surface it so a gate that never ran is visible.
+        log.warning(f"[Execution] GAP9 memcheck exited with status {result.returncode}; "
+                    f"memory gate skipped for {config.test_name}")
+
+
 def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: bool = False) -> TestResult:
     """
     Run a complete test: generate, configure, build, and simulate.
@@ -242,6 +264,9 @@ def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim:
     # Step 3: Build binary
     build_binary(config)
 
+    # Step 3b: GAP9 build-time memory gate (fast-fail before simulation)
+    _gap9_memcheck_gate(config)
+
     # Step 4: Run simulation
     result = run_simulation(config, skip = skipsim)