Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ def __init__(self, templateStr: str, bufferName: str):

_stackAllocateTemplate = partial(
_ArgStructAllocateTemplate,
templateStr = "${structDict.typeName} ${name} = (${structDict.typeName}) ${str(structDict)};")
# Declare the argument struct `static` (off-stack), then assign. The cluster
# fork runtime writes its descriptor near the top of the CC/master stack; a
# stack-local arg struct placed there can be clobbered before the forked
# cores read it (observed as a GAP9 cluster-fork crash). Static storage keeps
# it stable for the lifetime of the forked call.
templateStr = "static ${structDict.typeName} ${name}; ${name} = (${structDict.typeName}) ${str(structDict)};")


class ArgumentStructGeneration(CodeTransformationPass, IntrospectiveCodeTransformationMixIn):
Expand Down
10 changes: 7 additions & 3 deletions Deeploy/Targets/GAP9/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from Deeploy.DeeployTypes import CodeTransformation, NodeBinding
from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
from Deeploy.Targets.GAP9.DMA.L3Dma import gap9L3DmaHack
from Deeploy.Targets.GAP9.DMA.L3Dma import GAP9L3Dma, gap9L3DmaHack
from Deeploy.Targets.GAP9.DMA.MchanDma import GAP9MchanDma
# Import templates from PULPOpen and Generic
from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceMeanTemplate, \
Expand Down Expand Up @@ -57,7 +57,9 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
# SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
# async GAP9L3Dma for real L3<->L2 prefetch overlap.
PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand All @@ -76,7 +78,9 @@
MemoryManagementGeneration("L1"),
TilingVariableReplacement("L2"),
MemoryAwareFunctionCallClosure(writeback = False, generateStruct = True),
PULPL3Tiling("L3", "L2", gap9L3DmaHack), # Use GAP9-specific L3 DMA
# SB -> blocking gap9L3DmaHack (safe for strided 2D L3 transfers); DB ->
# async GAP9L3Dma for real L3<->L2 prefetch overlap.
PULPL3Tiling("L3", "L2", gap9L3DmaHack, dbDma = GAP9L3Dma()),
PULPProfileUntiled(),
ArgumentStructGeneration(),
L3MemoryAwareFunctionCallClosure(writeback = False),
Expand Down
3 changes: 2 additions & 1 deletion Deeploy/Targets/GAP9/DMA/L3Dma.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class GAP9L3DmaFuture(Future):
_waitTemplate = NodeTemplate("""
if (${name}.size != 0) {
pi_cl_ram_copy_wait(&${name});
${name}.size = 0;
}""")


Expand All @@ -29,7 +30,7 @@ class GAP9L3Dma(AsyncDma):
_transferTemplates = {
2:
NodeTemplate(
"pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
"pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
)
}
_waitingStrategy = PerTensorWaitingStrategy(GAP9L3DmaFuture)
Expand Down
13 changes: 11 additions & 2 deletions Deeploy/Targets/GAP9/DMA/MchanDma.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Dict, Tuple

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation, VariableBuffer
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DirectionWaitingStrategy, DmaDirection, Future
from Deeploy.TilingExtension.AsyncDma import AsyncDma, DmaDirection, Future, PerTensorWaitingStrategy


class MchanTransferFuture(Future):
Expand Down Expand Up @@ -36,7 +36,16 @@ class GAP9MchanDma(AsyncDma):
"{ mchan_transfer_t __mchan_tmp = { .cmd = ${cmd}, .size = ${size}, .loc = ${loc}, .ext = ${ext}, .ext_size_1d = ${size_1d}, .ext_stride_1d = ${stride_2d} }; mchan_transfer_push_2d(__mchan_tmp); }"
),
}
_waitingStrategy = DirectionWaitingStrategy(MchanTransferFuture, "transfer")
# PerTensor, NOT Direction: GAP9 mchan allocates a fresh channel on every
# descriptor enqueue (each mchan_transfer_push_* writes a new descriptor and
# the hardware advances to the next channel). DirectionWaitingStrategy shares
# ONE future (one mchan_transfer_get_id) across all same-direction tensors of
# a tile, so a tile with >1 input (e.g. weight + grad) emits one get_id but
# multiple pushes -> the 2nd+ transfers run on channels that are never waited
# nor freed -> mchan_transfer_wait() hangs forever. PerTensor gives each tensor
# its own get_id immediately before its push, matching mchan's
# 1 get_id : 1 push : 1 wait : 1 free contract.
_waitingStrategy = PerTensorWaitingStrategy(MchanTransferFuture)

def __init__(self, transferTemplates: Dict[int, NodeTemplate] = _transferTemplates) -> None:
super().__init__(transferTemplates)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#
# SPDX-License-Identifier: Apache-2.0

from typing import Tuple
from typing import Optional, Tuple

from Deeploy.DeeployTypes import CodeGenVerbosity, CodeTransformationPass, ExecutionBlock, NetworkContext, _NoVerbosity
from Deeploy.TilingExtension.AsyncDma import AsyncDma
Expand Down Expand Up @@ -30,11 +30,19 @@ class ProfilingPULPL3TilingGenerationDB(DoubleBufferingTilingCodeGeneration, Pro

class PULPL3Tiling(CodeTransformationPass):

def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma):
def __init__(self, externalMemory: str, localMemory: str, dma: AsyncDma, dbDma: Optional[AsyncDma] = None):
# SB and DB can use different DMA backends. Async DMA only ever helps DB
# (it overlaps the next-tile prefetch with compute); SB waits on each tile
# before computing, so async gives SB no benefit but all the risk (strided
# 2D L3 transfers can corrupt under deferred waits). Defaulting dbDma to dma
# keeps backward compatibility; pass an async dma as dbDma for real L3<->L2
# overlap on the DB path while keeping SB on the safe blocking dma.
if dbDma is None:
dbDma = dma
self.SB = PULPL3TilingGenerationSB(externalMemory, localMemory, dma)
self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dma)
self.DB = PULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)
self.profilingSB = ProfilingPULPL3TilingGenerationSB(externalMemory, localMemory, dma)
self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dma)
self.profilingDB = ProfilingPULPL3TilingGenerationDB(externalMemory, localMemory, dbDma)

def apply(self,
ctxt: NetworkContext,
Expand Down
6 changes: 6 additions & 0 deletions DeeployTest/Platforms/GAP9/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ if(POWER_MEASUREMENT)
target_compile_definitions(${ProjectId} PRIVATE POWER_MEASUREMENT)
endif()

# L1-memory knob — size the cluster-controller (CC / master) stack from the build.
# Example (CI / command line): cmake ... -DCC_STACK_SIZE=8192 (see deeploytest.c).
if(CC_STACK_SIZE)
target_compile_definitions(${ProjectId} PRIVATE CC_STACK_SIZE=${CC_STACK_SIZE})
endif()

# RW: Waive sign comparison warnings from pulp_nn_utils.h
target_compile_options(network PRIVATE
-Wno-sign-compare
Expand Down
4 changes: 4 additions & 0 deletions DeeployTest/Platforms/GAP9/sdk_gvsoc.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,8 @@ CONFIG_PLATFORM_GVSOC=y
# GAP9 cluster stack size configuration
# Uncomment and adjust these values if you need to modify stack sizes:
# CONFIG_CL_MASTER_CORE_STACK_SIZE=14000
# L1-memory knob B — shrink the SDK's per-PE (slave) cluster stacks to free L1
# TCDM for the tile arena. Uncomment and lower if kernel stack use is small.
# (deeploytest.c instead hands its own L2 slave-stack buffer via SET_SLAVE_STACK,
# which bypasses this kconfig; use one knob or the other.)
# CONFIG_CL_SLAVE_CORE_STACK_SIZE=1000
60 changes: 54 additions & 6 deletions DeeployTest/Platforms/GAP9/src/deeploytest.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,36 @@
// RW: Remove MAINSTACKSIZE because gap9-sdk does not use it
#define SLAVESTACKSIZE 3800

/* L1-memory knob A — place the cluster slave (PE) stacks in L2 instead of L1.
* The GAP9 SDK only pi_cl_l1_malloc's the slave stacks when
* task->stacks == NULL (__pi_cluster_task_set_stack in the SDK cluster
* driver); handing it our own buffer makes it skip that L1 allocation, freeing
* ~30 KB of L1 (8 cores x SLAVESTACKSIZE) for the Deeploy tile arena. The
* buffer below is sized for CLUSTER_MAX_CORES (9) stacks of SLAVESTACKSIZE
* bytes each; being a static array it lands in .bss (L2). Use
* SET_SLAVE_STACK(task) instead of setting slave_stack_size directly.
* Knob B (alternative): shrink the SDK's own L1 slave stacks via
* CONFIG_CL_SLAVE_CORE_STACK_SIZE in the sdk .config (see sdk_gvsoc.config). */
#define CLUSTER_MAX_CORES 9
static uint8_t cluster_slave_stacks[SLAVESTACKSIZE * CLUSTER_MAX_CORES]
__attribute__((aligned(16)));
#define SET_SLAVE_STACK(task) \
do { \
(task).slave_stack_size = SLAVESTACKSIZE; \
(task).stacks = cluster_slave_stacks; \
} while (0)

/* On-chip memory windows. HyperRAM/L3 (cl_ram_malloc) is NOT CPU-addressable,
* so a raw memcpy / CPU-deref of an L3 pointer faults ("Invalid fetch") on real
* silicon — GVSoC models HyperRAM as flat RAM and hides the bug. Only the real
* on-chip ranges may be touched directly by the FC. The previous `>=
* 0x10000000` / `< 0x10000000` tests wrongly matched HyperRAM (≥ 0x10000000)
* too. */
/* True when ptr is in [0x10000000, 0x10040000) — the window this file treats
* as L1. */
#define IS_L1(ptr) \
((uint32_t)(ptr) >= 0x10000000u && (uint32_t)(ptr) < 0x10040000u)
/* True when ptr is in [0x1C000000, 0x1C200000) OR satisfies IS_L1(ptr).
* Despite the name this is an "on-chip (L1 or L2)" test: this file copies
* directly to/from buffers for which it holds, and stages buffers for which
* !IS_L2(ptr) through ram_read() into an L2 scratch first. */
#define IS_L2(ptr) \
(((uint32_t)(ptr) >= 0x1C000000u && (uint32_t)(ptr) < 0x1C200000u) || \
IS_L1(ptr))

#ifdef POWER_MEASUREMENT
unsigned int GPIOs = 89;
#define WRITE_GPIO(x) pi_gpio_pin_write(GPIOs, x)
Expand Down Expand Up @@ -98,6 +128,17 @@ int main(void) {

pi_cluster_conf_init(&conf);
conf.id = 0;
/* L1-memory knob C — cluster-controller (CC / master) stack size. The CC
* stack is carved from the bottom of L1 (grows down toward the L1 base). The
* SDK default PI_CL_CC_STACK_SIZE (0x800 = 2 KB) can be too small for deep
* tiling call chains and overflows below the L1 base (silent clobber /
* invalid write). pi_cluster_task takes the size from conf.cc_stack_size (NOT
* the AutoTiler-only CONFIG_CL_MASTER_CORE_STACK_SIZE kconfig). Override per
* build with -DCC_STACK_SIZE=<bytes> (see CMakeLists). */
#ifndef CC_STACK_SIZE
#define CC_STACK_SIZE 8192
#endif
conf.cc_stack_size = CC_STACK_SIZE;
pi_open_from_conf(&cluster_dev, &conf);
if (pi_cluster_open(&cluster_dev))
return -1;
Expand All @@ -112,14 +153,18 @@ int main(void) {
struct pi_cluster_task cluster_task;

pi_cluster_task(&cluster_task, InitNetworkWrapper, NULL);
cluster_task.slave_stack_size = SLAVESTACKSIZE;
SET_SLAVE_STACK(cluster_task);
pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);

#ifndef CI
printf("Initialized\r\n");
#endif
for (uint32_t buf = 0; buf < DeeployNetwork_num_inputs; buf++) {
if ((uint32_t)DeeployNetwork_inputs[buf] >= 0x10000000) {
/* L3 inputs are loaded at runtime from the readfs hex inside InitNetwork
* (testInputVector[buf] == NULL) and already live in HyperRAM, which the FC
* cannot memcpy into — skip them. Only on-chip (L1/L2) inputs are copied
* from the baked testInputVector. */
if (testInputVector[buf] != NULL && IS_L2(DeeployNetwork_inputs[buf])) {
memcpy(DeeployNetwork_inputs[buf], testInputVector[buf],
DeeployNetwork_inputs_bytes[buf]);
}
Expand All @@ -130,7 +175,7 @@ int main(void) {
#endif

pi_cluster_task(&cluster_task, RunNetworkWrapper, NULL);
cluster_task.slave_stack_size = SLAVESTACKSIZE;
SET_SLAVE_STACK(cluster_task);

#ifdef POWER_MEASUREMENT
WRITE_GPIO(1);
Expand All @@ -156,7 +201,10 @@ int main(void) {
for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
tot_tested += DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE);

if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
/* L3 outputs live in HyperRAM (not CPU-addressable) — DMA them into an L2
* scratch before the compare. On-chip (L1/L2) outputs are compared in
* place. */
if (!IS_L2(DeeployNetwork_outputs[buf])) {
compbuf = pi_l2_malloc(DeeployNetwork_outputs_bytes[buf]);
ram_read(compbuf, DeeployNetwork_outputs[buf],
DeeployNetwork_outputs_bytes[buf]);
Expand All @@ -174,7 +222,7 @@ int main(void) {
float_compare_args.err_count = (int *)&float_error_count;

pi_cluster_task(&cluster_task, CL_CompareFloat, &float_compare_args);
cluster_task.slave_stack_size = SLAVESTACKSIZE;
SET_SLAVE_STACK(cluster_task);
pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);

tot_err += float_error_count;
Expand All @@ -194,7 +242,7 @@ int main(void) {
}
}
}
if ((uint32_t)DeeployNetwork_outputs[buf] < 0x10000000) {
if (!IS_L2(DeeployNetwork_outputs[buf])) {
pi_l2_free(compbuf, DeeployNetwork_outputs_bytes[buf]);
}
}
Expand Down
25 changes: 25 additions & 0 deletions DeeployTest/testUtils/core/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,28 @@ def run_simulation(config: DeeployTestConfig, skip: bool = False) -> TestResult:
return test_result


def _gap9_memcheck_gate(config: DeeployTestConfig) -> None:
    """Run the GAP9 build-time memory gate between build and simulation.

    Runs the L1/L2 budget + InitNetwork alloc-order validator
    (``gap9_memcheck.py``, looked up one directory above this module's
    directory) so memory over-subscription (which otherwise shows up as a
    multi-minute GVSoC hang in os_evt_release or a wild-pointer crash) fails
    fast with the exact knob to turn.

    The gate is a no-op when ``config.platform`` is not ``"GAP9"``, when the
    environment variable ``DEPLOY_SKIP_MEMCHECK`` is ``"1"``, or when the
    validator script does not exist.

    Args:
        config: Test configuration; ``platform``, ``build_dir``, ``gen_dir``
            and ``test_name`` are read.

    Raises:
        RuntimeError: If the validator exits with status 1.
    """
    # Function-scope import: only needed to locate the running interpreter.
    import sys

    if config.platform != "GAP9" or os.environ.get("DEPLOY_SKIP_MEMCHECK") == "1":
        return
    script = Path(__file__).parent.parent / "gap9_memcheck.py"
    if not script.exists():
        return
    # Launch the validator with the interpreter that is running the tests: a
    # bare "python" may be absent from PATH or point outside the active venv.
    # Directory arguments are coerced to str so Path values are accepted too.
    interpreter = sys.executable or "python"
    cmd = [interpreter, str(script), str(config.build_dir), str(config.gen_dir)]
    log.debug(f"[Execution] GAP9 memcheck: {' '.join(cmd)}")
    result = subprocess.run(cmd, check = False)
    if result.returncode == 1:
        raise RuntimeError(f"GAP9 memory check failed for {config.test_name} (L1/L2 over-subscription "
                           f"or InitNetwork alloc-order race - see output above). "
                           f"Set DEPLOY_SKIP_MEMCHECK=1 to bypass.")
    if result.returncode != 0:
        # Only status 1 is treated as a gate failure; surface anything else
        # instead of dropping it silently, but keep the gate non-fatal.
        log.warning(f"[Execution] GAP9 memcheck exited with unexpected status {result.returncode}; "
                    f"continuing without a memory verdict")


def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim: bool = False) -> TestResult:
"""
Run a complete test: generate, configure, build, and simulate.
Expand All @@ -242,6 +264,9 @@ def run_complete_test(config: DeeployTestConfig, skipgen: bool = False, skipsim:
# Step 3: Build binary
build_binary(config)

# Step 3b: GAP9 build-time memory gate (fast-fail before simulation)
_gap9_memcheck_gate(config)

# Step 4: Run simulation
result = run_simulation(config, skip = skipsim)

Expand Down
Loading
Loading