diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 600c94a5b..7a4c116e4 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -14,7 +14,10 @@ jobs: run-test: uses: ./.github/workflows/test_workflow.yml with: - configuration: '["asan"]' # Ignoring tsan for now '["asan", "tsan"]' + configuration: '["asan"]' + # C++ gtests (ASan + TSan) run on every PR via native-sanitizer-tests in ci.yml. + # Skip them here so the nightly focuses on Java functional tests under ASan. + skip_gtest: true report-failures: runs-on: ubuntu-latest needs: run-test diff --git a/.github/workflows/test_workflow.yml b/.github/workflows/test_workflow.yml index 62287dbd6..ca403aead 100644 --- a/.github/workflows/test_workflow.yml +++ b/.github/workflows/test_workflow.yml @@ -6,6 +6,11 @@ on: configuration: required: true type: string + skip_gtest: + description: "Skip C++ gtest execution (use when gtests run in a separate job)" + required: false + type: boolean + default: false permissions: contents: read @@ -111,7 +116,7 @@ jobs: for attempt in $(seq 1 $MAX_ATTEMPTS); do mkdir -p build/logs - ./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \ + ./gradlew -PCI -PkeepJFRs ${{ inputs.skip_gtest == true && '-Pskip-gtest' || '' }} :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \ | tee -a build/test-raw.log \ | python3 -u .github/scripts/filter_gradle_log.py EXIT_CODE=${PIPESTATUS[0]} @@ -399,7 +404,7 @@ jobs: for attempt in $(seq 1 $MAX_ATTEMPTS); do mkdir -p build/logs - ./gradlew -PCI -PkeepJFRs :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \ + ./gradlew -PCI -PkeepJFRs ${{ inputs.skip_gtest == true && '-Pskip-gtest' || '' }} :ddprof-test:test${{ matrix.config }} --no-daemon --parallel --build-cache --no-watch-fs 2>&1 \ | tee -a build/test-raw.log \ | python3 -u 
.github/scripts/filter_gradle_log.py EXIT_CODE=${PIPESTATUS[0]} diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 60dca3dfe..7ae3d7a66 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -21,6 +21,7 @@ stages: - images - generate-signing-key - prepare + - sanitizer - build - stresstest - deploy @@ -160,3 +161,4 @@ include: - local: .gitlab/benchmarks/.gitlab-ci.yml - local: .gitlab/reliability/.gitlab-ci.yml - local: .gitlab/dd-trace-integration/.gitlab-ci.yml + - local: .gitlab/sanitizer-tests/.gitlab-ci.yml diff --git a/.gitlab/build-deploy/.gitlab-ci.yml b/.gitlab/build-deploy/.gitlab-ci.yml index aea48c652..758f07d1e 100644 --- a/.gitlab/build-deploy/.gitlab-ci.yml +++ b/.gitlab/build-deploy/.gitlab-ci.yml @@ -207,6 +207,16 @@ build-artifact: artifacts: true - job: build:arm64-musl artifacts: true + - job: gtest-asan-amd64 + artifacts: false + - job: gtest-tsan-amd64 + artifacts: false + optional: true + - job: gtest-asan-arm64 + artifacts: false + - job: gtest-tsan-arm64 + artifacts: false + optional: true when: on_success tags: [ "arch:amd64" ] image: ${BUILD_IMAGE_X64} diff --git a/.gitlab/sanitizer-tests/.gitlab-ci.yml b/.gitlab/sanitizer-tests/.gitlab-ci.yml new file mode 100644 index 000000000..2c63425b2 --- /dev/null +++ b/.gitlab/sanitizer-tests/.gitlab-ci.yml @@ -0,0 +1,115 @@ +# C++ unit tests under ASan and TSan. +# +# These run on every branch push (not MR pipelines — GitHub Actions handles those). +# +# Strategy: use Gradle only for compile+link (buildGtest{Config}), then run +# each binary directly from the shell. This bypasses Gradle's daemon I/O +# which swallows child process output when fd 1/2 are not the terminal. 
+ +.sanitizer_job: + stage: sanitizer + extends: .cache-config + needs: [] + timeout: 30m + variables: + GRADLE_USER_HOME: .gradle + rules: + - if: '$JDK_VERSION != null || $DEBUG_LEVEL != null || $HASH != null || $DOWNSTREAM != null' + when: never + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + when: never + - when: on_success + interruptible: true + before_script: + - apt-get update -qq + - apt-get install -y -qq cmake libgtest-dev libgmock-dev binutils libc6-dbg llvm + script: + - ./gradlew :ddprof-lib:buildGtest${SANITIZER_CONFIG} --no-daemon --parallel --build-cache + - | + find ddprof-lib/build/bin/gtest -mindepth 2 -maxdepth 2 -type f -executable \ + | grep "/${SANITIZER_LC}_" \ + | sort \ + | while read binary; do + echo "" + echo "=== $(basename $binary) ===" + "$binary" + rc=$? + if [ $rc -ne 0 ]; then + echo "FAILED: $(basename $binary) exited $rc" + exit $rc + fi + done + artifacts: + when: always + paths: + - ddprof-lib/build/bin/gtest/${SANITIZER_LC}*/ + expire_in: 1 day + +gtest-asan-amd64: + extends: .sanitizer_job + allow_failure: true + tags: [ "arch:amd64" ] + image: $BUILD_IMAGE_X64 + variables: + SANITIZER_CONFIG: Asan + SANITIZER_LC: asan + +gtest-tsan-amd64: + extends: .sanitizer_job + allow_failure: true + # docker-in-docker:amd64 = Kata Containers (kata-qemu micro VMs). + # Kata maps host-guest communication structures at fixed high addresses + # that land in TSan's shadow region regardless of LLVM version or sysctl. + # TSan on amd64 requires a non-Kata runner (EC2 or bare metal). + # Kept allow_failure so it runs and provides coverage if the environment is fixed. 
+ tags: [ "docker-in-docker:amd64" ] + image: $BUILD_IMAGE_X64 + variables: + SANITIZER_CONFIG: Tsan + SANITIZER_LC: tsan + script: + - ./gradlew :ddprof-lib:buildGtest${SANITIZER_CONFIG} --no-daemon --parallel --build-cache + - | + find ddprof-lib/build/bin/gtest -mindepth 2 -maxdepth 2 -type f -executable \ + | grep "/${SANITIZER_LC}_" | sort | while read binary; do + echo "=== $(basename $binary) ===" + GTEST_DEATH_TEST_STYLE=threadsafe "$binary" + rc=$? + if [ $rc -ne 0 ]; then echo "FAILED: $(basename $binary) exited $rc"; exit $rc; fi # if/fi (not "[ ] && {}"): loop status stays 0 when the last binary passes + done + +gtest-asan-arm64: + extends: .sanitizer_job + allow_failure: true + tags: [ "arch:arm64" ] + image: $BUILD_IMAGE_ARM64 + variables: + SANITIZER_CONFIG: Asan + SANITIZER_LC: asan + +gtest-tsan-arm64: + extends: .sanitizer_job + allow_failure: true + # docker-in-docker:arm64 = EC2 VM. sysctl works directly. + # vm.mmap_rnd_bits=28 is sufficient — TSan's LLVM re-exec handles the rare + # case where a library lands in the shadow region by re-running the process + # via personality(ADDR_NO_RANDOMIZE). + # Do NOT set kernel.randomize_va_space=0: with ASLR fully off, ld-linux-aarch64.so + # loads at its fixed default address (0x002000000000) which is exactly TSan's + # 39-bit shadow start — guaranteed conflict every time. + tags: [ "docker-in-docker:arm64" ] + image: $BUILD_IMAGE_ARM64 + variables: + SANITIZER_CONFIG: Tsan + SANITIZER_LC: tsan + script: + - ./gradlew :ddprof-lib:buildGtest${SANITIZER_CONFIG} --no-daemon --parallel --build-cache + - | + sysctl -w vm.mmap_rnd_bits=28 2>/dev/null || true + find ddprof-lib/build/bin/gtest -mindepth 2 -maxdepth 2 -type f -executable \ + | grep "/${SANITIZER_LC}_" | sort | while read binary; do + echo "=== $(basename $binary) ===" + GTEST_DEATH_TEST_STYLE=threadsafe "$binary" + rc=$? 
+ if [ $rc -ne 0 ]; then echo "FAILED: $(basename $binary) exited $rc"; exit $rc; fi # if/fi (not "[ ] && {}"): loop status stays 0 when the last binary passes + done diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt index 0f3e9dd10..275e99321 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/config/ConfigurationPresets.kt @@ -187,17 +187,23 @@ object ConfigurationPresets { config.compilerArgs.set(asanCompilerArgs + commonLinuxCompilerArgs(version)) val libasan = PlatformUtils.locateLibasan(compiler) + // Link against the sanitizer runtime that matches the compiler: + // - clang: locateLibasan returns libclang_rt.asan-.so, which + // includes UBSan symbols; -lclang_rt.asan- satisfies -z defs + // for both __asan_* and __ubsan_* and matches the runtime that + // -fsanitize=address links into executables — one runtime, no conflict. + // - gcc: locateLibasan returns libasan.so; -lasan + -lubsan as before. 
val asanLinkerArgs = if (libasan != null) { - listOf( - "-L${File(libasan).parent}", - "-lasan", - "-lubsan", - "-fsanitize=address", - "-fsanitize=undefined", - "-fno-omit-frame-pointer" - ) + val asanLibDir = File(libasan).parent + val asanLibName = File(libasan).nameWithoutExtension.removePrefix("lib") + val ubsanLibs = if (asanLibName.startsWith("clang_rt")) emptyList() + else listOf("-lubsan") + listOf("-L$asanLibDir", "-l$asanLibName", + "-Wl,-rpath,$asanLibDir") + + ubsanLibs + + listOf("-fsanitize=address", "-fsanitize=undefined", "-fno-omit-frame-pointer") } else { - emptyList() + listOf("-fsanitize=address", "-fsanitize=undefined", "-fno-omit-frame-pointer") } config.linkerArgs.set(commonLinuxLinkerArgs() + asanLinkerArgs) @@ -205,8 +211,8 @@ object ConfigurationPresets { if (libasan != null) { config.testEnvironment.apply { put("LD_PRELOAD", libasan) - put("ASAN_OPTIONS", "allocator_may_return_null=1:unwind_abort_on_malloc=1:use_sigaltstack=0:detect_stack_use_after_return=0:handle_segv=0:halt_on_error=0:abort_on_error=0:print_stacktrace=1:symbolize=1:log_path=/tmp/asan_%p.log:suppressions=$rootDir/gradle/sanitizers/asan.supp") - put("UBSAN_OPTIONS", "halt_on_error=0:abort_on_error=0:print_stacktrace=1:log_path=/tmp/ubsan_%p.log:suppressions=$rootDir/gradle/sanitizers/ubsan.supp") + put("ASAN_OPTIONS", "allocator_may_return_null=1:unwind_abort_on_malloc=1:use_sigaltstack=0:detect_stack_use_after_return=0:handle_segv=0:halt_on_error=0:abort_on_error=0:print_stacktrace=1:symbolize=1:suppressions=$rootDir/gradle/sanitizers/asan.supp") + put("UBSAN_OPTIONS", "halt_on_error=0:abort_on_error=0:print_stacktrace=1:suppressions=$rootDir/gradle/sanitizers/ubsan.supp") put("LSAN_OPTIONS", "detect_leaks=0") } } @@ -260,7 +266,7 @@ object ConfigurationPresets { if (libtsan != null) { config.testEnvironment.apply { put("LD_PRELOAD", libtsan) - put("TSAN_OPTIONS", "suppressions=$rootDir/gradle/sanitizers/tsan.supp:log_path=/tmp/tsan_%p.log") + put("TSAN_OPTIONS", 
"suppressions=$rootDir/gradle/sanitizers/tsan.supp") } } } diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt index 793ff75b1..d2196cf8d 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestPlugin.kt @@ -236,7 +236,7 @@ class GtestPlugin : Plugin { } testDir.listFiles()?.filter { it.name.endsWith(".cpp") }?.forEach { testFile -> - val executeTask = GtestTaskBuilder(project, extension, config) + val taskBundle = GtestTaskBuilder(project, extension, config) .forTest(testFile) .withCompiler(compiler) .withIncludes(includeFiles) @@ -244,8 +244,8 @@ class GtestPlugin : Plugin { .onlyIfGtest(hasGtest) .build() - gtestConfigTask.configure { dependsOn(executeTask) } - gtestAll.configure { dependsOn(executeTask) } + gtestConfigTask.configure { dependsOn(taskBundle) } + gtestAll.configure { dependsOn(taskBundle) } // buildGtest depends on the link task, not the run task buildGtestConfigTask.configure { dependsOn("linkGtest${config.capitalizedName()}_${testFile.nameWithoutExtension}") diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt index a56de4ae7..064ed6ba0 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/gtest/GtestTaskBuilder.kt @@ -76,7 +76,7 @@ class GtestTaskBuilder( /** * Provide the shared library compile task whose objects are linked into - * every test binary. Allows the library sources to be compiled once + * every test binary. Allows the 59 library sources to be compiled once * instead of once per test file. 
*/ fun withSharedLibObjects(task: TaskProvider): GtestTaskBuilder { @@ -112,8 +112,8 @@ class GtestTaskBuilder( this.compiler.set(this@GtestTaskBuilder.compiler) this.compilerArgs.set(compilerArgs) - // When a shared library compile task is provided, library sources - // are compiled once there. Only compile the test file itself here. + // When a shared library compile task is provided, library sources are + // compiled once there. Only compile the test file itself here. if (sharedLibCompileTask != null) { sources.from(testFile) } else { @@ -128,7 +128,14 @@ class GtestTaskBuilder( } private fun buildLinkTask(compileTask: TaskProvider): TaskProvider { - val linkerArgs = config.linkerArgs.get() + // For executables, clang's -fsanitize=address statically embeds the full + // ASan runtime (--whole-archive libclang_rt.asan*.a). Adding an explicit + // -lclang_rt.asan or -lasan on top produces a second dynamic NEEDED entry, + // which triggers "incompatible ASan runtimes" at startup (two __asan_init + // calls). Strip the explicit sanitizer -l/-L/-rpath flags here so the + // executable relies solely on clang's automatic static embedding. + val sanitizerLibPattern = Regex("^(-lasan|-lubsan|-lclang_rt\\.asan.*|-lclang_rt\\.ubsan.*|-L.*/clang.*/|-Wl,-rpath,.*/clang.*/)") + val linkerArgs = config.linkerArgs.get().filter { !sanitizerLibPattern.containsMatchIn(it) } val objDir = project.file("${project.layout.buildDirectory.get()}/obj/gtest/${config.name}/$testName") val binary = project.file("${project.layout.buildDirectory.get()}/bin/gtest/${config.name}_$testName/$testName") @@ -191,6 +198,21 @@ class GtestTaskBuilder( inputs.files(binary) + // Gradle's default Exec task buffers child output and discards it on + // failure. /dev/std* bypass the logging infrastructure and stream + // bytes directly to fd 1/2 of the Gradle JVM so sanitizer reports + // are always visible in CI. 
+ if (PlatformUtils.currentPlatform == Platform.LINUX) { + val devStdout = java.io.FileOutputStream("/dev/stdout") + val devStderr = java.io.FileOutputStream("/dev/stderr") + standardOutput = devStdout + errorOutput = devStderr + doLast { + devStdout.flush(); devStdout.close() + devStderr.flush(); devStderr.close() + } + } + if (extension.alwaysRun.get()) { outputs.upToDateWhen { false } } diff --git a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt index 919d4fbf8..86a187893 100644 --- a/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt +++ b/build-logic/conventions/src/main/kotlin/com/datadoghq/native/util/PlatformUtils.kt @@ -125,7 +125,25 @@ object PlatformUtils { return null } - fun locateLibasan(compiler: String = "gcc"): String? = locateLibrary("libasan", compiler) + fun locateLibasan(compiler: String = "gcc"): String? { + if (currentPlatform != Platform.LINUX) return null + // For clang, prefer the architecture-specific clang_rt.asan library over + // GCC's libasan. Using GCC's runtime alongside clang's libclang_rt.asan + // (which -fsanitize=address links for executables) causes "incompatible + // ASan runtimes" at startup. The clang runtime also includes UBSan symbols, + // so no separate -lubsan is needed. + if (compiler.contains("clang")) { + val archSuffix = when (currentArchitecture) { + Architecture.X64 -> "x86_64" + Architecture.ARM64 -> "aarch64" + Architecture.X86 -> "i386" + Architecture.ARM -> "arm" + } + val clangAsan = locateLibrary("libclang_rt.asan-$archSuffix", compiler) + if (clangAsan != null) return clangAsan + } + return locateLibrary("libasan", compiler) + } fun locateLibtsan(compiler: String = "gcc"): String? 
= locateLibrary("libtsan", compiler) diff --git a/ddprof-lib/src/main/cpp/gtest_crash_handler.h b/ddprof-lib/src/main/cpp/gtest_crash_handler.h index 6f75343ce..7afa6c5bd 100644 --- a/ddprof-lib/src/main/cpp/gtest_crash_handler.h +++ b/ddprof-lib/src/main/cpp/gtest_crash_handler.h @@ -118,29 +118,35 @@ void specificCrashHandler(int sig, siginfo_t *info, void *context) { gtestCrashHandler(sig, info, context, TestName); } -// Install crash handler for debugging +// Install crash handler for debugging. +// No-op under TSan: TSan installs its own SIGSEGV/SIGBUS/SIGABRT interceptors +// and overriding them causes TSan to crash before it can write its report. template void installGtestCrashHandler() { +#if !defined(__SANITIZE_THREAD__) struct sigaction sa; sa.sa_flags = SA_SIGINFO; // Get detailed info, keep handler active sigemptyset(&sa.sa_mask); sa.sa_sigaction = specificCrashHandler; - + // Install for various crash signals sigaction(SIGSEGV, &sa, nullptr); sigaction(SIGBUS, &sa, nullptr); sigaction(SIGABRT, &sa, nullptr); sigaction(SIGFPE, &sa, nullptr); sigaction(SIGILL, &sa, nullptr); +#endif } -// Restore default signal handlers +// Restore default signal handlers. inline void restoreDefaultSignalHandlers() { +#if !defined(__SANITIZE_THREAD__) signal(SIGSEGV, SIG_DFL); signal(SIGBUS, SIG_DFL); signal(SIGABRT, SIG_DFL); signal(SIGFPE, SIG_DFL); signal(SIGILL, SIG_DFL); +#endif } #endif // GTEST_CRASH_HANDLER_H \ No newline at end of file diff --git a/ddprof-lib/src/test/cpp/ddprof_ut.cpp b/ddprof-lib/src/test/cpp/ddprof_ut.cpp index fdb3bfae0..3a5db92e5 100644 --- a/ddprof-lib/src/test/cpp/ddprof_ut.cpp +++ b/ddprof-lib/src/test/cpp/ddprof_ut.cpp @@ -373,6 +373,9 @@ static DdprofGlobalSetup ddprof_global_setup; // This test exercises the exact race window by calling clearCurrentThreadTLS() // inside a live CriticalSection scope, then verifying the flag is cleared. // Without the fix tryEnterCriticalSection() returns false (exit 5). 
+ // fork() is unsupported under TSan: the child inherits shadow memory in an + // inconsistent state and crashes before any TSan report can be written. + #if !defined(__SANITIZE_THREAD__) TEST(ProfiledThreadTeardown, CriticalSectionExitsEvenAfterTLSCleared) { pid_t pid = fork(); ASSERT_NE(-1, pid); @@ -410,6 +413,7 @@ static DdprofGlobalSetup ddprof_global_setup; ASSERT_TRUE(WIFEXITED(status)) << "child crashed (signal " << WTERMSIG(status) << ")"; ASSERT_EQ(0, WEXITSTATUS(status)) << "child exited with code " << WEXITSTATUS(status); } + #endif // !__SANITIZE_THREAD__ int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); diff --git a/doc/README.md b/doc/README.md index 8f4661d1f..8400a1c6f 100644 --- a/doc/README.md +++ b/doc/README.md @@ -24,6 +24,7 @@ All documentation files use **PascalCase** naming (e.g., `BuildSystemGuide.md`). - [BuildSystemGuide](build/BuildSystemGuide.md) - Comprehensive build system documentation - [GradleTasks](build/GradleTasks.md) - Available Gradle tasks reference - [NativeBuildPlugin](build/NativeBuildPlugin.md) - Native C++ compilation plugin +- [TestingGuide](build/TestingGuide.md) - Test strategy: tiers, sanitizers, CI layout, and local workflows ### Reference - [ProfilerMemoryRequirements](reference/ProfilerMemoryRequirements.md) - Memory usage and limits diff --git a/doc/build/TestingGuide.md b/doc/build/TestingGuide.md new file mode 100644 index 000000000..a7073598c --- /dev/null +++ b/doc/build/TestingGuide.md @@ -0,0 +1,303 @@ +# Testing Guide + +This document describes the complete test strategy for the java-profiler project: +what runs where, what each tier is designed to catch, and how to run each tier +locally. 
+ +## Overview + +Tests are split across four tiers based on what they detect and what infrastructure +they require: + +| Tier | System | When | Sanitizers | Purpose | +|------|--------|------|-----------|---------| +| **C++ unit tests** | GitLab | Every branch push | ASan + TSan | Data races and memory errors in native internals | +| **Java functional tests** | GitHub Actions | Nightly | ASan | Correctness + memory errors in JVMTI paths | +| **dd-trace integration** | GitLab | Every branch push | None | Compatibility with the tracer agent | +| **Chaos / reliability** | GitLab | Nightly scheduled | None | Long-duration stability and probabilistic crash detection | + +--- + +## Tier 1 — C++ Unit Tests (Every Branch Push) + +**Pipeline:** `.gitlab/sanitizer-tests/.gitlab-ci.yml`, `sanitizer` stage + +**Jobs:** `gtest-asan-amd64`, `gtest-tsan-amd64`, `gtest-asan-arm64`, `gtest-tsan-arm64` + +**Gradle tasks:** `:ddprof-lib:gtestAsan`, `:ddprof-lib:gtestTsan` + +**Runs on:** amd64 and aarch64, using the standard `BUILD_IMAGE_X64` / `BUILD_IMAGE_ARM64` +images, on every branch push (same trigger as the dd-trace integration tests) + +The C++ gtest suite in `ddprof-lib/src/test/cpp/` exercises profiler internals +directly, without a JVM. This makes both ASan and TSan effective: + +- **ASan** (`-fsanitize=address,undefined`) detects buffer overflows, use-after-free, + and pointer arithmetic errors in the signal handler path and native data structures. +- **TSan** (`-fsanitize=thread`) detects data races between signal handlers, profiling + threads, and class-unload callbacks — exactly the class of bug most likely to + produce intermittent JVM crashes in the field. + +TSan is only viable at this tier. The JVM binary contains intentional unsynchronized +patterns (lock-free GC internals, biased locking) that produce too many false +positives in the Java functional tier. 
`gradle/sanitizers/tsan.supp` captures +suppressions from earlier attempts; it exists for the benefit of any future JVM-level +TSan runs, but is not applied here since these tests never load a JVM. + +**Why GitLab and not GitHub Actions:** TSan requires `vm.mmap_rnd_bits ≤ 28` and its +re-exec fallback (`personality(ADDR_NO_RANDOMIZE)`) to handle ASLR conflicts. GitHub +Actions' ubuntu-latest runners have `vm.mmap_rnd_bits=32` and their seccomp profile +blocks the `personality` syscall. The Datadog GitLab runners have stable kernel +settings tuned for benchmark workloads. + +**Key test files:** + +| File | Covers | +|------|--------| +| `dictionary_concurrent_ut.cpp` | Concurrent read/write/clear of `Dictionary` — the `std::_Rb_tree_increment` race path | +| `thread_teardown_safety_ut.cpp` | Signal delivery during `ProfiledThread` TLS clear and delete | +| `profiler_null_calltrace_buffer_ut.cpp` | Null calltrace buffer guard in the JVMTI sample path (PROF-14679) | +| `stress_callTraceStorage.cpp` | `CallTraceStorage` under concurrent write pressure | +| `test_callTraceStorage.cpp` | `CallTraceStorage` buffer swap correctness | +| `sigaction_interception_ut.cpp` | `sigaction` interception correctness and re-entrancy | +| `signalOrigin_ut.cpp` | Signal origin detection and classification | +| `spinlock_bounded_ut.cpp` | `SpinLock` / `BoundedOptionalSharedLockGuard` under contention | + +**Local run:** +```bash +# Individual sanitizer configs +./gradlew :ddprof-lib:gtestAsan +./gradlew :ddprof-lib:gtestTsan + +# All configs (debug, release, asan, tsan) +./gradlew :ddprof-lib:gtest + +# Specific test +./gradlew :ddprof-lib:gtestAsan_dictionary_concurrent_ut +``` + +Prerequisites on Ubuntu: +```bash +sudo apt-get install -y libgtest-dev libgmock-dev cmake g++ clang +``` + +The sanitizer runtimes are bundled with `g++` and `clang` on modern Ubuntu — no +separate `libasan` or `libtsan` package is needed. 
+ +On TSan failure the report is written to stderr and appears directly in the GitLab +job log. + +--- + +## Tier 2 — Java Functional Tests (Nightly) + +**Workflow:** `.github/workflows/nightly.yml` → `test_workflow.yml` + +**Gradle task:** `:ddprof-test:testAsan -Pskip-gtest` + +**Runs on:** amd64 and aarch64 × glibc and musl × +HotSpot / OpenJ9 / GraalVM / IBM / Liberica across JDK 8–25 + +**Triggers:** nightly at 03:00 UTC; also `workflow_dispatch` for manual runs + +The Java functional tests run the profiler as a JVMTI agent attached to a real JVM +and assert correctness: allocation profiling reports the right classes, CPU samples +land on the right frames, class unloading is handled cleanly, wall-clock profiling +produces expected output. + +ASan is applied here even though the JVM is not instrumented, because +`libjavaProfiler.so` is instrumented and ASan intercepts memory errors in JVMTI +callback paths — actual `GetStackTrace` calls, real `SampledObjectAlloc` events, real +class load/unload sequences — that cannot be fully replicated in C++ unit tests. + +TSan is not run against the Java functional tests (see Tier 1 rationale above). + +C++ gtests are skipped (`-Pskip-gtest`) because they already run on every branch push in +Tier 1. 
+ +**Test configurations triggered by PR labels** (optional, in addition to the always-on +debug build): + +| Label | Effect | +|-------|--------| +| `test:release` | Run Java functional tests with release library | +| `test:asan` | Run Java functional tests with ASan library on the PR | +| `test:tsan` | Run Java functional tests with TSan library on the PR (expect JVM false positives) | + +**Local run:** +```bash +# Match the nightly configuration +./gradlew :ddprof-test:testAsan -Pskip-gtest + +# Run against a specific JDK and libc via Docker (matches CI exactly) +./utils/run-docker-tests.sh --config=asan --jdk=21 --libc=glibc + +# Run a single test +./gradlew :ddprof-test:testAsan -Ptests=AllocationProfilerTest -Pskip-gtest +``` + +On failure the workflow reports affected scenarios to Slack and uploads test reports +as artifacts. + +--- + +## Tier 3 — dd-trace Integration Tests (GitLab, Every Push) + +**Pipeline:** `.gitlab/dd-trace-integration/.gitlab-ci.yml` + +**Runs on:** amd64 and aarch64 × glibc and musl × HotSpot + OpenJ9, JDK 8–25 + +**Triggers:** every branch push; skipped when `CI_PIPELINE_SOURCE` is +`merge_request_event` (GitLab merge-request pipeline) or when JDK integration +variables are set (`JDK_VERSION`, `DEBUG_LEVEL`, `HASH`, `DOWNSTREAM`) + +This tier patches the latest `dd-java-agent.jar` snapshot with the locally built +`ddprof.jar` and runs integration tests against the combined agent. The patch +replaces the bundled (relocated) profiler classes inside the agent with the version +under test, keeping the classloader/relocation path identical to production. + +It tests end-to-end agent startup, profiling data collection, and tracer/profiler +co-existence across the full JDK × libc matrix. Failures are posted as PR comments +and published to GitHub Pages as a compatibility matrix. + +No sanitizers are applied here. The goal is compatibility verification, not crash +or race detection. 
+ +**Manual trigger:** The `DD_TRACE_VERSION` variable can be set to test against a +specific dd-java-agent snapshot version rather than auto-detecting the latest. + +--- + +## Tier 4 — Chaos and Reliability (GitLab, Nightly Scheduled) + +**Pipeline:** `.gitlab/reliability/.gitlab-ci.yml` + +**Runs on:** amd64 and aarch64, nightly via GitLab pipeline schedule + +This tier runs long-duration workloads designed to provoke probabilistic crashes and +stability regressions that bounded-time unit tests cannot reliably trigger. + +### Reliability variants (`jit` and `memory`) + +Runs `renaissance.jar akka-uct` repeatedly under the profiler for up to 6 hours. +Tests `profiler` and `profiler+tracer` configurations against `gmalloc`, `jemalloc`, +and `tcmalloc` allocators. Detects crashes that require sustained JIT compilation +churn and GC pressure to manifest. + +The `memory` variant additionally monitors RSS over time (via `memwatch.log`) and +runs `memory_trend_check.py` to detect upward memory trends. + +### Chaos variant + +Patches the latest `dd-java-agent.jar` with the locally built `ddprof.jar` (same +patch mechanism as Tier 3) and runs the `ddprof-stresstest` chaos harness under +continuous antagonist load: + +| Antagonist | What it stresses | +|-----------|-----------------| +| `thread-churn` | 64 short-lived threads racing signal delivery, `RefCountGuard` slot allocation | +| `classloader-churn` | Rapid class definition and GC, `StringDictionary` insert/collect/clear races | +| `alloc-storm` | Continuous allocation pressure against the allocation profiler | +| `vthread-churn` | Virtual thread mount/unmount lifecycle against wall-clock profiling | +| `trace-context` | Trace context propagation under concurrent profiling (requires `profiler+tracer`) | + +Failure criterion: a non-zero exit code (JVM crash), captured as an `hs_err.log` +artifact. Crashes are also reported to Slack. + +No sanitizers are used. 
Tier 4 catches races that require hours at production-scale +concurrency to trigger with meaningful probability. + +### JDK integration tests + +`.gitlab/jdk-integration/.gitlab-ci.yml` handles upstream testing against custom JDK +builds. It is triggered externally (from the `async-profiler-build` pipeline) with +specific `JDK_VERSION`, `DEBUG_LEVEL`, and `HASH` parameters and runs `testDebug` +against that JDK build. This is used to validate compatibility with unreleased JDK +versions. + +--- + +## Why the Split + +| Bug class | Caught by | +|-----------|-----------| +| Data race in native data structures (signal handler vs. mutator) | Tier 1 — TSan gtest | +| Memory corruption in signal handler path | Tier 1 — ASan gtest | +| Memory error in JVMTI callback path | Tier 2 — ASan Java functional | +| Correctness regression (wrong profiling output) | Tier 2 — Java functional | +| Tracer / profiler incompatibility | Tier 3 — dd-trace integration | +| Probabilistic crash under sustained load | Tier 4 — chaos / reliability | +| JDK-version-specific crash | Tier 4 — JDK integration | + +**Tier 1** provides the fastest feedback (every branch push, minutes). TSan without a JVM is +definitive for the class of race that has caused the most production crashes: signal +handlers accessing shared data structures concurrently with writers on other threads. + +**Tier 2** covers correctness and integration with real JVM behaviour. Some paths +(actual `GetStackTrace` interleaving with class unload, real `SampledObjectAlloc` +callback ordering) are impractical to replicate in C++ unit tests. + +**Tier 3** catches regressions in the tracer/profiler integration boundary that would +otherwise only surface after a combined dd-trace-java release. + +**Tier 4** provides long-duration soak coverage at realistic concurrency levels, +catching races with per-second probability too low for any bounded CI window. 
+ +--- + +## Local Development + +### Quick feedback cycle + +```bash +# C++ unit tests — debug build, fast +./gradlew :ddprof-lib:gtestDebug + +# Java functional tests — debug build +./gradlew :ddprof-test:testDebug + +# Single test +./gradlew :ddprof-test:testDebug -Ptests=WallclockDumpSmokeTest +``` + +### Sanitizer builds + +```bash +# C++ ASan + TSan (no JVM needed) +./gradlew :ddprof-lib:gtestAsan +./gradlew :ddprof-lib:gtestTsan + +# Java functional tests under ASan (JVM required) +./gradlew :ddprof-test:testAsan -Pskip-gtest +``` + +### Using Docker to match CI exactly + +```bash +# Matches the nightly configuration +./utils/run-docker-tests.sh --config=asan --jdk=21 --libc=glibc + +# Debug build against a specific JDK +./utils/run-docker-tests.sh --config=debug --jdk=17-j9 --libc=glibc + +# Musl build +./utils/run-docker-tests.sh --config=debug --jdk=21-librca --libc=musl + +# With C++ gtests enabled (disabled by default in run-docker-tests.sh) +./utils/run-docker-tests.sh --config=asan --jdk=21 --libc=glibc --gtest +``` + +### Running the chaos harness locally + +```bash +# Build the chaos jar (auto-detected by chaos_check.sh when present) +./gradlew :ddprof-stresstest:chaosJar + +# Run the chaos check (uses the local build artifact; downloads dd-java-agent.jar) +.gitlab/reliability/chaos_check.sh 300 profiler+tracer gmalloc +``` + +`chaos_check.sh` looks for `ddprof-lib/build/libs/ddprof-*.jar` first and only +falls back to downloading from Maven snapshots if not found (requiring +`CURRENT_VERSION` to be set in that case). Build the jar locally to skip the +Maven download. 
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index df6a6ad76..96fd28ba8 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,9 +1,9 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists distributionUrl=https\://services.gradle.org/distributions/gradle-9.5.1-bin.zip -networkTimeout=10000 -retries=0 -retryBackOffMs=500 +networkTimeout=30000 +retries=5 +retryBackOffMs=2000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists