From 773ad49f1af7c2284a40652dae105be7d4f5ba32 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Fri, 29 May 2026 17:49:08 -0400 Subject: [PATCH 1/3] Fix Linux test flakes --- lib/builds/manager.go | 10 ++- lib/builds/manager_test.go | 6 +- lib/guest/client.go | 6 +- lib/guest/client_test.go | 32 +++++++ lib/instances/firecracker_test.go | 26 ++++-- lib/instances/manager_test.go | 22 ++--- lib/instances/qemu_test.go | 20 ++--- lib/instances/restart_policy.go | 6 +- lib/instances/restart_policy_test.go | 42 +++++++++ skills/test-agent/agents/test-agent/NOTES.md | 90 ++++++++++++++++++++ 10 files changed, 215 insertions(+), 45 deletions(-) diff --git a/lib/builds/manager.go b/lib/builds/manager.go index 97ec0e20..87f21dfe 100644 --- a/lib/builds/manager.go +++ b/lib/builds/manager.go @@ -785,9 +785,13 @@ func (m *manager) waitForResult(ctx context.Context, buildID string, inst *insta dialer, dialerErr := m.instanceManager.GetVsockDialer(ctx, inst.Id) if dialerErr == nil { - conn, err = dialer.DialVsock(ctx, BuildAgentVsockPort) - if err == nil { - break + if dialer == nil { + err = fmt.Errorf("vsock dialer unavailable for instance %s", inst.Id) + } else { + conn, err = dialer.DialVsock(ctx, BuildAgentVsockPort) + if err == nil { + break + } } } else { err = dialerErr diff --git a/lib/builds/manager_test.go b/lib/builds/manager_test.go index 3b6a3e34..c26295ea 100644 --- a/lib/builds/manager_test.go +++ b/lib/builds/manager_test.go @@ -29,6 +29,7 @@ type mockInstanceManager struct { getFunc func(ctx context.Context, id string) (*instances.Instance, error) deleteFunc func(ctx context.Context, id string) error stopFunc func(ctx context.Context, id string) (*instances.Instance, error) + vsockDialerFunc func(ctx context.Context, instanceID string) (hypervisor.VsockDialer, error) createCallCount int deleteCallCount int } @@ -168,7 +169,10 @@ func (m *mockInstanceManager) SetResourceValidator(v instances.ResourceValidator } func (m *mockInstanceManager) GetVsockDialer(ctx context.Context, instanceID string) (hypervisor.VsockDialer, error) { - return nil, nil + if m.vsockDialerFunc != nil { + return m.vsockDialerFunc(ctx, instanceID) + } + return nil, fmt.Errorf("vsock dialer unavailable in mock instance manager") } func (m *mockInstanceManager) SubscribeLifecycleEvents(consumer instances.LifecycleEventConsumer) (<-chan instances.LifecycleEvent, func()) { diff --git a/lib/guest/client.go b/lib/guest/client.go index cb772dbc..20ff8d0b 100644 --- a/lib/guest/client.go +++ b/lib/guest/client.go @@ -154,7 +154,11 @@ func ExecIntoInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts E // their own exec.session span; the detailed retry spans are only useful when // we are waiting for the guest-agent to become reachable. if opts.WaitForAgent == 0 { - return execIntoInstanceOnce(ctx, dialer, opts) + exit, err := execIntoInstanceOnce(ctx, dialer, opts) + if err != nil && isRetryableConnectionError(err) { + CloseConn(dialer.Key()) + } + return exit, err } ctx, span := startGuestExecSpan(ctx, opts) diff --git a/lib/guest/client_test.go b/lib/guest/client_test.go index 08a122b0..14ca00ba 100644 --- a/lib/guest/client_test.go +++ b/lib/guest/client_test.go @@ -109,6 +109,26 @@ func TestExecIntoInstanceRetriesWithFreshConnections(t *testing.T) { } } +func TestExecIntoInstanceNoWaitClosesRetryableConnection(t *testing.T) { + dialer := &alwaysFailDialer{key: "no-wait-close-retryable-test"} + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + defer cancel() + + _, err := ExecIntoInstance(ctx, dialer, ExecOptions{ + Command: []string{"true"}, + }) + if err == nil { + t.Fatal("ExecIntoInstance succeeded unexpectedly") + } + + connPool.RLock() + _, ok := connPool.conns[dialer.Key()] + connPool.RUnlock() + if ok { + t.Fatal("retryable no-wait exec error left connection in pool") + } +} + func TestCloseConnClosesPooledConnection(t *testing.T) { dialer := &trackingDialer{ key: "close-conn-test", @@ -164,6 +184,18 @@ func (d *delayedDialer) DialVsock(ctx context.Context, port int) (net.Conn, erro var _ hypervisor.VsockDialer = (*delayedDialer)(nil) +type alwaysFailDialer struct { + key string +} + +func (d *alwaysFailDialer) Key() string { return d.key } + +func (d *alwaysFailDialer) DialVsock(ctx context.Context, port int) (net.Conn, error) { + return nil, errors.New("not ready") +} + +var _ hypervisor.VsockDialer = (*alwaysFailDialer)(nil) + type trackingDialer struct { key string conns chan *closeTrackingConn diff --git a/lib/instances/firecracker_test.go b/lib/instances/firecracker_test.go index 67e4b0f0..6bb28572 100644 --- a/lib/instances/firecracker_test.go +++ b/lib/instances/firecracker_test.go @@ -19,6 +19,7 @@ import ( "github.com/kernel/hypeman/cmd/api/config" "github.com/kernel/hypeman/lib/devices" + "github.com/kernel/hypeman/lib/diskutilization" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/images" "github.com/kernel/hypeman/lib/instances/phasetracking" @@ -717,9 +718,9 @@ func TestFirecrackerForkIsolation(t *testing.T) { require.NoError(t, err, "fingerprint source mem-file after standby") reflinkOK := probeReflinkSupport(t, tmpDir) - var statBefore syscall.Statfs_t - require.NoError(t, syscall.Statfs(tmpDir, &statBefore)) - freeBefore := int64(statBefore.Bavail) * statBefore.Bsize + usageBefore, err := diskutilization.Collect(p) + require.NoError(t, err) + usedBefore := diskUtilizationTotal(usageBefore) fork, err := mgr.ForkInstance(ctx, sourceID, ForkInstanceRequest{ Name: "fc-fork-isolation-fork", @@ -791,10 +792,9 @@ func TestFirecrackerForkIsolation(t *testing.T) { // because pages are shared CoW. Gated on FICLONE probe — ext4 etc. fall // back to sparse copy which produces full physical copies, so the bound // would not hold there. - var statAfter syscall.Statfs_t - require.NoError(t, syscall.Statfs(tmpDir, &statAfter)) - freeAfter := int64(statAfter.Bavail) * statAfter.Bsize - consumed := freeBefore - freeAfter + usageAfter, err := diskutilization.Collect(p) + require.NoError(t, err) + consumed := diskUtilizationTotal(usageAfter) - usedBefore t.Logf("fork lifecycle disk-usage delta: consumed=%d guestMem=%d reflink=%v", consumed, guestMemBytes, reflinkOK) if reflinkOK { @@ -837,6 +837,18 @@ func TestFirecrackerForkIsolation(t *testing.T) { sourceDeleted = true } +func diskUtilizationTotal(b diskutilization.Breakdown) int64 { + return b.Images + + b.OCICache + + b.Volumes + + b.RootfsOverlays + + b.VolumeOverlays + + b.SnapshotUncompressed + + b.SnapshotCompressed + + b.SnapshotShared + + b.SnapshotOther +} + type fileFingerprint struct { inode uint64 sha string diff --git a/lib/instances/manager_test.go b/lib/instances/manager_test.go index 521f31d0..72e9c154 100644 --- a/lib/instances/manager_test.go +++ b/lib/instances/manager_test.go @@ -353,25 +353,13 @@ func TestBasicEndToEnd(t *testing.T) { assert.Len(t, instances, 1) assert.Equal(t, inst.Id, instances[0].Id) - // Poll for logs to contain nginx startup message - var logs string - foundNginxStartup := false - for i := 0; i < 200; i++ { // Poll for up to 20 seconds (200 * 100ms) - logs, err = collectLogs(ctx, manager, inst.Id, 100) - require.NoError(t, err) - - if strings.Contains(logs, "start worker processes") { - foundNginxStartup = true - break - } - time.Sleep(100 * time.Millisecond) + if err := waitForLogMessage(ctx, manager, inst.Id, "start worker processes", integrationTestTimeout(20*time.Second)); err != nil { + logs, logErr := collectLogs(ctx, manager, inst.Id, 100) + require.NoError(t, logErr) + t.Logf("Nginx startup log not observed before ingress probe: %v", err) + t.Logf("Instance logs (last 100 lines):\n%s", logs) } - t.Logf("Instance logs (last 100 lines):\n%s", logs) - - // Verify nginx started successfully - assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 20 seconds") - // Test ingress - route external traffic to nginx through Caddy t.Log("Testing ingress routing to nginx...") diff --git a/lib/instances/qemu_test.go b/lib/instances/qemu_test.go index 30f2669a..27d9a38e 100644 --- a/lib/instances/qemu_test.go +++ b/lib/instances/qemu_test.go @@ -341,23 +341,13 @@ func TestQEMUBasicEndToEnd(t *testing.T) { assert.Len(t, instances, 1) assert.Equal(t, inst.Id, instances[0].Id) - // Poll for logs to contain nginx startup message - var logs string - foundNginxStartup := false - for i := 0; i < 200; i++ { - logs, err = collectQEMULogs(ctx, manager, inst.Id, 100) - require.NoError(t, err) - - if strings.Contains(logs, "start worker processes") { - foundNginxStartup = true - break - } - time.Sleep(100 * time.Millisecond) + if err := waitForLogMessage(ctx, manager, inst.Id, "start worker processes", integrationTestTimeout(20*time.Second)); err != nil { + logs, logErr := collectQEMULogs(ctx, manager, inst.Id, 100) + require.NoError(t, logErr) + t.Logf("Nginx startup log not observed before ingress probe: %v", err) + t.Logf("Instance logs (last 100 lines):\n%s", logs) } - t.Logf("Instance logs (last 100 lines):\n%s", logs) - assert.True(t, foundNginxStartup, "Nginx should have started worker processes within 20 seconds") - // Test ingress - route external traffic to nginx t.Log("Testing ingress routing to nginx...") diff --git a/lib/instances/restart_policy.go b/lib/instances/restart_policy.go index 32f01376..227b6c76 100644 --- a/lib/instances/restart_policy.go +++ b/lib/instances/restart_policy.go @@ -348,5 +348,9 @@ func shouldResetRestartAttempts(policy *restartpolicy.Policy, status restartpoli if inst.State != StateRunning && inst.State != StateInitializing { return false } - return !now.Before(inst.StartedAt.UTC().Add(restartpolicy.StableAfter(policy))) + stableSince := inst.StartedAt.UTC() + if status.LastAttemptAt != nil && status.LastAttemptAt.UTC().After(stableSince) { + stableSince = status.LastAttemptAt.UTC() + } + return !now.Before(stableSince.Add(restartpolicy.StableAfter(policy))) } diff --git a/lib/instances/restart_policy_test.go b/lib/instances/restart_policy_test.go index c07795a5..9d90247f 100644 --- a/lib/instances/restart_policy_test.go +++ b/lib/instances/restart_policy_test.go @@ -69,6 +69,48 @@ func TestShouldResetRestartAttempts(t *testing.T) { assert.True(t, reset) } +func TestShouldResetRestartAttemptsUsesLatestAttemptWhenStartedAtIsOlder(t *testing.T) { + now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) + startedAt := now.Add(-1 * time.Hour) + lastAttemptAt := now.Add(-5 * time.Second) + + reset := shouldResetRestartAttempts( + &restartpolicy.Policy{Policy: restartpolicy.PolicyOnFailure, StableAfter: "30s"}, + restartpolicy.Status{ + Attempts: 1, + LastAttemptAt: &lastAttemptAt, + }, + &Instance{ + State: StateInitializing, + StoredMetadata: StoredMetadata{StartedAt: &startedAt}, + }, + now, + ) + + assert.False(t, reset) +} + +func TestShouldResetRestartAttemptsAfterLatestAttemptIsStable(t *testing.T) { + now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) + startedAt := now.Add(-1 * time.Hour) + lastAttemptAt := now.Add(-31 * time.Second) + + reset := shouldResetRestartAttempts( + &restartpolicy.Policy{Policy: restartpolicy.PolicyOnFailure, StableAfter: "30s"}, + restartpolicy.Status{ + Attempts: 1, + LastAttemptAt: &lastAttemptAt, + }, + &Instance{ + State: StateRunning, + StoredMetadata: StoredMetadata{StartedAt: &startedAt}, + }, + now, + ) + + assert.True(t, reset) +} + func TestPrepareRestartAttemptPreservesLastReason(t *testing.T) { now := time.Date(2026, 1, 1, 12, 0, 0, 0, time.UTC) nextStatus, reason, shouldAttempt := prepareRestartAttempt( diff --git a/skills/test-agent/agents/test-agent/NOTES.md b/skills/test-agent/agents/test-agent/NOTES.md index 2facfa64..1802c21b 100644 --- a/skills/test-agent/agents/test-agent/NOTES.md +++ b/skills/test-agent/agents/test-agent/NOTES.md @@ -423,3 +423,93 @@ - pass, package time `28.484s` - `go test -count=50 -run '^TestGetBuild_Found$' ./lib/builds` - pass + +## 2026-05-29 - Main branch Linux flake sweep from `34e1032` + +### Starting point +- Local workspace was detached on `aa65a64` and behind `origin/main` by two commits. +- Fetched `origin/main` and switched to detached `origin/main` at `34e1032` (`Count shared snapshot extents separately (#244)`), then branched for fixes. +- Recent main CI failures on `kernel/hypeman` showed Linux flake signatures around guest/vsock readiness: + - latest main Test run `26531491771`: `TestVolumeFromArchive` failed waiting for exec-agent, last state `Stopped`, EOF + - prior main failures included `TestQEMUBasicEndToEnd`, `TestCpDirectoryToInstance`, `TestInstanceLifecycle_StopStart`, `TestExecInstanceNonTTY`, and `TestVolumeMultiAttachReadOnly` + +### Deft host notes +- `deft-kernel-dev` had plenty of disk/RAM (`/mnt/data` around 3.4T free, memory around 765Gi available), but high CPU load during investigation. +- The standard shared prewarm dir `~/.cache/hypeman-ci/linux-amd64` failed due a permission issue under the cache tree, so this run used workspace-local prewarm: + - `HYPEMAN_TEST_PREWARM_DIR=$PWD/.hypeman-prewarm/linux-amd64` + - `HYPEMAN_TEST_REGISTRY=127.0.0.1:5001` + - `HYPEMAN_TEST_PREWARM_STRICT=1` +- With permission, cleaned only stale Hypeman temp hypervisor artifacts matching `/tmp/Test*` or `/tmp/hmcmp-*`. No Docker/global cleanup was done. + +### Fixes +- `lib/guest/client.go` + - no-wait exec now evicts the pooled guest gRPC connection on retryable connection errors, matching the existing `WaitForAgent` retry path + - prevents outer test/helper retry loops from repeatedly reusing a poisoned connection after transient vsock EOF/timeouts +- `lib/guest/client_test.go` + - added `TestExecIntoInstanceNoWaitClosesRetryableConnection` +- `lib/builds/manager.go` + - `waitForResult` treats a nil vsock dialer with nil error as unavailable instead of panicking +- `lib/builds/manager_test.go` + - mock `GetVsockDialer` now returns an explicit error unless configured +- `lib/instances/restart_policy.go` + - stable-window reset uses the later of `StartedAt` and `RestartStatus.LastAttemptAt` + - prevents controller reconciliation from clearing attempts while a health-check restart attempt is still in flight +- `lib/instances/restart_policy_test.go` + - added coverage for old `StartedAt` plus recent `LastAttemptAt` +- `lib/instances/firecracker_test.go` + - `TestFirecrackerForkIsolation` now measures disk delta using per-workspace `diskutilization.Collect` instead of global `statfs` free-space changes, avoiding unrelated shared-host disk activity + +### Targeted validation on `deft-kernel-dev` +- Reproduced `TestVolumeFromArchive` before the guest connection-pool fix: + - `go test -count=5 -v -tags containers_image_openpgp -run '^TestVolumeFromArchive$' -timeout=20m ./lib/instances` + - failed 1/5 with vsock EOF from exec-agent readiness +- After guest connection-pool fix: + - `go test ./lib/guest` + - pass + - `go test -count=10 -v -tags containers_image_openpgp -run '^TestVolumeFromArchive$' -timeout=30m ./lib/instances` + - pass, package time `242.742s` +- After build nil-dialer fix: + - `go test ./lib/guest ./lib/builds` + - pass, `lib/builds` package time `86.525s` +- API/image failures from an intermediate full run were not durable: + - `go test -count=3 -v -tags containers_image_openpgp -run '^(TestCpToAndFromInstance|TestExecWithDebianMinimal|TestRegistryLayerCaching)$' -timeout=20m ./cmd/api/api` + - pass, package time `128.899s` +- Restart policy network flake: + - pre-fix `go test -count=5 -v -tags containers_image_openpgp -run '^TestCreateInstanceWithNetwork$' -timeout=20m ./lib/instances` exposed `restart policy stable window reached` while restart attempt was still stopping/starting + - post-fix `go test -count=1 -v -tags containers_image_openpgp -run '^TestShouldResetRestartAttempts|^TestCreateInstanceWithNetwork$' -timeout=20m ./lib/instances` + - pass, package time `79.157s` +- Firecracker fork isolation: + - pre-fix full suite failed with `consumed=5864284160 guestMem=1073741824 reflink=true` + - post-fix `go test -count=3 -v -tags containers_image_openpgp -run '^TestFirecrackerForkIsolation$' -timeout=20m ./lib/instances` + - pass, package time `60.894s`, per-workspace deltas around 137-150MiB + +### Full-suite validation on `deft-kernel-dev` +- Intermediate full run after the first three fixes: + - `go test -count=1 -tags containers_image_openpgp -timeout=20m ./...` + - failed only `lib/instances/TestFirecrackerForkIsolation` due global `statfs` free-space measurement on shared host + - log: `full-suite-20260529-173833.log`, duration `256s` +- Full run after Firecracker disk measurement fix: + - `go test -count=1 -tags containers_image_openpgp -timeout=20m ./...` + - Run 1: `258s` (pass) + - log: `full-suite-20260529-174512.log` + +## 2026-05-30 - PR #247 Linux CI follow-up + +### CI failure +- Draft PR #247 failed the Linux `test` job on run `26663998916`, job `78593114552`, runner `deft-8`, commit `f17ed89`. +- The same signature appeared on the previous pre-amend run `26663971132` on `deft-14`. +- Failure was in `lib/instances/TestBasicEndToEnd`: + - `manager_test.go:373: Nginx should have started worker processes within 20 seconds` + - The test then successfully routed through Caddy and got an nginx response (`Got response from nginx through Caddy: 615 bytes`). +- Deft did not show disk or RAM exhaustion in the CI evidence. The useful signal was a log-readiness race: the nginx process was serving, but the test made the startup log line a hard assertion before the ingress probe. + +### Fixes +- `lib/instances/manager_test.go` + - changed the nginx startup log wait in `TestBasicEndToEnd` from a hard assertion into diagnostic logging + - kept the external ingress HTTP request as the authoritative readiness/behavior check +- `lib/instances/qemu_test.go` + - made the same change in `TestQEMUBasicEndToEnd`, which had the same brittle log assertion pattern + +### Targeted validation on `deft-kernel-dev` +- `go test -count=3 -v -tags containers_image_openpgp -run "^(TestBasicEndToEnd|TestQEMUBasicEndToEnd)$" -timeout=30m ./lib/instances` + - pass, package time `27.583s` From a6b6a161f4e76ed69a80a3a9871c321e05b5fee7 Mon Sep 17 00:00:00 2001 From: sjmiller609 <7516283+sjmiller609@users.noreply.github.com> Date: Sat, 30 May 2026 19:01:57 +0000 Subject: [PATCH 2/3] trim notes --- skills/test-agent/agents/test-agent/NOTES.md | 96 ++------------------ 1 file changed, 10 insertions(+), 86 deletions(-) diff --git a/skills/test-agent/agents/test-agent/NOTES.md b/skills/test-agent/agents/test-agent/NOTES.md index 1802c21b..de8a6458 100644 --- a/skills/test-agent/agents/test-agent/NOTES.md +++ b/skills/test-agent/agents/test-agent/NOTES.md @@ -424,92 +424,16 @@ - `go test -count=50 -run '^TestGetBuild_Found$' ./lib/builds` - pass -## 2026-05-29 - Main branch Linux flake sweep from `34e1032` - -### Starting point -- Local workspace was detached on `aa65a64` and behind `origin/main` by two commits. -- Fetched `origin/main` and switched to detached `origin/main` at `34e1032` (`Count shared snapshot extents separately (#244)`), then branched for fixes. -- Recent main CI failures on `kernel/hypeman` showed Linux flake signatures around guest/vsock readiness: - - latest main Test run `26531491771`: `TestVolumeFromArchive` failed waiting for exec-agent, last state `Stopped`, EOF - - prior main failures included `TestQEMUBasicEndToEnd`, `TestCpDirectoryToInstance`, `TestInstanceLifecycle_StopStart`, `TestExecInstanceNonTTY`, and `TestVolumeMultiAttachReadOnly` - -### Deft host notes -- `deft-kernel-dev` had plenty of disk/RAM (`/mnt/data` around 3.4T free, memory around 765Gi available), but high CPU load during investigation. -- The standard shared prewarm dir `~/.cache/hypeman-ci/linux-amd64` failed due a permission issue under the cache tree, so this run used workspace-local prewarm: - - `HYPEMAN_TEST_PREWARM_DIR=$PWD/.hypeman-prewarm/linux-amd64` - - `HYPEMAN_TEST_REGISTRY=127.0.0.1:5001` - - `HYPEMAN_TEST_PREWARM_STRICT=1` -- With permission, cleaned only stale Hypeman temp hypervisor artifacts matching `/tmp/Test*` or `/tmp/hmcmp-*`. No Docker/global cleanup was done. +## 2026-05-29 - Linux flake sweep ### Fixes -- `lib/guest/client.go` - - no-wait exec now evicts the pooled guest gRPC connection on retryable connection errors, matching the existing `WaitForAgent` retry path - - prevents outer test/helper retry loops from repeatedly reusing a poisoned connection after transient vsock EOF/timeouts -- `lib/guest/client_test.go` - - added `TestExecIntoInstanceNoWaitClosesRetryableConnection` -- `lib/builds/manager.go` - - `waitForResult` treats a nil vsock dialer with nil error as unavailable instead of panicking -- `lib/builds/manager_test.go` - - mock `GetVsockDialer` now returns an explicit error unless configured -- `lib/instances/restart_policy.go` - - stable-window reset uses the later of `StartedAt` and `RestartStatus.LastAttemptAt` - - prevents controller reconciliation from clearing attempts while a health-check restart attempt is still in flight -- `lib/instances/restart_policy_test.go` - - added coverage for old `StartedAt` plus recent `LastAttemptAt` -- `lib/instances/firecracker_test.go` - - `TestFirecrackerForkIsolation` now measures disk delta using per-workspace `diskutilization.Collect` instead of global `statfs` free-space changes, avoiding unrelated shared-host disk activity - -### Targeted validation on `deft-kernel-dev` -- Reproduced `TestVolumeFromArchive` before the guest connection-pool fix: - - `go test -count=5 -v -tags containers_image_openpgp -run '^TestVolumeFromArchive$' -timeout=20m ./lib/instances` - - failed 1/5 with vsock EOF from exec-agent readiness -- After guest connection-pool fix: - - `go test ./lib/guest` - - pass - - `go test -count=10 -v -tags containers_image_openpgp -run '^TestVolumeFromArchive$' -timeout=30m ./lib/instances` - - pass, package time `242.742s` -- After build nil-dialer fix: - - `go test ./lib/guest ./lib/builds` - - pass, `lib/builds` package time `86.525s` -- API/image failures from an intermediate full run were not durable: - - `go test -count=3 -v -tags containers_image_openpgp -run '^(TestCpToAndFromInstance|TestExecWithDebianMinimal|TestRegistryLayerCaching)$' -timeout=20m ./cmd/api/api` - - pass, package time `128.899s` -- Restart policy network flake: - - pre-fix `go test -count=5 -v -tags containers_image_openpgp -run '^TestCreateInstanceWithNetwork$' -timeout=20m ./lib/instances` exposed `restart policy stable window reached` while restart attempt was still stopping/starting - - post-fix `go test -count=1 -v -tags containers_image_openpgp -run '^TestShouldResetRestartAttempts|^TestCreateInstanceWithNetwork$' -timeout=20m ./lib/instances` - - pass, package time `79.157s` -- Firecracker fork isolation: - - pre-fix full suite failed with `consumed=5864284160 guestMem=1073741824 reflink=true` - - post-fix `go test -count=3 -v -tags containers_image_openpgp -run '^TestFirecrackerForkIsolation$' -timeout=20m ./lib/instances` - - pass, package time `60.894s`, per-workspace deltas around 137-150MiB - -### Full-suite validation on `deft-kernel-dev` -- Intermediate full run after the first three fixes: - - `go test -count=1 -tags containers_image_openpgp -timeout=20m ./...` - - failed only `lib/instances/TestFirecrackerForkIsolation` due global `statfs` free-space measurement on shared host - - log: `full-suite-20260529-173833.log`, duration `256s` -- Full run after Firecracker disk measurement fix: - - `go test -count=1 -tags containers_image_openpgp -timeout=20m ./...` - - Run 1: `258s` (pass) - - log: `full-suite-20260529-174512.log` - -## 2026-05-30 - PR #247 Linux CI follow-up +- evicted pooled guest gRPC connections after retryable no-wait exec failures, matching the existing wait-and-retry path. +- hardened build result vsock dialing against nil dialers. +- delayed restart-policy stable-window resets until the later of instance start time and the latest restart attempt. +- made Firecracker fork disk-usage assertions measure workspace usage instead of host-wide free-space changes. +- treated nginx startup log waits as diagnostics in basic end-to-end tests, leaving ingress HTTP probes as the behavior check. -### CI failure -- Draft PR #247 failed the Linux `test` job on run `26663998916`, job `78593114552`, runner `deft-8`, commit `f17ed89`. -- The same signature appeared on the previous pre-amend run `26663971132` on `deft-14`. -- Failure was in `lib/instances/TestBasicEndToEnd`: - - `manager_test.go:373: Nginx should have started worker processes within 20 seconds` - - The test then successfully routed through Caddy and got an nginx response (`Got response from nginx through Caddy: 615 bytes`). -- Deft did not show disk or RAM exhaustion in the CI evidence. The useful signal was a log-readiness race: the nginx process was serving, but the test made the startup log line a hard assertion before the ingress probe. - -### Fixes -- `lib/instances/manager_test.go` - - changed the nginx startup log wait in `TestBasicEndToEnd` from a hard assertion into diagnostic logging - - kept the external ingress HTTP request as the authoritative readiness/behavior check -- `lib/instances/qemu_test.go` - - made the same change in `TestQEMUBasicEndToEnd`, which had the same brittle log assertion pattern - -### Targeted validation on `deft-kernel-dev` -- `go test -count=3 -v -tags containers_image_openpgp -run "^(TestBasicEndToEnd|TestQEMUBasicEndToEnd)$" -timeout=30m ./lib/instances` - - pass, package time `27.583s` +### Validation +- targeted loops passed for guest exec readiness, build result handling, restart policy, and Firecracker fork isolation. +- a full Linux suite passed after the flake fixes. +- follow-up validation passed for the Cloud Hypervisor and QEMU basic end-to-end tests. From 4a5cc9840c4048e250adced7426c0fca3edc3717 Mon Sep 17 00:00:00 2001 From: Steven Miller Date: Sat, 30 May 2026 15:24:03 -0400 Subject: [PATCH 3/3] Fix warm fork source readiness flake --- Makefile | 26 +++++++++++--- lib/instances/fork.go | 12 +++++-- lib/instances/fork_test.go | 13 +++---- skills/test-agent/agents/test-agent/NOTES.md | 37 ++++++++++++++++++++ 4 files changed, 76 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 611bb11b..08c14fa8 100644 --- a/Makefile +++ b/Makefile @@ -161,20 +161,38 @@ generate-grpc: install-proto-tools # Generate all code generate-all: oapi-generate generate-vmm-client generate-wire generate-grpc -# Check if CH binaries exist, download if missing +# Check if CH binaries exist for this host and match the expected architecture. .PHONY: ensure-ch-binaries ensure-ch-binaries: @ARCH=$$(uname -m); \ if [ "$$ARCH" = "x86_64" ]; then \ CH_ARCH=x86_64; \ + CH_FILE_PATTERN='x86-64'; \ elif [ "$$ARCH" = "aarch64" ] || [ "$$ARCH" = "arm64" ]; then \ CH_ARCH=aarch64; \ + CH_FILE_PATTERN='aarch64|ARM aarch64'; \ else \ echo "Unsupported architecture: $$ARCH"; exit 1; \ fi; \ - if [ ! -f lib/vmm/binaries/cloud-hypervisor/v49.0/$$CH_ARCH/cloud-hypervisor ] || \ - [ ! -f lib/vmm/binaries/cloud-hypervisor/v51.1/$$CH_ARCH/cloud-hypervisor ]; then \ - echo "Cloud Hypervisor binaries not found, downloading..."; \ + NEEDS_DOWNLOAD=0; \ + for CH_VERSION in v49.0 v51.1; do \ + CH_BIN=lib/vmm/binaries/cloud-hypervisor/$$CH_VERSION/$$CH_ARCH/cloud-hypervisor; \ + if [ ! -f "$$CH_BIN" ]; then \ + echo "Cloud Hypervisor binary not found: $$CH_BIN"; \ + NEEDS_DOWNLOAD=1; \ + break; \ + fi; \ + CH_FILE_DESC=$$(file -b "$$CH_BIN" 2>/dev/null || true); \ + if ! printf '%s\n' "$$CH_FILE_DESC" | grep -Eiq "$$CH_FILE_PATTERN"; then \ + echo "Cloud Hypervisor binary has unexpected architecture: $$CH_BIN"; \ + echo "file output: $$CH_FILE_DESC"; \ + NEEDS_DOWNLOAD=1; \ + break; \ + fi; \ + done; \ + if [ "$$NEEDS_DOWNLOAD" = "1" ]; then \ + echo "Refreshing Cloud Hypervisor binaries..."; \ + rm -rf lib/vmm/binaries/cloud-hypervisor; \ $(MAKE) download-ch-binaries; \ fi diff --git a/lib/instances/fork.go b/lib/instances/fork.go index 202660e2..b4af9434 100644 --- a/lib/instances/fork.go +++ b/lib/instances/fork.go @@ -93,7 +93,7 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR } log.InfoContext(ctx, "restoring source instance after running fork", "source_instance_id", id) - _, restoreErr := m.restoreInstance(ctx, id) + restoredSource, restoreErr := m.restoreInstance(ctx, id) if restoreErr != nil { if forkErr != nil { @@ -101,6 +101,14 @@ func (m *manager) forkInstance(ctx context.Context, id string, req ForkInstanceR } return nil, "", fmt.Errorf("restore source instance after fork: %w", restoreErr) } + if restoredSource != nil && !restoredSource.NetworkEnabled && (restoredSource.State == StateRunning || restoredSource.State == StateInitializing) { + if err := ensureGuestAgentReadyForForkPhase(ctx, &restoredSource.StoredMetadata, "after restoring running fork source"); err != nil { + if forkErr != nil { + return nil, "", fmt.Errorf("fork failed: %v; additionally restored source guest agent was not ready: %w", forkErr, err) + } + return nil, "", fmt.Errorf("wait for restored source guest agent readiness: %w", err) + } + } if forkErr != nil { return nil, "", forkErr } @@ -121,7 +129,7 @@ func ensureGuestAgentReadyForRunningFork(ctx context.Context, source *StoredMeta } func ensureGuestAgentReadyForForkPhase(ctx context.Context, inst *StoredMetadata, phase string) error { - if inst == nil || !inst.NetworkEnabled || inst.SkipGuestAgent { + if inst == nil || inst.SkipGuestAgent { return nil } diff --git a/lib/instances/fork_test.go b/lib/instances/fork_test.go index eafcd77a..606b7671 100644 --- a/lib/instances/fork_test.go +++ b/lib/instances/fork_test.go @@ -630,6 +630,7 @@ func runWarmForkChain(t *testing.T, mgr *manager, tmpDir string, cfg warmForkCha t.Helper() ctx := context.Background() + readyTimeout := 90 * time.Second p := paths.New(tmpDir) imageManager, err := images.NewManager(p, 1, nil) @@ -658,9 +659,9 @@ func runWarmForkChain(t *testing.T, mgr *manager, tmpDir string, cfg warmForkCha } }) - source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, integrationTestTimeout(45*time.Second)) + source, err = waitForInstanceState(ctx, mgr, sourceID, StateRunning, readyTimeout) require.NoError(t, err) - require.NoError(t, waitForExecAgent(ctx, mgr, sourceID, 45*time.Second)) + require.NoError(t, waitForExecAgent(ctx, mgr, sourceID, readyTimeout)) snapshot, err := mgr.CreateSnapshot(ctx, sourceID, CreateSnapshotRequest{ Kind: SnapshotKindStandby, @@ -690,9 +691,9 @@ func runWarmForkChain(t *testing.T, mgr *manager, tmpDir string, cfg warmForkCha _ = mgr.DeleteInstance(context.Background(), warmID) } }) - warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(45*time.Second)) + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, readyTimeout) require.NoError(t, err) - require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 45*time.Second)) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, readyTimeout)) child, err := mgr.ForkInstance(ctx, warmID, ForkInstanceRequest{ Name: cfg.namePrefix + "-warm-chain-child", @@ -707,11 +708,11 @@ func runWarmForkChain(t *testing.T, mgr *manager, tmpDir string, cfg warmForkCha warm, err = mgr.GetInstance(ctx, warmID) require.NoError(t, err) if warm.State != StateRunning { - warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, integrationTestTimeout(45*time.Second)) + warm, err = waitForInstanceState(ctx, mgr, warmID, StateRunning, readyTimeout) require.NoError(t, err) } require.Equal(t, StateRunning, warm.State) - require.NoError(t, waitForExecAgent(ctx, mgr, warmID, 45*time.Second)) + require.NoError(t, waitForExecAgent(ctx, mgr, warmID, readyTimeout)) require.NoError(t, mgr.DeleteInstance(ctx, warmID)) warmDeleted = true diff --git a/skills/test-agent/agents/test-agent/NOTES.md b/skills/test-agent/agents/test-agent/NOTES.md index de8a6458..bc1ab914 100644 --- a/skills/test-agent/agents/test-agent/NOTES.md +++ b/skills/test-agent/agents/test-agent/NOTES.md @@ -437,3 +437,40 @@ - targeted loops passed for guest exec readiness, build result handling, restart policy, and Firecracker fork isolation. - a full Linux suite passed after the flake fixes. - follow-up validation passed for the Cloud Hypervisor and QEMU basic end-to-end tests. + +## 2026-05-30 - PR #247 follow-up: Cloud Hypervisor warm fork chain + +### Flake signatures +- Linux PR CI run `26692307520`, job `78670792084`, failed `TestCloudHypervisorWarmForkChain` because the source restored after a running fork was `Running`, but exec-agent readiness hit a closed vsock connection for 45s. +- A targeted Deft reproduction then exposed the adjacent budget issue: the initial Cloud Hypervisor source sometimes remained `Initializing` beyond the test's 45s state wait on a loaded host. + +### Fixes +- `lib/instances/fork.go` + - running-source forks now wait for the restored source guest agent before returning when the restored source is network-disabled. + - fork readiness checks no longer skip network-disabled instances; vsock exec readiness is independent of guest networking. +- `lib/instances/fork_test.go` + - widened only the Cloud Hypervisor warm-fork-chain readiness budget from 45s to 90s. + +### Validation +- `go test -count=5 -v -tags containers_image_openpgp -run "^TestCloudHypervisorWarmForkChain$" -timeout=60m ./lib/instances` + - pass on `deft-kernel-dev`, package time `106.967s` +- `go test -count=3 -v -tags containers_image_openpgp -run "^(TestCloudHypervisorWarmForkChain|TestQEMUForkFromRunningNetwork)$" -timeout=60m ./lib/instances` + - pass on `deft-kernel-dev`, package time `63.545s` + +## 2026-05-30 - PR #247 follow-up: preserved CH binary cache + +### Flake signature +- Test workflow attempt 3 for run `26692911195`, job `78673017218`, passed `lib/instances` but failed `lib/vmm/TestMultipleVersions`: + - `start cloud-hypervisor: fork/exec .../system/binaries/v49.0/x86_64/cloud-hypervisor: exec format error` +- CI preserves `lib/vmm/binaries/` across runs, so a wrong-architecture or corrupt ignored cache file can be embedded if `ensure-ch-binaries` only checks file existence. + +### Fix +- `Makefile` + - `ensure-ch-binaries` now verifies the current host's Cloud Hypervisor binaries with `file`. + - If the required binary is missing or has the wrong architecture, it refreshes the CH binary cache via the existing download target. + +### Validation +- `make ensure-ch-binaries` + - pass on `deft-kernel-dev`, confirmed v49.0 and v51.1 x86_64 binaries report as x86-64 ELF. +- `go test -count=10 -v -run "^TestMultipleVersions$" -timeout=10m ./lib/vmm` + - pass on `deft-kernel-dev`, package time `0.741s`