diff --git a/lib/instances/manager.go b/lib/instances/manager.go index b0842e6a..abdd2165 100644 --- a/lib/instances/manager.go +++ b/lib/instances/manager.go @@ -134,6 +134,7 @@ type manager struct { nativeCodecMu sync.Mutex nativeCodecPaths map[string]string imageUsageRecorder ImageUsageRecorder + guestAgentReadyProbe func(context.Context, *StoredMetadata) bool // Shared lifecycle event subscriptions for internal consumers. lifecycleEvents *lifecycleSubscribers @@ -184,27 +185,28 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan } m := &manager{ - paths: p, - imageManager: imageManager, - systemManager: systemManager, - networkManager: networkManager, - deviceManager: deviceManager, - volumeManager: volumeManager, - limits: limits, - instanceLocks: sync.Map{}, - bootMarkerScans: sync.Map{}, - hostTopology: detectHostTopology(), // Detect and cache host topology - vmStarters: vmStarters, - defaultHypervisor: defaultHypervisor, - now: time.Now, - writeFile: os.WriteFile, - meter: meter, - tracer: tracer, - guestMemoryPolicy: policy, - snapshotDefaults: snapshotDefaults, - compressionJobs: make(map[string]*compressionJob), - nativeCodecPaths: make(map[string]string), - lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize), + paths: p, + imageManager: imageManager, + systemManager: systemManager, + networkManager: networkManager, + deviceManager: deviceManager, + volumeManager: volumeManager, + limits: limits, + instanceLocks: sync.Map{}, + bootMarkerScans: sync.Map{}, + hostTopology: detectHostTopology(), // Detect and cache host topology + vmStarters: vmStarters, + defaultHypervisor: defaultHypervisor, + now: time.Now, + writeFile: os.WriteFile, + meter: meter, + tracer: tracer, + guestMemoryPolicy: policy, + snapshotDefaults: snapshotDefaults, + compressionJobs: make(map[string]*compressionJob), + nativeCodecPaths: make(map[string]string), + lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize), + guestAgentReadyProbe: probeGuestAgentReady, } m.deleteSnapshotFn = m.deleteSnapshot diff --git a/lib/instances/query.go b/lib/instances/query.go index 403d74cb..99d67d48 100644 --- a/lib/instances/query.go +++ b/lib/instances/query.go @@ -14,6 +14,7 @@ import ( "syscall" "time" + "github.com/kernel/hypeman/lib/guest" "github.com/kernel/hypeman/lib/healthcheck" "github.com/kernel/hypeman/lib/hypervisor" "github.com/kernel/hypeman/lib/instances/phasetracking" @@ -23,10 +24,12 @@ import ( // exitSentinelPrefix is the machine-parseable prefix written by init to serial console. const ( - exitSentinelPrefix = "HYPEMAN-EXIT " - programStartSentinelPrefix = "HYPEMAN-PROGRAM-START " - agentReadySentinelPrefix = "HYPEMAN-AGENT-READY " - bootMarkerRescanInterval = 1 * time.Second + exitSentinelPrefix = "HYPEMAN-EXIT " + programStartSentinelPrefix = "HYPEMAN-PROGRAM-START " + agentReadySentinelPrefix = "HYPEMAN-AGENT-READY " + bootMarkerRescanInterval = 1 * time.Second + guestAgentReadyProbeWait = 250 * time.Millisecond + guestAgentReadyProbeTimeout = 1 * time.Second // hypervisorStateCacheTTL bounds how long a cached hypervisor state may be // reused before a fresh GetVMInfo call is required. Short enough to detect // guest-driven shutdowns promptly, long enough that bursty list calls @@ -275,6 +278,8 @@ func advancePhaseIfRunning(stored *StoredMetadata) { } // hydrateBootMarkersFromLogs fills missing boot markers from serial logs. +// Guest-agent readiness also falls back to a direct vsock probe so systemd +// services do not need to forward stdout/stderr to the serial console. // Returns true when at least one missing marker was found and populated. func (m *manager) hydrateBootMarkersFromLogs(ctx context.Context, stored *StoredMetadata) bool { needProgram := stored.ProgramStartedAt == nil @@ -302,6 +307,9 @@ func (m *manager) hydrateBootMarkersFromLogs(ctx context.Context, stored *Stored stored.GuestAgentReadyAt = guestAgentReadyAt hydrated = true } + if needAgent && stored.GuestAgentReadyAt == nil && stored.ProgramStartedAt != nil && m.hydrateGuestAgentReadyFromProbe(ctx, stored) { + hydrated = true + } if hydrated { advancePhaseIfRunning(stored) m.clearBootMarkerRescan(stored.Id) @@ -401,6 +409,42 @@ func (m *manager) nowUTC() time.Time { return time.Now().UTC() } +func (m *manager) hydrateGuestAgentReadyFromProbe(ctx context.Context, stored *StoredMetadata) bool { + if stored == nil || stored.SkipGuestAgent || stored.GuestAgentReadyAt != nil { + return false + } + probe := m.guestAgentReadyProbe + if probe == nil { + probe = probeGuestAgentReady + } + if !probe(ctx, stored) { + return false + } + readyAt := m.nowUTC() + stored.GuestAgentReadyAt = &readyAt + return true +} + +func probeGuestAgentReady(ctx context.Context, stored *StoredMetadata) bool { + if stored == nil || stored.SkipGuestAgent { + return false + } + dialer, err := hypervisor.NewVsockDialer(stored.HypervisorType, stored.VsockSocket, stored.VsockCID) + if err != nil { + return false + } + + probeCtx, cancel := context.WithTimeout(ctx, guestAgentReadyProbeTimeout) + defer cancel() + + exit, err := guest.ExecIntoInstance(probeCtx, dialer, guest.ExecOptions{ + Command: []string{"/bin/true"}, + Timeout: int32(guestAgentReadyProbeTimeout / time.Second), + WaitForAgent: guestAgentReadyProbeWait, + }) + return err == nil && exit != nil && exit.Code == 0 +} + // appLogPathsForMarkerScan returns app log paths in chronological order // (oldest rotated file to newest active file). func (m *manager) appLogPathsForMarkerScan(id string) []string { @@ -654,6 +698,9 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) { meta.GuestAgentReadyAt = guestAgentReadyAt updated = true } + if needAgent && meta.GuestAgentReadyAt == nil && meta.ProgramStartedAt != nil && m.hydrateGuestAgentReadyFromProbe(ctx, &meta.StoredMetadata) { + updated = true + } if !updated { return } diff --git a/lib/instances/query_test.go b/lib/instances/query_test.go index 6bc8745b..8bb0bb46 100644 --- a/lib/instances/query_test.go +++ b/lib/instances/query_test.go @@ -1,6 +1,7 @@ package instances import ( + "context" "os" "path/filepath" "testing" @@ -404,6 +405,39 @@ func TestHydrateBootMarkersFromLogs_RescanThrottle(t *testing.T) { require.NotNil(t, meta.GuestAgentReadyAt) } +func TestHydrateBootMarkersUsesGuestAgentProbeWhenReadyMarkerMissing(t *testing.T) { + t.Parallel() + + tmpDir := t.TempDir() + readyAt := time.Date(2026, 3, 8, 12, 0, 2, 0, time.UTC) + probeCalls := 0 + m := &manager{ + paths: paths.New(tmpDir), + now: func() time.Time { return readyAt }, + guestAgentReadyProbe: func(context.Context, *StoredMetadata) bool { + probeCalls++ + return true + }, + } + + meta := &StoredMetadata{ + Id: "systemd-instance", + SkipGuestAgent: false, + } + logPath := m.paths.InstanceAppLog(meta.Id) + require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755)) + require.NoError(t, os.WriteFile(logPath, []byte( + "HYPEMAN-PROGRAM-START ts=2026-03-08T12:00:01Z mode=systemd\n", + ), 0o644)) + + hydrated := m.hydrateBootMarkersFromLogs(context.Background(), meta) + require.True(t, hydrated) + require.NotNil(t, meta.ProgramStartedAt) + require.NotNil(t, meta.GuestAgentReadyAt) + assert.Equal(t, readyAt.Format(time.RFC3339Nano), meta.GuestAgentReadyAt.UTC().Format(time.RFC3339Nano)) + assert.Equal(t, 1, probeCalls) +} + func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) { t.Parallel() diff --git a/lib/system/init/mode_systemd.go b/lib/system/init/mode_systemd.go index 9f47223f..8013cb80 100644 --- a/lib/system/init/mode_systemd.go +++ b/lib/system/init/mode_systemd.go @@ -94,8 +94,8 @@ ExecStart=/opt/hypeman/guest-agent EnvironmentFile=-/etc/hypeman/env Restart=always RestartSec=3 -StandardOutput=journal+console -StandardError=journal+console +StandardOutput=journal +StandardError=journal [Install] WantedBy=multi-user.target @@ -182,8 +182,8 @@ ExecStart=/opt/hypeman/hypeman-init --headers-worker-guest Nice=10 IOSchedulingClass=best-effort IOSchedulingPriority=7 -StandardOutput=journal+console -StandardError=journal+console +StandardOutput=journal +StandardError=journal [Install] WantedBy=multi-user.target diff --git a/lib/system/init/mode_systemd_test.go b/lib/system/init/mode_systemd_test.go index 725a4082..80ec3698 100644 --- a/lib/system/init/mode_systemd_test.go +++ b/lib/system/init/mode_systemd_test.go @@ -189,4 +189,7 @@ func TestInjectAgentServiceOmitsNetworkTargetDependency(t *testing.T) { data, err := os.ReadFile(servicePath) assert.NoError(t, err) assert.NotContains(t, string(data), "network.target") + assert.Contains(t, string(data), "StandardOutput=journal\n") + assert.Contains(t, string(data), "StandardError=journal\n") + assert.NotContains(t, string(data), "journal+console") }