Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 23 additions & 21 deletions lib/instances/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ type manager struct {
nativeCodecMu sync.Mutex
nativeCodecPaths map[string]string
imageUsageRecorder ImageUsageRecorder
guestAgentReadyProbe func(context.Context, *StoredMetadata) bool

// Shared lifecycle event subscriptions for internal consumers.
lifecycleEvents *lifecycleSubscribers
Expand Down Expand Up @@ -184,27 +185,28 @@ func NewManagerWithConfig(p *paths.Paths, imageManager images.Manager, systemMan
}

m := &manager{
paths: p,
imageManager: imageManager,
systemManager: systemManager,
networkManager: networkManager,
deviceManager: deviceManager,
volumeManager: volumeManager,
limits: limits,
instanceLocks: sync.Map{},
bootMarkerScans: sync.Map{},
hostTopology: detectHostTopology(), // Detect and cache host topology
vmStarters: vmStarters,
defaultHypervisor: defaultHypervisor,
now: time.Now,
writeFile: os.WriteFile,
meter: meter,
tracer: tracer,
guestMemoryPolicy: policy,
snapshotDefaults: snapshotDefaults,
compressionJobs: make(map[string]*compressionJob),
nativeCodecPaths: make(map[string]string),
lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize),
paths: p,
imageManager: imageManager,
systemManager: systemManager,
networkManager: networkManager,
deviceManager: deviceManager,
volumeManager: volumeManager,
limits: limits,
instanceLocks: sync.Map{},
bootMarkerScans: sync.Map{},
hostTopology: detectHostTopology(), // Detect and cache host topology
vmStarters: vmStarters,
defaultHypervisor: defaultHypervisor,
now: time.Now,
writeFile: os.WriteFile,
meter: meter,
tracer: tracer,
guestMemoryPolicy: policy,
snapshotDefaults: snapshotDefaults,
compressionJobs: make(map[string]*compressionJob),
nativeCodecPaths: make(map[string]string),
lifecycleEvents: newLifecycleSubscribersWithBufferSize(managerConfig.LifecycleEventBufferSize),
guestAgentReadyProbe: probeGuestAgentReady,
}
m.deleteSnapshotFn = m.deleteSnapshot

Expand Down
55 changes: 51 additions & 4 deletions lib/instances/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"syscall"
"time"

"github.com/kernel/hypeman/lib/guest"
"github.com/kernel/hypeman/lib/healthcheck"
"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances/phasetracking"
Expand All @@ -23,10 +24,12 @@ import (

// exitSentinelPrefix is the machine-parseable prefix written by init to serial console.
const (
exitSentinelPrefix = "HYPEMAN-EXIT "
programStartSentinelPrefix = "HYPEMAN-PROGRAM-START "
agentReadySentinelPrefix = "HYPEMAN-AGENT-READY "
bootMarkerRescanInterval = 1 * time.Second
exitSentinelPrefix = "HYPEMAN-EXIT "
programStartSentinelPrefix = "HYPEMAN-PROGRAM-START "
agentReadySentinelPrefix = "HYPEMAN-AGENT-READY "
bootMarkerRescanInterval = 1 * time.Second
guestAgentReadyProbeWait = 250 * time.Millisecond
guestAgentReadyProbeTimeout = 1 * time.Second
// hypervisorStateCacheTTL bounds how long a cached hypervisor state may be
// reused before a fresh GetVMInfo call is required. Short enough to detect
// guest-driven shutdowns promptly, long enough that bursty list calls
Expand Down Expand Up @@ -275,6 +278,8 @@ func advancePhaseIfRunning(stored *StoredMetadata) {
}

// hydrateBootMarkersFromLogs fills missing boot markers from serial logs.
// Guest-agent readiness also falls back to a direct vsock probe so systemd
// services do not need to forward stdout/stderr to the serial console.
// Returns true when at least one missing marker was found and populated.
func (m *manager) hydrateBootMarkersFromLogs(ctx context.Context, stored *StoredMetadata) bool {
needProgram := stored.ProgramStartedAt == nil
Expand Down Expand Up @@ -302,6 +307,9 @@ func (m *manager) hydrateBootMarkersFromLogs(ctx context.Context, stored *Stored
stored.GuestAgentReadyAt = guestAgentReadyAt
hydrated = true
}
if needAgent && stored.GuestAgentReadyAt == nil && stored.ProgramStartedAt != nil && m.hydrateGuestAgentReadyFromProbe(ctx, stored) {
hydrated = true
}
if hydrated {
advancePhaseIfRunning(stored)
m.clearBootMarkerRescan(stored.Id)
Expand Down Expand Up @@ -401,6 +409,42 @@ func (m *manager) nowUTC() time.Time {
return time.Now().UTC()
}

func (m *manager) hydrateGuestAgentReadyFromProbe(ctx context.Context, stored *StoredMetadata) bool {
if stored == nil || stored.SkipGuestAgent || stored.GuestAgentReadyAt != nil {
return false
}
probe := m.guestAgentReadyProbe
if probe == nil {
probe = probeGuestAgentReady
}
if !probe(ctx, stored) {
return false
}
readyAt := m.nowUTC()
stored.GuestAgentReadyAt = &readyAt
return true
}

func probeGuestAgentReady(ctx context.Context, stored *StoredMetadata) bool {
if stored == nil || stored.SkipGuestAgent {
return false
}
dialer, err := hypervisor.NewVsockDialer(stored.HypervisorType, stored.VsockSocket, stored.VsockCID)
if err != nil {
return false
}

probeCtx, cancel := context.WithTimeout(ctx, guestAgentReadyProbeTimeout)
defer cancel()

exit, err := guest.ExecIntoInstance(probeCtx, dialer, guest.ExecOptions{
Command: []string{"/bin/true"},
Timeout: int32(guestAgentReadyProbeTimeout / time.Second),
WaitForAgent: guestAgentReadyProbeWait,
})
return err == nil && exit != nil && exit.Code == 0
}

// appLogPathsForMarkerScan returns app log paths in chronological order
// (oldest rotated file to newest active file).
func (m *manager) appLogPathsForMarkerScan(id string) []string {
Expand Down Expand Up @@ -654,6 +698,9 @@ func (m *manager) persistBootMarkers(ctx context.Context, id string) {
meta.GuestAgentReadyAt = guestAgentReadyAt
updated = true
}
if needAgent && meta.GuestAgentReadyAt == nil && meta.ProgramStartedAt != nil && m.hydrateGuestAgentReadyFromProbe(ctx, &meta.StoredMetadata) {
updated = true
}
if !updated {
return
}
Expand Down
34 changes: 34 additions & 0 deletions lib/instances/query_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package instances

import (
"context"
"os"
"path/filepath"
"testing"
Expand Down Expand Up @@ -404,6 +405,39 @@ func TestHydrateBootMarkersFromLogs_RescanThrottle(t *testing.T) {
require.NotNil(t, meta.GuestAgentReadyAt)
}

func TestHydrateBootMarkersUsesGuestAgentProbeWhenReadyMarkerMissing(t *testing.T) {
t.Parallel()

tmpDir := t.TempDir()
readyAt := time.Date(2026, 3, 8, 12, 0, 2, 0, time.UTC)
probeCalls := 0
m := &manager{
paths: paths.New(tmpDir),
now: func() time.Time { return readyAt },
guestAgentReadyProbe: func(context.Context, *StoredMetadata) bool {
probeCalls++
return true
},
}

meta := &StoredMetadata{
Id: "systemd-instance",
SkipGuestAgent: false,
}
logPath := m.paths.InstanceAppLog(meta.Id)
require.NoError(t, os.MkdirAll(filepath.Dir(logPath), 0o755))
require.NoError(t, os.WriteFile(logPath, []byte(
"HYPEMAN-PROGRAM-START ts=2026-03-08T12:00:01Z mode=systemd\n",
), 0o644))

hydrated := m.hydrateBootMarkersFromLogs(context.Background(), meta)
require.True(t, hydrated)
require.NotNil(t, meta.ProgramStartedAt)
require.NotNil(t, meta.GuestAgentReadyAt)
assert.Equal(t, readyAt.Format(time.RFC3339Nano), meta.GuestAgentReadyAt.UTC().Format(time.RFC3339Nano))
assert.Equal(t, 1, probeCalls)
}

func TestParseBootMarkers_IgnoresStaleMarkersBeforeBootStart(t *testing.T) {
t.Parallel()

Expand Down
8 changes: 4 additions & 4 deletions lib/system/init/mode_systemd.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ ExecStart=/opt/hypeman/guest-agent
EnvironmentFile=-/etc/hypeman/env
Restart=always
RestartSec=3
StandardOutput=journal+console
StandardError=journal+console
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
Expand Down Expand Up @@ -182,8 +182,8 @@ ExecStart=/opt/hypeman/hypeman-init --headers-worker-guest
Nice=10
IOSchedulingClass=best-effort
IOSchedulingPriority=7
StandardOutput=journal+console
StandardError=journal+console
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target
Expand Down
3 changes: 3 additions & 0 deletions lib/system/init/mode_systemd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,7 @@ func TestInjectAgentServiceOmitsNetworkTargetDependency(t *testing.T) {
data, err := os.ReadFile(servicePath)
assert.NoError(t, err)
assert.NotContains(t, string(data), "network.target")
assert.Contains(t, string(data), "StandardOutput=journal\n")
assert.Contains(t, string(data), "StandardError=journal\n")
assert.NotContains(t, string(data), "journal+console")
}
Loading