Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ jobs:
local sleep_seconds=30
local n=1
while [ "$n" -le "$attempts" ]; do
if sudo apt-get update; then
if timeout 120s sudo apt-get update; then
Comment thread
sjmiller609 marked this conversation as resolved.
return 0
fi
if [ "$n" -eq "$attempts" ]; then
Expand All @@ -53,7 +53,7 @@ jobs:
! command -v qemu-system-x86_64 &> /dev/null || \
! qemu-system-x86_64 --version >/dev/null 2>&1; then
apt_update_with_retry
sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils
timeout 300s sudo apt-get install -y erofs-utils e2fsprogs iptables qemu-system-x86 qemu-utils
fi
go mod download

Expand All @@ -69,6 +69,13 @@ jobs:
sudo env "PATH=$TEST_PATH" bash -lc "command -v '$bin'"
done

- name: Create run-scoped temp directory
run: |
TEST_NETWORK_TMPDIR="/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}"
sudo rm -rf "$TEST_NETWORK_TMPDIR"
mkdir -p "$TEST_NETWORK_TMPDIR"
sudo chown -R "$(id -u):$(id -g)" "$TEST_NETWORK_TMPDIR"

# Avoids rate limits when running the tests
# Tests includes pulling, then converting to disk images
- name: Login to Docker Hub
Expand Down Expand Up @@ -106,6 +113,7 @@ jobs:

- name: Run tests
env:
HYPEMAN_TEST_NETWORK_TMPDIR: /tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}
GO_TEST_TIMEOUT: 600s
# Docker auth for tests running as root (sudo)
DOCKER_CONFIG: /home/debianuser/.docker
Expand All @@ -122,6 +130,10 @@ jobs:
export HYPEMAN_TEST_PREWARM_DIR="$HOME/.cache/hypeman-ci/linux-amd64"
make test TEST_TIMEOUT=20m

- name: Cleanup
if: always()
run: sudo rm -rf "/tmp/hm-net-${{ github.run_id }}-${{ github.run_attempt }}"

test-darwin:
runs-on: [self-hosted, macos, arm64]
concurrency:
Expand Down
2 changes: 2 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,14 @@ test-linux: ensure-ch-binaries ensure-firecracker-binaries ensure-caddy-binaries
if [ -n "$(TEST)" ]; then \
echo "Running specific test: $(TEST)"; \
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "CI=$${CI:-}" \
"HYPEMAN_TEST_NETWORK_TMPDIR=$${HYPEMAN_TEST_NETWORK_TMPDIR:-}" \
"HYPEMAN_TEST_PREWARM_DIR=$${HYPEMAN_TEST_PREWARM_DIR:-}" \
"HYPEMAN_TEST_PREWARM_STRICT=$${HYPEMAN_TEST_PREWARM_STRICT:-}" \
"HYPEMAN_TEST_REGISTRY=$${HYPEMAN_TEST_REGISTRY:-}" \
go test -tags containers_image_openpgp -run=$(TEST) $$VERBOSE_FLAG -timeout=$(TEST_TIMEOUT) ./...; \
else \
sudo env "PATH=$$TEST_PATH" "DOCKER_CONFIG=$${DOCKER_CONFIG:-$$HOME/.docker}" "CI=$${CI:-}" \
"HYPEMAN_TEST_NETWORK_TMPDIR=$${HYPEMAN_TEST_NETWORK_TMPDIR:-}" \
"HYPEMAN_TEST_PREWARM_DIR=$${HYPEMAN_TEST_PREWARM_DIR:-}" \
"HYPEMAN_TEST_PREWARM_STRICT=$${HYPEMAN_TEST_PREWARM_STRICT:-}" \
"HYPEMAN_TEST_REGISTRY=$${HYPEMAN_TEST_REGISTRY:-}" \
Expand Down
4 changes: 3 additions & 1 deletion cmd/api/api/snapshots.go
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,9 @@ func (s *ApiService) ForkSnapshot(ctx context.Context, request oapi.ForkSnapshot
return oapi.ForkSnapshot400JSONResponse{Code: "invalid_request", Message: "request body is required"}, nil
}

domainReq := instances.ForkSnapshotRequest{Name: request.Body.Name}
domainReq := instances.ForkSnapshotRequest{
Name: request.Body.Name,
}
if request.Body.TargetState != nil {
domainReq.TargetState = instances.State(*request.Body.TargetState)
}
Expand Down
57 changes: 57 additions & 0 deletions cmd/api/api/snapshots_test.go
Original file line number Diff line number Diff line change
@@ -1,14 +1,35 @@
package api

import (
"context"
"testing"
"time"

"github.com/kernel/hypeman/lib/hypervisor"
"github.com/kernel/hypeman/lib/instances"
"github.com/kernel/hypeman/lib/oapi"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

type captureForkSnapshotManager struct {
instances.Manager
lastID string
lastReq *instances.ForkSnapshotRequest
result *instances.Instance
err error
}

func (m *captureForkSnapshotManager) ForkSnapshot(ctx context.Context, snapshotID string, req instances.ForkSnapshotRequest) (*instances.Instance, error) {
reqCopy := req
m.lastID = snapshotID
m.lastReq = &reqCopy
if m.err != nil {
return nil, m.err
}
return m.result, nil
}

func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
t.Parallel()

Expand All @@ -30,3 +51,39 @@ func TestSnapshotScheduleToOAPIPreservesZeroMaxCount(t *testing.T) {
require.NotNil(t, out.Retention.MaxAge)
assert.Equal(t, "24h0m0s", *out.Retention.MaxAge)
}

func TestForkSnapshotSuccess(t *testing.T) {
t.Parallel()
svc := newTestService(t)

forked := &instances.Instance{
StoredMetadata: instances.StoredMetadata{
Id: "forked-instance",
Name: "forked-instance",
Image: "docker.io/library/alpine:latest",
CreatedAt: time.Now(),
HypervisorType: hypervisor.TypeFirecracker,
},
State: instances.StateRunning,
}
mockMgr := &captureForkSnapshotManager{
Manager: svc.InstanceManager,
result: forked,
}
svc.InstanceManager = mockMgr

resp, err := svc.ForkSnapshot(ctx(), oapi.ForkSnapshotRequestObject{
SnapshotId: "snap-123",
Body: &oapi.ForkSnapshotRequest{
Name: "forked-instance",
},
})
require.NoError(t, err)

created, ok := resp.(oapi.ForkSnapshot201JSONResponse)
require.True(t, ok, "expected 201 response")
assert.Equal(t, "forked-instance", created.Name)
assert.Equal(t, "snap-123", mockMgr.lastID)
require.NotNil(t, mockMgr.lastReq)
assert.Equal(t, "forked-instance", mockMgr.lastReq.Name)
}
17 changes: 17 additions & 0 deletions lib/forkvm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,23 @@ to work across implementations.
For networked forks, the fork gets a fresh host/guest identity (IP, MAC, TAP)
instead of reusing the source identity.

## Resume network handoff

Networked standby/running forks need a new host-side allocation, but the guest
memory snapshot still contains the source VM's old interface state. On restore,
Hypeman prepares the fork's TAP/IP/MAC before the VM resumes, then hands the new
guest network config to the guest through a small mailbox embedded in snapshot
memory. After resume, VMGenID tells the guest-agent that this is a restored VM;
the guest-agent reads the mailbox and applies the new MAC, address, route, and
neighbor state with netlink.

For API calls that return a running fork, Hypeman waits for a guest UDP
"applied" ack before returning, so the fast path still avoids making
host-initiated guest RPC/vsock contact as the first post-resume dependency. If
the mailbox path is unavailable, cannot start its ack listener, or does not ack
in time, restore falls back to the older host-initiated guest network
reconfigure path.

## Fork data copy behavior

- Guest directory copy is **sparse-only** for regular files.
Expand Down
111 changes: 111 additions & 0 deletions lib/guest/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,12 @@ func GetOrCreateConn(ctx context.Context, dialer hypervisor.VsockDialer) (*grpc.
}

// Create new connection using the VsockDialer
traceCtx := ctx
conn, err := grpc.Dial("passthrough:///vsock",
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
if span := trace.SpanFromContext(traceCtx); span.SpanContext().IsValid() {
ctx = trace.ContextWithSpan(ctx, span)
}
netConn, err := dialer.DialVsock(ctx, vsockGuestPort)
if err != nil {
return nil, &AgentVSockDialError{Err: err}
Expand Down Expand Up @@ -146,6 +150,113 @@ type ExecOptions struct {
ResizeChan <-chan *WindowSize // Optional: channel to receive resize events (pointer to avoid copying mutex)
}

type ReconfigureNetworkOptions struct {
InterfaceName string
MAC string
IPv4 string
Prefix uint32
Gateway string
WaitForAgent time.Duration
}

func ReconfigureNetworkInInstance(ctx context.Context, dialer hypervisor.VsockDialer, opts ReconfigureNetworkOptions) error {
if opts.WaitForAgent == 0 {
err := reconfigureNetworkOnce(ctx, dialer, opts)
if err != nil && isRetryableConnectionError(err) {
CloseConn(dialer.Key())
}
return err
}

ctx, span := otel.Tracer("hypeman/guest").Start(ctx, "guest.reconfigure_network", trace.WithAttributes(
attribute.Bool("wait_for_agent", true),
attribute.Int64("wait_for_agent_ms", opts.WaitForAgent.Milliseconds()),
))
defer span.End()

deadline := time.Now().Add(opts.WaitForAgent)
start := time.Now()
attempts := 0
retryableAttempts := 0
firstRetryableErrorType := ""
lastRetryableErrorType := ""
lastRetryInterval := time.Duration(0)

for {
attempts++
err := reconfigureNetworkOnce(ctx, dialer, opts)
if err == nil {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.SetStatus(otelcodes.Ok, "")
return nil
}
if !isRetryableConnectionError(err) {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
return err
}

retryableAttempts++
errType := retryableConnectionErrorType(err)
if firstRetryableErrorType == "" {
firstRetryableErrorType = errType
}
lastRetryableErrorType = errType
CloseConn(dialer.Key())

if time.Now().After(deadline) {
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
return err
}

retryInterval := guestExecRetryInterval(time.Since(start))
lastRetryInterval = retryInterval
select {
case <-ctx.Done():
recordGuestExecWait(span, start, attempts, retryableAttempts, firstRetryableErrorType, lastRetryableErrorType, lastRetryInterval)
span.RecordError(ctx.Err())
span.SetStatus(otelcodes.Error, ctx.Err().Error())
return ctx.Err()
case <-time.After(retryInterval):
}
}
}

func reconfigureNetworkOnce(ctx context.Context, dialer hypervisor.VsockDialer, opts ReconfigureNetworkOptions) error {
grpcConn, err := GetOrCreateConn(ctx, dialer)
if err != nil {
return fmt.Errorf("get grpc connection: %w", err)
}
client := NewGuestServiceClient(grpcConn)

_, span := otel.Tracer("hypeman/guest").Start(ctx, "guest.reconfigure_network.rpc")
_, err = client.ReconfigureNetwork(ctx, &ReconfigureNetworkRequest{
InterfaceName: opts.InterfaceName,
Mac: opts.MAC,
Ipv4: opts.IPv4,
Prefix: opts.Prefix,
Gateway: opts.Gateway,
})
finishGuestNetworkStepSpan(span, err)
if err != nil {
return fmt.Errorf("reconfigure network rpc: %w", err)
}
return nil
}

func finishGuestNetworkStepSpan(span trace.Span, err error) {
if err != nil {
span.RecordError(err)
span.SetStatus(otelcodes.Error, err.Error())
} else {
span.SetStatus(otelcodes.Ok, "")
}
span.End()
}

// ExecIntoInstance executes command in instance via vsock using gRPC.
// The dialer is a hypervisor-specific VsockDialer that knows how to connect to the guest.
// If WaitForAgent is set, it will retry on connection errors until the timeout.
Expand Down
Loading
Loading