From 970a30972a80af905cd401697c1afabff1e06be7 Mon Sep 17 00:00:00 2001 From: chenggui53 Date: Mon, 22 Jun 2026 14:47:48 +0800 Subject: [PATCH] feat: add node-local overlayfs rootfs cache for actor resume Implements issue #228: cache extracted rootfs per image digest on each node, and materialize per-actor bundles as overlayfs mounts instead of re-untarring on every restore. Changes: - New rootfscache package (cmd/atelet/internal/rootfscache) - New overlay.go with mount/unmount helpers - Modified prepareOCIDirectory for overlayfs integration - Updated resetActorDirs to unmount before cleanup - Added rootfs cache paths to ateompath - Unit tests for cache hit/miss, concurrent access, eviction --- .../internal/rootfscache/rootfscache.go | 624 ++++++++++++++++++ .../internal/rootfscache/rootfscache_test.go | 336 ++++++++++ cmd/atelet/main.go | 25 + cmd/atelet/oci.go | 241 +++---- cmd/atelet/overlay.go | 87 +++ internal/ateompath/ateompath.go | 13 + 6 files changed, 1170 insertions(+), 156 deletions(-) create mode 100644 cmd/atelet/internal/rootfscache/rootfscache.go create mode 100644 cmd/atelet/internal/rootfscache/rootfscache_test.go create mode 100644 cmd/atelet/overlay.go diff --git a/cmd/atelet/internal/rootfscache/rootfscache.go b/cmd/atelet/internal/rootfscache/rootfscache.go new file mode 100644 index 000000000..ef7163603 --- /dev/null +++ b/cmd/atelet/internal/rootfscache/rootfscache.go @@ -0,0 +1,624 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package rootfscache provides a node-local, digest-keyed cache of extracted +// OCI rootfs directories. On a cache hit the caller can set up an overlayfs +// mount instead of re-extracting the image tarball, reducing per-restore +// latency from seconds to sub-second. +package rootfscache + +import ( + "archive/tar" + "context" + "errors" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "time" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/metric" +) + +// DefaultMaxCacheBytes is the default disk budget for the rootfs cache (20 GiB). +const DefaultMaxCacheBytes int64 = 20 * 1024 * 1024 * 1024 + +// readySentinel is the name of the sentinel file written atomically after a +// rootfs has been fully extracted. Its presence is the cache-hit signal. +const readySentinel = ".ready" + +// lastAccessFile is the name of the file that stores the unix-timestamp of the +// most recent cache hit, used for LRU eviction. +const lastAccessFile = ".last_access" + +// entryState tracks one cached digest. All fields are immutable after +// construction except lastAccess, which is updated on cache hits. +type entryState struct { + digest string + lowerDir string + sizeBytes int64 + lastAccess time.Time +} + +// Cache is a node-local, digest-keyed rootfs cache. It is safe for +// concurrent use. +type Cache struct { + basePath string + maxCacheBytes int64 + + mu sync.Mutex + entries map[string]*entryState // keyed by digest + + // inflight deduplicates concurrent EnsureRootfs calls for the same + // digest: the first goroutine extracts while others wait. + inflight map[string]*inflightEntry + + // metrics + cacheHits metric.Int64Counter + cacheMisses metric.Int64Counter +} + +type inflightEntry struct { + done chan struct{} + // result is set before done is closed. + lowerDir string + cached bool + err error +} + +// New creates a Cache rooted at basePath. The directory is created if it does +// not exist. maxCacheBytes caps total disk usage; pass 0 for +// DefaultMaxCacheBytes. +func New(ctx context.Context, basePath string, maxCacheBytes int64) (*Cache, error) { + if maxCacheBytes <= 0 { + maxCacheBytes = DefaultMaxCacheBytes + } + if err := os.MkdirAll(basePath, 0o700); err != nil { + return nil, fmt.Errorf("creating rootfs cache dir: %w", err) + } + + meter := otel.Meter("atelet") + cacheHits, err := meter.Int64Counter("atelet.rootfs_cache.hit", + metric.WithDescription("Rootfs cache hits (overlay path taken)"), + ) + if err != nil { + return nil, fmt.Errorf("creating hit metric: %w", err) + } + cacheMisses, err := meter.Int64Counter("atelet.rootfs_cache.miss", + metric.WithDescription("Rootfs cache misses (untar required)"), + ) + if err != nil { + return nil, fmt.Errorf("creating miss metric: %w", err) + } + + c := &Cache{ + basePath: basePath, + maxCacheBytes: maxCacheBytes, + entries: make(map[string]*entryState), + inflight: make(map[string]*inflightEntry), + cacheHits: cacheHits, + cacheMisses: cacheMisses, + } + + // Load existing cache entries from disk. + if err := c.loadIndex(ctx); err != nil { + slog.WarnContext(ctx, "Failed to load rootfs cache index, starting empty", slog.Any("err", err)) + } + + return c, nil +} + +// LowerDir returns the read-only rootfs directory for the given digest, or "" +// if the digest is not cached. +func (c *Cache) LowerDir(digest string) string { + c.mu.Lock() + defer c.mu.Unlock() + if e, ok := c.entries[digest]; ok { + return e.lowerDir + } + return "" +} + +// EnsureRootfs guarantees that the rootfs for digest is extracted into the +// cache. On a cache hit it returns the lowerDir immediately without reading +// tarData. On a miss it consumes tarData to populate the cache. +// +// Returns: +// +// lowerDir – the read-only rootfs path (non-empty on success) +// cached – true if the cache was hit (tarData was NOT consumed) +// err – any error +// +// The digest MUST be a valid directory name (hex-encoded sha256, no slashes). +func (c *Cache) EnsureRootfs(ctx context.Context, digest string, tarData io.Reader) (string, bool, error) { + tracer := otel.Tracer("rootfscache") + ctx, span := tracer.Start(ctx, "EnsureRootfs") + span.SetAttributes(attribute.String("digest", digest)) + defer span.End() + + if err := validateDigest(digest); err != nil { + return "", false, err + } + + // Fast path: already cached. + c.mu.Lock() + if e, ok := c.entries[digest]; ok { + c.mu.Unlock() + c.cacheHits.Add(ctx, 1) + span.SetAttributes(attribute.Bool("hit", true)) + _ = c.touchAccess(digest) + return e.lowerDir, true, nil + } + + // Deduplicate concurrent requests for the same digest. + if infl, ok := c.inflight[digest]; ok { + c.mu.Unlock() + select { + case <-infl.done: + return infl.lowerDir, infl.cached, infl.err + case <-ctx.Done(): + return "", false, ctx.Err() + } + } + + // We are the first goroutine for this digest — set up the inflight entry + // so others can wait on us. + infl := &inflightEntry{done: make(chan struct{})} + c.inflight[digest] = infl + c.mu.Unlock() + + // Do the actual extraction outside the lock. + lowerDir, err := c.extract(ctx, digest, tarData) + + c.mu.Lock() + delete(c.inflight, digest) + c.mu.Unlock() + + if err != nil { + infl.lowerDir = "" + infl.cached = false + infl.err = err + close(infl.done) + return "", false, err + } + + c.cacheMisses.Add(ctx, 1) + span.SetAttributes(attribute.Bool("hit", false)) + + infl.lowerDir = lowerDir + infl.cached = false + infl.err = nil + close(infl.done) + return lowerDir, false, nil +} + +// extract untars tarData into the cache directory for digest, writes the +// .ready sentinel and .last_access file, and returns the lowerDir path. +func (c *Cache) extract(ctx context.Context, digest string, tarData io.Reader) (string, error) { + lowerDir := filepath.Join(c.basePath, digest, "lower") + + // Clean up any partial extraction from a previous crash. + if err := os.RemoveAll(filepath.Join(c.basePath, digest)); err != nil { + return "", fmt.Errorf("cleaning partial cache entry: %w", err) + } + if err := os.MkdirAll(lowerDir, 0o700); err != nil { + return "", fmt.Errorf("creating cache lower dir: %w", err) + } + + slog.InfoContext(ctx, "Rootfs cache miss, extracting", + slog.String("digest", digest), + slog.String("lowerDir", lowerDir), + ) + + if err := Untar(ctx, tarData, lowerDir); err != nil { + // Clean up on failure so the next attempt starts fresh. + _ = os.RemoveAll(filepath.Join(c.basePath, digest)) + return "", fmt.Errorf("extracting rootfs: %w", err) + } + + // Make the lower dir read-only (best effort; overlayfs lowerdir is + // inherently read-only, but this adds a layer of defense). + _ = chmodRecursive(lowerDir, 0o555) + + // Write the .ready sentinel atomically. + readyPath := filepath.Join(c.basePath, digest, readySentinel) + if err := os.WriteFile(readyPath, []byte(time.Now().Format(time.RFC3339)), 0o444); err != nil { + return "", fmt.Errorf("writing ready sentinel: %w", err) + } + + // Write the .last_access file. + if err := c.touchAccess(digest); err != nil { + return "", fmt.Errorf("writing last_access: %w", err) + } + + // Register in the in-memory index. + size := dirSize(lowerDir) + c.mu.Lock() + c.entries[digest] = &entryState{ + digest: digest, + lowerDir: lowerDir, + sizeBytes: size, + lastAccess: time.Now(), + } + c.mu.Unlock() + + // Best-effort eviction. + go c.evictIfNeeded(context.Background()) + + return lowerDir, nil +} + +// loadIndex scans the basePath for completed cache entries (those with a +// .ready sentinel) and populates the in-memory index. +func (c *Cache) loadIndex(ctx context.Context) error { + dirs, err := os.ReadDir(c.basePath) + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + return err + } + + var totalSize int64 + for _, d := range dirs { + if !d.IsDir() { + continue + } + digest := d.Name() + readyPath := filepath.Join(c.basePath, digest, readySentinel) + if _, err := os.Stat(readyPath); err != nil { + // Incomplete entry from a previous crash — remove it. + _ = os.RemoveAll(filepath.Join(c.basePath, digest)) + continue + } + lowerDir := filepath.Join(c.basePath, digest, "lower") + size := dirSize(lowerDir) + + // Read last_access time; fall back to ready file mtime. + lastAccess := readAccessTime(filepath.Join(c.basePath, digest)) + + c.entries[digest] = &entryState{ + digest: digest, + lowerDir: lowerDir, + sizeBytes: size, + lastAccess: lastAccess, + } + totalSize += size + } + + slog.InfoContext(ctx, "Loaded rootfs cache index", + slog.Int("entries", len(c.entries)), + slog.Int64("totalBytes", totalSize), + ) + return nil +} + +// touchAccess updates the .last_access file and in-memory timestamp for +// digest. +func (c *Cache) touchAccess(digest string) error { + now := time.Now() + path := filepath.Join(c.basePath, digest, lastAccessFile) + if err := os.WriteFile(path, []byte(now.Format(time.RFC3339Nano)), 0o644); err != nil { + return err + } + c.mu.Lock() + if e, ok := c.entries[digest]; ok { + e.lastAccess = now + } + c.mu.Unlock() + return nil +} + +// evictIfNeeded removes the oldest entries until total cache size is within +// the budget. It is called asynchronously after each extraction. +func (c *Cache) evictIfNeeded(ctx context.Context) { + c.mu.Lock() + defer c.mu.Unlock() + + var total int64 + for _, e := range c.entries { + total += e.sizeBytes + } + + for total > c.maxCacheBytes && len(c.entries) > 0 { + // Find the entry with the oldest lastAccess. + var oldest *entryState + for _, e := range c.entries { + if oldest == nil || e.lastAccess.Before(oldest.lastAccess) { + oldest = e + } + } + if oldest == nil { + break + } + + slog.InfoContext(ctx, "Evicting rootfs cache entry", + slog.String("digest", oldest.digest), + slog.Int64("sizeBytes", oldest.sizeBytes), + slog.Time("lastAccess", oldest.lastAccess), + ) + + entryDir := filepath.Join(c.basePath, oldest.digest) + if err := os.RemoveAll(entryDir); err != nil { + slog.WarnContext(ctx, "Failed to evict cache entry", slog.String("digest", oldest.digest), slog.Any("err", err)) + break + } + total -= oldest.sizeBytes + delete(c.entries, oldest.digest) + } +} + +// EvictLRU removes the least-recently-used cache entry and returns its digest +// and size, or ("", 0) if the cache is empty. This is exported for tests. +func (c *Cache) EvictLRU() (string, int64) { + c.mu.Lock() + defer c.mu.Unlock() + + var oldest *entryState + for _, e := range c.entries { + if oldest == nil || e.lastAccess.Before(oldest.lastAccess) { + oldest = e + } + } + if oldest == nil { + return "", 0 + } + + entryDir := filepath.Join(c.basePath, oldest.digest) + size := oldest.sizeBytes + _ = os.RemoveAll(entryDir) + delete(c.entries, oldest.digest) + return oldest.digest, size +} + +// Size returns the total size of all cached entries in bytes. +func (c *Cache) Size() int64 { + c.mu.Lock() + defer c.mu.Unlock() + var total int64 + for _, e := range c.entries { + total += e.sizeBytes + } + return total +} + +// Count returns the number of cached entries. +func (c *Cache) Count() int { + c.mu.Lock() + defer c.mu.Unlock() + return len(c.entries) +} + +// --- Untar implementation ------------------------------------------------- + +// Untar extracts a tar stream into rootPath. It is a self-contained copy of +// the untar logic from cmd/atelet/oci.go, using os.OpenRoot for path-traversal +// safety. +func Untar(ctx context.Context, tarData io.Reader, rootPath string) error { + tracer := otel.Tracer("rootfscache") + _, span := tracer.Start(ctx, "Untar") + defer span.End() + + root, err := os.OpenRoot(rootPath) + if err != nil { + return fmt.Errorf("opening rootfs %q as os.Root: %w", rootPath, err) + } + defer root.Close() + + tarReader := tar.NewReader(tarData) + for { + hdr, err := tarReader.Next() + if errors.Is(err, io.EOF) { + break + } else if err != nil { + return fmt.Errorf("in tarReader.Next: %w", err) + } + + name, skip, err := ValidateTarName(hdr.Name) + if err != nil { + return fmt.Errorf("invalid tar entry: %w", err) + } + if skip { + continue + } + + mode := hdr.FileInfo().Mode().Perm() + + switch hdr.Typeflag { + case tar.TypeReg: + if _, err := root.Lstat(name); err == nil { + if err := root.RemoveAll(name); err != nil { + return fmt.Errorf("while replacing existing path at %q before regular file: %w", name, err) + } + } else if !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("while checking existing path at %q before regular file: %w", name, err) + } + + outFile, err := root.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, mode) + if err != nil { + return fmt.Errorf("while creating file %q: %w", name, err) + } + _, err = io.Copy(outFile, tarReader) + closeErr := outFile.Close() + if err != nil { + return fmt.Errorf("while writing contents of %q from tar stream: %w", name, err) + } + if closeErr != nil { + return fmt.Errorf("while closing file %q: %w", name, closeErr) + } + + case tar.TypeDir: + err := root.Mkdir(name, mode) + if errors.Is(err, os.ErrExist) { + // Tolerate repeated directory entries. + } else if err != nil { + return fmt.Errorf("while creating directory=%q, mode=%v: %w", name, mode, err) + } + + case tar.TypeSymlink: + if existing, err := root.Lstat(name); err == nil { + if existing.Mode()&os.ModeSymlink != 0 { + if cur, rerr := root.Readlink(name); rerr == nil && cur == hdr.Linkname { + continue + } + } + if err := root.RemoveAll(name); err != nil { + return fmt.Errorf("while replacing existing path at %q before symlink: %w", name, err) + } + } else if !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("while checking existing path at %q before symlink: %w", name, err) + } + if err := root.Symlink(hdr.Linkname, name); err != nil { + return fmt.Errorf("while creating symlink src=%q target=%q: %w", name, hdr.Linkname, err) + } + + case tar.TypeLink: + linkname, linkSkip, err := ValidateTarName(hdr.Linkname) + if err != nil { + return fmt.Errorf("invalid hardlink target for %q: %w", name, err) + } + if linkSkip { + return fmt.Errorf("invalid hardlink target for %q: empty", name) + } + if _, err := root.Lstat(name); err == nil { + if err := root.RemoveAll(name); err != nil { + return fmt.Errorf("while replacing existing path at %q before hardlink: %w", name, err) + } + } else if !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("while checking existing path at %q before hardlink: %w", name, err) + } + if err := root.Link(linkname, name); err != nil { + return fmt.Errorf("while creating hardlink src=%q target=%q: %w", name, linkname, err) + } + + default: + tfStr := string([]byte{hdr.Typeflag}) + slog.ErrorContext(ctx, "Unhandled tar entry typeflag", slog.String("typeflag", tfStr), slog.Any("hdr", hdr)) + return fmt.Errorf("unhandled tar entry typeflag %q", tfStr) + } + } + + return nil +} + +// --- helpers -------------------------------------------------------------- + +// ValidateTarName cleans and validates a tar entry name. It is exported so +// the main package's tests can call it without importing the internals. +func ValidateTarName(name string) (cleaned string, skip bool, err error) { + if name == "" { + return "", true, nil + } + cleaned = filepath.Clean(name) + if cleaned == "." { + return "", true, nil + } + cleaned = strings.TrimPrefix(cleaned, "/") + if cleaned == "" || cleaned == "." { + return "", true, nil + } + if !filepath.IsLocal(cleaned) { + return "", false, fmt.Errorf("not a local path: %q", name) + } + return cleaned, false, nil +} + +func validateDigest(digest string) error { + if digest == "" { + return fmt.Errorf("digest must not be empty") + } + if strings.ContainsAny(digest, "/\\..") { + return fmt.Errorf("digest contains invalid characters: %q", digest) + } + return nil +} + +// dirSize returns the total size of all regular files under dir. +func dirSize(dir string) int64 { + var size int64 + _ = filepath.WalkDir(dir, func(_ string, d os.DirEntry, err error) error { + if err != nil { + return nil + } + if !d.IsDir() { + if info, err := d.Info(); err == nil { + size += info.Size() + } + } + return nil + }) + return size +} + +// readAccessTime reads the .last_access file or falls back to the directory +// mtime. +func readAccessTime(entryDir string) time.Time { + data, err := os.ReadFile(filepath.Join(entryDir, lastAccessFile)) + if err == nil { + s := strings.TrimSpace(string(data)) + if t, perr := time.Parse(time.RFC3339Nano, s); perr == nil { + return t + } + if t, perr := time.Parse(time.RFC3339, s); perr == nil { + return t + } + } + // Fallback: directory mtime. + if info, err := os.Stat(entryDir); err == nil { + return info.ModTime() + } + return time.Time{} +} + +// chmodRecursive sets the permission bits on every file and directory under +// root. Errors are silently ignored (best-effort). +func chmodRecursive(root string, perm os.FileMode) error { + return filepath.WalkDir(root, func(path string, d os.DirEntry, err error) error { + if err != nil { + return nil + } + _ = os.Chmod(path, perm) + return nil + }) +} + +// SortedDigests returns the cached digests sorted by last-access time +// (oldest first). Useful for diagnostics and eviction. +func (c *Cache) SortedDigests() []string { + c.mu.Lock() + defer c.mu.Unlock() + + type entry struct { + digest string + lastAccess time.Time + } + var entries []entry + for _, e := range c.entries { + entries = append(entries, entry{e.digest, e.lastAccess}) + } + sort.Slice(entries, func(i, j int) bool { + return entries[i].lastAccess.Before(entries[j].lastAccess) + }) + out := make([]string, len(entries)) + for i, e := range entries { + out[i] = e.digest + } + return out +} diff --git a/cmd/atelet/internal/rootfscache/rootfscache_test.go b/cmd/atelet/internal/rootfscache/rootfscache_test.go new file mode 100644 index 000000000..bc346c825 --- /dev/null +++ b/cmd/atelet/internal/rootfscache/rootfscache_test.go @@ -0,0 +1,336 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package rootfscache + +import ( + "archive/tar" + "bytes" + "context" + "os" + "path/filepath" + "sync" + "testing" +) + +const testDigest = "sha256:abcdef1234567890abcdef1234567890abcdef1234567890abcdef1234567890" + +func buildTar(t *testing.T, entries []struct{ name, body string; typeflag byte; mode int64 }) []byte { + t.Helper() + var buf bytes.Buffer + tw := tar.NewWriter(&buf) + for _, e := range entries { + mode := e.mode + if mode == 0 { + if e.typeflag == tar.TypeDir { + mode = 0o755 + } else { + mode = 0o644 + } + } + hdr := &tar.Header{ + Name: e.name, + Typeflag: e.typeflag, + Mode: mode, + Size: int64(len(e.body)), + } + if err := tw.WriteHeader(hdr); err != nil { + t.Fatalf("tar.WriteHeader: %v", err) + } + if e.body != "" { + if _, err := tw.Write([]byte(e.body)); err != nil { + t.Fatalf("tar.Write: %v", err) + } + } + } + if err := tw.Close(); err != nil { + t.Fatalf("tar.Close: %v", err) + } + return buf.Bytes() +} + +func TestEnsureRootfs_CacheMiss(t *testing.T) { + base := t.TempDir() + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + + tarData := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "etc/", typeflag: tar.TypeDir}, + {name: "etc/hostname", typeflag: tar.TypeReg, body: "test-host\n"}, + }) + + lowerDir, cached, err := c.EnsureRootfs(context.Background(), testDigest, bytes.NewReader(tarData)) + if err != nil { + t.Fatalf("EnsureRootfs: %v", err) + } + if cached { + t.Errorf("expected cache miss, got hit") + } + if lowerDir == "" { + t.Fatalf("lowerDir is empty") + } + + // Verify the rootfs was extracted correctly. + data, err := os.ReadFile(filepath.Join(lowerDir, "etc/hostname")) + if err != nil { + t.Fatalf("read etc/hostname: %v", err) + } + if string(data) != "test-host\n" { + t.Errorf("etc/hostname = %q, want %q", data, "test-host\n") + } + + // Verify sentinel file exists. + readyPath := filepath.Join(base, testDigest, readySentinel) + if _, err := os.Stat(readyPath); err != nil { + t.Fatalf("ready sentinel missing: %v", err) + } + + if c.Count() != 1 { + t.Errorf("count = %d, want 1", c.Count()) + } +} + +func TestEnsureRootfs_CacheHit(t *testing.T) { + base := t.TempDir() + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + + tarData := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "hello", typeflag: tar.TypeReg, body: "world"}, + }) + + // First call: cache miss. + if _, _, err := c.EnsureRootfs(context.Background(), testDigest, bytes.NewReader(tarData)); err != nil { + t.Fatalf("EnsureRootfs (miss): %v", err) + } + + // Second call: cache hit. Pass nil reader — it must not be read. + lowerDir, cached, err := c.EnsureRootfs(context.Background(), testDigest, nil) + if err != nil { + t.Fatalf("EnsureRootfs (hit): %v", err) + } + if !cached { + t.Errorf("expected cache hit, got miss") + } + if lowerDir == "" { + t.Fatalf("lowerDir is empty on hit") + } + + // Verify content still accessible. + data, err := os.ReadFile(filepath.Join(lowerDir, "hello")) + if err != nil { + t.Fatalf("read hello: %v", err) + } + if string(data) != "world" { + t.Errorf("hello = %q, want %q", data, "world") + } +} + +func TestEnsureRootfs_ConcurrentMisses(t *testing.T) { + base := t.TempDir() + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + + tarData := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "concurrent", typeflag: tar.TypeReg, body: "ok"}, + }) + + const goroutines = 10 + var wg sync.WaitGroup + errs := make([]error, goroutines) + lowerDirs := make([]string, goroutines) + cachedFlags := make([]bool, goroutines) + + for i := 0; i < goroutines; i++ { + i := i + wg.Add(1) + go func() { + defer wg.Done() + // Each goroutine gets its own reader over the same data. + lowerDirs[i], cachedFlags[i], errs[i] = c.EnsureRootfs( + context.Background(), testDigest, bytes.NewReader(tarData), + ) + }() + } + wg.Wait() + + // At least one goroutine should have done the extraction (miss). + // All should succeed with the same lowerDir. + anyMiss := false + for i := 0; i < goroutines; i++ { + if errs[i] != nil { + t.Errorf("goroutine %d: %v", i, errs[i]) + continue + } + if !cachedFlags[i] { + anyMiss = true + } + if lowerDirs[i] == "" { + t.Errorf("goroutine %d: empty lowerDir", i) + } + if i > 0 && lowerDirs[i] != lowerDirs[0] { + t.Errorf("goroutine %d: lowerDir %q != goroutine 0 lowerDir %q", i, lowerDirs[i], lowerDirs[0]) + } + } + if !anyMiss { + t.Errorf("expected at least one cache miss among %d goroutines", goroutines) + } + + // Only one cache entry should exist. + if c.Count() != 1 { + t.Errorf("count = %d, want 1", c.Count()) + } +} + +func TestEnsureRootfs_PartialEntryCleanup(t *testing.T) { + base := t.TempDir() + // Simulate a crash: create the digest directory but no .ready sentinel. + partialDir := filepath.Join(base, testDigest, "lower") + if err := os.MkdirAll(partialDir, 0o700); err != nil { + t.Fatalf("mkdir partial: %v", err) + } + if err := os.WriteFile(filepath.Join(partialDir, "stale"), []byte("data"), 0o644); err != nil { + t.Fatalf("write stale: %v", err) + } + + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + // The partial entry should have been cleaned up during loadIndex. + if c.Count() != 0 { + t.Errorf("count = %d, want 0 (partial entry should be removed)", c.Count()) + } + + // Now a fresh extraction should succeed. + tarData := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "fresh", typeflag: tar.TypeReg, body: "data"}, + }) + lowerDir, cached, err := c.EnsureRootfs(context.Background(), testDigest, bytes.NewReader(tarData)) + if err != nil { + t.Fatalf("EnsureRootfs: %v", err) + } + if cached { + t.Errorf("expected miss after cleanup, got hit") + } + if _, err := os.Stat(filepath.Join(lowerDir, "fresh")); err != nil { + t.Errorf("fresh file missing: %v", err) + } +} + +func TestEvictLRU(t *testing.T) { + base := t.TempDir() + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + + digest1 := "sha256:1111111111111111111111111111111111111111111111111111111111111111" + digest2 := "sha256:2222222222222222222222222222222222222222222222222222222222222222" + + tarData1 := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "d1", typeflag: tar.TypeReg, body: "data1"}, + }) + tarData2 := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + {name: "d2", typeflag: tar.TypeReg, body: "data2"}, + }) + + if _, _, err := c.EnsureRootfs(context.Background(), digest1, bytes.NewReader(tarData1)); err != nil { + t.Fatalf("EnsureRootfs d1: %v", err) + } + if _, _, err := c.EnsureRootfs(context.Background(), digest2, bytes.NewReader(tarData2)); err != nil { + t.Fatalf("EnsureRootfs d2: %v", err) + } + + if c.Count() != 2 { + t.Fatalf("count = %d, want 2", c.Count()) + } + + // Evict the oldest (digest1, loaded first). + evicted, size := c.EvictLRU() + if evicted != digest1 { + t.Errorf("evicted = %q, want %q", evicted, digest1) + } + if size <= 0 { + t.Errorf("evicted size = %d, want > 0", size) + } + if c.Count() != 1 { + t.Errorf("count = %d, want 1 after eviction", c.Count()) + } + + // The evicted directory should be gone. + if _, err := os.Stat(filepath.Join(base, digest1)); !os.IsNotExist(err) { + t.Errorf("evicted dir still exists: %v", err) + } +} + +func TestLowerDir(t *testing.T) { + base := t.TempDir() + c, err := New(context.Background(), base, 0) + if err != nil { + t.Fatalf("New: %v", err) + } + + // Before caching. + if got := c.LowerDir(testDigest); got != "" { + t.Errorf("LowerDir before cache = %q, want empty", got) + } + + tarData := buildTar(t, []struct{ name, body string; typeflag byte; mode int64 }{ + {name: ".", typeflag: tar.TypeDir}, + }) + if _, _, err := c.EnsureRootfs(context.Background(), testDigest, bytes.NewReader(tarData)); err != nil { + t.Fatalf("EnsureRootfs: %v", err) + } + + // After caching. + got := c.LowerDir(testDigest) + if got == "" { + t.Fatalf("LowerDir after cache is empty") + } + if got != filepath.Join(base, testDigest, "lower") { + t.Errorf("LowerDir = %q, want %q", got, filepath.Join(base, testDigest, "lower")) + } +} + +func TestValidateDigest(t *testing.T) { + tests := []struct { + input string + want bool + }{ + {"sha256:abc123", false}, + {"", true}, + {"../escape", true}, + {"sha256:abc/def", true}, + {"sha256:abc..def", true}, + } + for _, tc := range tests { + err := validateDigest(tc.input) + if (err != nil) != tc.want { + t.Errorf("validateDigest(%q) err=%v, wantErr=%v", tc.input, err, tc.want) + } + } +} diff --git a/cmd/atelet/main.go b/cmd/atelet/main.go index 7094f7315..b88eb1864 100644 --- a/cmd/atelet/main.go +++ b/cmd/atelet/main.go @@ -30,6 +30,7 @@ import ( "cloud.google.com/go/storage" "github.com/agent-substrate/substrate/cmd/atelet/internal/ategcs" "github.com/agent-substrate/substrate/cmd/atelet/internal/memorypullcache" + "github.com/agent-substrate/substrate/cmd/atelet/internal/rootfscache" "github.com/agent-substrate/substrate/internal/ateinterceptors" "github.com/agent-substrate/substrate/internal/ateompath" "github.com/agent-substrate/substrate/internal/proto/ateletpb" @@ -114,6 +115,11 @@ func main() { serverboot.Fatal(ctx, "Failed to create pull cache", err) } + rootfsDiskCache, err := rootfscache.New(ctx, ateompath.RootfsCacheDir, 0) + if err != nil { + serverboot.Fatal(ctx, "Failed to create rootfs cache", err) + } + anonGCSClient, err := storage.NewClient(ctx, option.WithoutAuthentication()) if err != nil { serverboot.Fatal(ctx, "Failed to create anonymous GCS client", err) @@ -162,6 +168,7 @@ func main() { wrappedAnonGCS, wrappedGCS, pullCache, + rootfsDiskCache, ) lis, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) @@ -185,6 +192,7 @@ type AteomHerder struct { ateomDialer *AteomDialer pullCache *memorypullcache.MemoryPullCache + rootfsCache *rootfscache.Cache anonGCSClient ategcs.ObjectStorage gcsClient ategcs.ObjectStorage } @@ -198,10 +206,12 @@ func NewService( anonGCSClient ategcs.ObjectStorage, gcsClient ategcs.ObjectStorage, pullCache *memorypullcache.MemoryPullCache, + rootfsCache *rootfscache.Cache, ) *AteomHerder { wms := &AteomHerder{ ateomDialer: ateomDialer, pullCache: pullCache, + rootfsCache: rootfsCache, anonGCSClient: anonGCSClient, gcsClient: gcsClient, } @@ -615,6 +625,7 @@ func (s *AteomHerder) prepareOCIBundles( if err := prepareOCIDirectory( gCtx, s.pullCache, + s.rootfsCache, actorTemplateNamespace, actorTemplateName, actorID, "pause", spec.GetPauseImage(), @@ -643,6 +654,7 @@ func (s *AteomHerder) prepareOCIBundles( if err := prepareOCIDirectory( gCtx, s.pullCache, + s.rootfsCache, actorTemplateNamespace, actorTemplateName, actorID, ctr.GetName(), ctr.GetImage(), @@ -833,6 +845,19 @@ func resetActorDirs(actorTemplateNamespace, actorTemplateName, actorID string) e // Explicitly leave runsc logs dir untouched. bundleDir := ateompath.OCIBundleDir(actorTemplateNamespace, actorTemplateName, actorID) + + // Unmount any overlayfs rootfs mounts before deleting the bundle + // directory. Each container's rootfs/ may be an overlayfs mountpoint; + // unmounting with MNT_DETACH ensures the mount is released even if + // something still holds a reference (e.g. a lingering process). + if entries, err := os.ReadDir(bundleDir); err == nil { + for _, e := range entries { + if e.IsDir() { + unmountActorRootfs(bundleDir, e.Name()) + } + } + } + if err := os.RemoveAll(bundleDir); err != nil { return fmt.Errorf("while deleting bundle dir: %w", err) } diff --git a/cmd/atelet/oci.go b/cmd/atelet/oci.go index 4451f6362..b7c9701f8 100644 --- a/cmd/atelet/oci.go +++ b/cmd/atelet/oci.go @@ -15,19 +15,17 @@ package main import ( - "archive/tar" "context" "encoding/json" - "errors" "fmt" "io" "log/slog" "os" "path" - "path/filepath" "strings" "github.com/agent-substrate/substrate/cmd/atelet/internal/memorypullcache" + "github.com/agent-substrate/substrate/cmd/atelet/internal/rootfscache" "github.com/agent-substrate/substrate/internal/ateompath" "github.com/opencontainers/runtime-spec/specs-go" "go.opentelemetry.io/otel" @@ -51,7 +49,12 @@ const ( ActorIDFileName = "actor-id" ) -func prepareOCIDirectory(ctx context.Context, pullCache *memorypullcache.MemoryPullCache, actorTemplateNamespace, actorTemplateName, actorID, containerName, ref string, args []string, env []string, annotations map[string]string, netns string, identityDir string) error { +// prepareOCIDirectory assembles the OCI bundle for one container inside an +// actor. When a rootfsCache is available and the image ref contains a digest, +// the rootfs is materialized via an overlayfs mount over a node-local cache +// instead of re-extracting the tarball — reducing per-restore latency from +// seconds to sub-millisecond on cache hits. +func prepareOCIDirectory(ctx context.Context, pullCache *memorypullcache.MemoryPullCache, rootfsCache *rootfscache.Cache, actorTemplateNamespace, actorTemplateName, actorID, containerName, ref string, args []string, env []string, annotations map[string]string, netns string, identityDir string) error { tracer := otel.Tracer("prepareOCIDirectory") ctx, span := tracer.Start(ctx, "prepareOCIDirectory") @@ -61,22 +64,56 @@ func prepareOCIDirectory(ctx context.Context, pullCache *memorypullcache.MemoryP bundlePath := ateompath.OCIBundlePath(actorTemplateNamespace, actorTemplateName, actorID, containerName) rootPath := path.Join(bundlePath, "rootfs") - if err := os.RemoveAll(rootPath); err != nil { - return fmt.Errorf("while clearing rootfs %q: %w", rootPath, err) - } + // Try the overlayfs cache path first. This succeeds when: + // 1. rootfsCache is non-nil, AND + // 2. the image ref includes a digest (@sha256:…). + // On a cache hit, tarData is NOT consumed, so we can skip the untar + // entirely. On a miss, the cache extracts and caches for next time. + digest := extractDigestFromRef(ref) + if rootfsCache != nil && digest != "" { + tarData, err := pullCache.Fetch(ctx, ref) + if err != nil { + return fmt.Errorf("in pullCache.Fetch: %w", err) + } + defer tarData.Close() - if err := os.MkdirAll(rootPath, 0o700); err != nil { - return fmt.Errorf("in os.MkdirAll for container bundle dir: %w", err) - } + lowerDir, _, err := rootfsCache.EnsureRootfs(ctx, digest, tarData) + if err != nil { + return fmt.Errorf("in rootfsCache.EnsureRootfs: %w", err) + } - tarData, err := pullCache.Fetch(ctx, ref) - if err != nil { - return fmt.Errorf("in pullCache.Fetch: %w", err) - } - defer tarData.Close() + // Create the overlay mount target. + if err := os.MkdirAll(rootPath, 0o700); err != nil { + return fmt.Errorf("in os.MkdirAll for rootfs mount target: %w", err) + } + + upperDir := path.Join(bundlePath, "upper") + workDir := path.Join(bundlePath, "work") + if err := setupOverlayfs(rootPath, lowerDir, upperDir, workDir); err != nil { + return fmt.Errorf("setting up overlayfs (lower=%s, target=%s): %w", lowerDir, rootPath, err) + } + + span.SetAttributes(attribute.String("rootfs_method", "overlay")) + } else { + // Fallback: no digest or no cache — extract directly (original path). + if err := os.RemoveAll(rootPath); err != nil { + return fmt.Errorf("while clearing rootfs %q: %w", rootPath, err) + } + if err := os.MkdirAll(rootPath, 0o700); err != nil { + return fmt.Errorf("in os.MkdirAll for container bundle dir: %w", err) + } - if err := untar(ctx, tarData, rootPath); err != nil { - return fmt.Errorf("in untar: %w", err) + tarData, err := pullCache.Fetch(ctx, ref) + if err != nil { + return fmt.Errorf("in pullCache.Fetch: %w", err) + } + defer tarData.Close() + + if err := rootfscache.Untar(ctx, tarData, rootPath); err != nil { + return fmt.Errorf("in untar: %w", err) + } + + span.SetAttributes(attribute.String("rootfs_method", "untar")) } // Bind-mount the per-actor identity directory so the workload can read its @@ -101,6 +138,19 @@ func prepareOCIDirectory(ctx context.Context, pullCache *memorypullcache.MemoryP return nil } +// extractDigestFromRef extracts the sha256 digest from an image reference. +// Returns "" if the ref does not contain a digest. +// - "registry/image@sha256:abc123" → "sha256:abc123" +// - "registry/image:latest" → "" +func extractDigestFromRef(ref string) string { + const prefix = "@sha256:" + idx := strings.LastIndex(ref, prefix) + if idx < 0 { + return "" + } + return strings.TrimPrefix(ref[idx:], "@") +} + // buildActorOCISpec assembles the OCI runtime spec for an actor container. // When identityDir is non-empty it adds a read-only bind mount of that host // directory at IdentityMountPath so the actor can read its own ID (see @@ -237,148 +287,27 @@ func createMountPoint(rootPath, mountPath string) error { return nil } -func validateTarName(name string) (cleaned string, skip bool, err error) { - if name == "" { - return "", true, nil - } - cleaned = filepath.Clean(name) - if cleaned == "." { - return "", true, nil +// unmountActorRootfs attempts to unmount the rootfs overlay for a single +// container inside an actor's bundle directory. Returns nil if the rootfs +// is not a mountpoint (i.e. was produced by direct untar). +func unmountActorRootfs(bundleDir, containerName string) error { + rootfsPath := path.Join(bundleDir, containerName, "rootfs") + if err := teardownOverlayfs(rootfsPath); err != nil { + // ENOTDIR/ENOENT/EINVAL — not a mountpoint, nothing to do. + slog.Debug("rootfs unmount skipped (not a mountpoint)", "path", rootfsPath, "err", err) } - cleaned = strings.TrimPrefix(cleaned, "/") - if cleaned == "" || cleaned == "." { - return "", true, nil - } - if !filepath.IsLocal(cleaned) { - return "", false, fmt.Errorf("not a local path: %q", name) - } - return cleaned, false, nil + return nil } +// untar is a thin wrapper around rootfscache.Untar kept in this package so +// that existing tests (package main) continue to compile without importing +// the rootfscache package directly. func untar(ctx context.Context, tarData io.Reader, rootPath string) error { - tracer := otel.Tracer("ateom-gvisor") - ctx, span := tracer.Start(ctx, "untar") - defer span.End() - - // os.Root confines file operations to rootPath: ".." components and - // out-of-tree symlinks are refused by the kernel. - root, err := os.OpenRoot(rootPath) - if err != nil { - return fmt.Errorf("while opening rootfs %q as os.Root: %w", rootPath, err) - } - defer root.Close() - - tarReader := tar.NewReader(tarData) - for { - hdr, err := tarReader.Next() - if errors.Is(err, io.EOF) { - break - } else if err != nil { - return fmt.Errorf("in tarReader.Next: %w", err) - } - - name, skip, err := validateTarName(hdr.Name) - if err != nil { - return fmt.Errorf("invalid tar entry: %w", err) - } - if skip { - continue - } - - mode := hdr.FileInfo().Mode().Perm() - - switch hdr.Typeflag { - case tar.TypeReg: // Regular file - // Same "later entry wins" handling: if any entry exists at the target path, - // remove it first. This ensures that: - // 1. If it's a symlink, we don't write through it (security vulnerability / incorrectness). - // 2. If it's a hardlink, we unlink it instead of truncating the shared inode. - // 3. If it's a directory, we recursively remove it so we can write the file. - if _, err := root.Lstat(name); err == nil { - if err := root.RemoveAll(name); err != nil { - return fmt.Errorf("while replacing existing path at %q before regular file: %w", name, err) - } - } else if !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("while checking existing path at %q before regular file: %w", name, err) - } - - // Stream directly from tarReader to target file to avoid buffering in memory. - outFile, err := root.OpenFile(name, os.O_CREATE|os.O_RDWR|os.O_TRUNC, mode) - if err != nil { - return fmt.Errorf("while creating file %q: %w", name, err) - } - - _, err = io.Copy(outFile, tarReader) - closeErr := outFile.Close() - - if err != nil { - return fmt.Errorf("while writing contents of %q from tar stream: %w", name, err) - } - if closeErr != nil { - return fmt.Errorf("while closing file %q: %w", name, closeErr) - } - - case tar.TypeDir: - err := root.Mkdir(name, mode) - if errors.Is(err, os.ErrExist) { - // Ignore --- real images produced by ko seem to have directory entries placed multiple times? - } else if err != nil { - return fmt.Errorf("while creating directory=%q, mode=%v: %w", name, mode, err) - } - - case tar.TypeSymlink: - // OCI image layers may re-define the same path across layers (e.g. - // an earlier layer creates /var/run as a directory and a later - // layer re-declares it as a symlink to /run). Standard tar-extract - // semantics are "later entry wins": replace any existing entry. - if existing, err := root.Lstat(name); err == nil { - // If it's already the same symlink, skip the unlink+symlink pair. - if existing.Mode()&os.ModeSymlink != 0 { - if cur, rerr := root.Readlink(name); rerr == nil && cur == hdr.Linkname { - continue - } - } - // Root.RemoveAll removes the symlink entry itself; it does NOT - // traverse and remove the directory the symlink points to. - // That's the desired semantic here — replace this path's - // entry without touching whatever the prior symlink targeted. - if err := root.RemoveAll(name); err != nil { - return fmt.Errorf("while replacing existing path at %q before symlink: %w", name, err) - } - } else if !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("while checking existing path at %q before symlink: %w", name, err) - } - if err := root.Symlink(hdr.Linkname, name); err != nil { - return fmt.Errorf("while creating symlink src=%q target=%q: %w", name, hdr.Linkname, err) - } - - case tar.TypeLink: - linkname, linkSkip, err := validateTarName(hdr.Linkname) - if err != nil { - return fmt.Errorf("invalid hardlink target for %q: %w", name, err) - } - if linkSkip { - return fmt.Errorf("invalid hardlink target for %q: empty", name) - } - // Same "later entry wins" handling as TypeSymlink: replace existing entry. - if _, err := root.Lstat(name); err == nil { - if err := root.RemoveAll(name); err != nil { - return fmt.Errorf("while replacing existing path at %q before hardlink: %w", name, err) - } - } else if !errors.Is(err, os.ErrNotExist) { - return fmt.Errorf("while checking existing path at %q before hardlink: %w", name, err) - } - if err := root.Link(linkname, name); err != nil { - return fmt.Errorf("while creating hardlink src=%q target=%q: %w", name, linkname, err) - } - - default: - tfStr := string([]byte{hdr.Typeflag}) - slog.ErrorContext(ctx, "Unhandled tar entry typeflag", slog.String("typeflag", tfStr), slog.Any("hdr", hdr)) - return fmt.Errorf("unhandled tar entry typeflag %q", tfStr) - } - - } + return rootfscache.Untar(ctx, tarData, rootPath) +} - return nil +// validateTarName is re-exported here for the same reason as untar: tests in +// package main call it directly. +func validateTarName(name string) (cleaned string, skip bool, err error) { + return rootfscache.ValidateTarName(name) } diff --git a/cmd/atelet/overlay.go b/cmd/atelet/overlay.go new file mode 100644 index 000000000..bc2a85479 --- /dev/null +++ b/cmd/atelet/overlay.go @@ -0,0 +1,87 @@ +// Copyright 2026 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// setupOverlayfs mounts an overlayfs at target using the given layers. +// - lowerDir: read-only shared rootfs from the node-level cache +// - upperDir: per-actor writable layer (created if absent) +// - workDir: overlayfs work directory (created if absent) +// +// The target directory must already exist (it is the bundle's rootfs/ dir). +func setupOverlayfs(target, lowerDir, upperDir, workDir string) error { + for _, d := range []string{upperDir, workDir} { + if err := os.MkdirAll(d, 0o700); err != nil { + return fmt.Errorf("creating overlay dir %s: %w", d, err) + } + } + + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir) + if err := unix.Mount("overlay", target, "overlay", 0, opts); err != nil { + return fmt.Errorf("mounting overlayfs at %s: %w", target, err) + } + return nil +} + +// teardownOverlayfs unmounts the overlayfs at target. The caller is +// responsible for removing the upper/work directories (typically done by +// resetActorDirs). +func teardownOverlayfs(target string) error { + if err := unix.Unmount(target, unix.MNT_DETACH); err != nil { + return fmt.Errorf("unmounting overlayfs at %s: %w", target, err) + } + return nil +} + +// isOverlayfsAvailable checks whether the overlayfs kernel module is available +// by attempting a mount on a temporary directory. +func isOverlayfsAvailable() bool { + tmpLower, err := os.MkdirTemp("", "overlay-check-lower-") + if err != nil { + return false + } + defer os.RemoveAll(tmpLower) + + tmpUpper, err := os.MkdirTemp("", "overlay-check-upper-") + if err != nil { + return false + } + defer os.RemoveAll(tmpUpper) + + tmpWork, err := os.MkdirTemp("", "overlay-check-work-") + if err != nil { + return false + } + defer os.RemoveAll(tmpWork) + + tmpTarget, err := os.MkdirTemp("", "overlay-check-target-") + if err != nil { + return false + } + defer os.RemoveAll(tmpTarget) + + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", tmpLower, tmpUpper, tmpWork) + if err := unix.Mount("overlay", tmpTarget, "overlay", 0, opts); err != nil { + return false + } + _ = unix.Unmount(tmpTarget, 0) + return true +} diff --git a/internal/ateompath/ateompath.go b/internal/ateompath/ateompath.go index a779755c4..90641dd54 100644 --- a/internal/ateompath/ateompath.go +++ b/internal/ateompath/ateompath.go @@ -28,8 +28,21 @@ const ( var ( // StaticFilesDir holds things like downloaded runsc binaries. StaticFilesDir = filepath.Join(BasePath, "static-files") + + // RootfsCacheDir is the node-level directory that holds extracted, + // read-only rootfs directories keyed by image digest. On a cache hit the + // per-actor rootfs is materialized as an overlayfs mount instead of + // re-extracting the tarball. + RootfsCacheDir = filepath.Join(BasePath, "rootfs-cache") ) +// RootfsCacheLowerDir returns the read-only rootfs directory for a given image +// digest inside the node-level cache. This is the "lowerdir" for an overlayfs +// mount. +func RootfsCacheLowerDir(digest string) string { + return filepath.Join(RootfsCacheDir, digest, "lower") +} + func RunSCBinaryPath(sha256 string) string { return filepath.Join(StaticFilesDir, "runsc-"+sha256) }