From ec6ab8323fb3d676a0c030b244d74a5907e7b107 Mon Sep 17 00:00:00 2001 From: Pratik Patel Date: Fri, 8 May 2026 12:25:37 -0700 Subject: [PATCH 1/4] fix(BRE2-940): additional Nebius capacity error mapping --- v1/providers/nebius/errors.go | 12 ++++++ v1/providers/nebius/errors_test.go | 68 ++++++++++++++++++++++++++++++ v1/providers/nebius/instance.go | 9 ++-- 3 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 v1/providers/nebius/errors_test.go diff --git a/v1/providers/nebius/errors.go b/v1/providers/nebius/errors.go index baef441..5811b35 100644 --- a/v1/providers/nebius/errors.go +++ b/v1/providers/nebius/errors.go @@ -6,6 +6,7 @@ import ( v1 "github.com/brevdev/cloud/v1" "github.com/nebius/gosdk/operations" + "github.com/nebius/gosdk/serviceerror" "google.golang.org/grpc/codes" "google.golang.org/grpc/status" ) @@ -37,6 +38,17 @@ func handleErrToCloudErr(e error) error { if e == nil { return nil } + var serviceErr *serviceerror.Error + if errors.As(e, &serviceErr) { + for _, detail := range serviceErr.Details { + switch detail.(type) { + case *serviceerror.NotEnoughResources: + return v1.ErrInsufficientResources + case *serviceerror.QuotaFailure: + return v1.ErrOutOfQuota + } + } + } // Check for Nebius operations.Error for ResourceExhausted (returned by operation.Wait on async failures) var opErr *operations.Error if errors.As(e, &opErr) { diff --git a/v1/providers/nebius/errors_test.go b/v1/providers/nebius/errors_test.go new file mode 100644 index 0000000..65ed55a --- /dev/null +++ b/v1/providers/nebius/errors_test.go @@ -0,0 +1,68 @@ +package v1 + +import ( + "errors" + "testing" + + cloudv1 "github.com/brevdev/cloud/v1" + common "github.com/nebius/gosdk/proto/nebius/common/v1" + "github.com/nebius/gosdk/serviceerror" + "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +func TestHandleErrToCloudErrMapsNotEnoughResourcesToInsufficientResources(t *testing.T) { + t.Parallel() + + err := &serviceerror.Error{ + Wrapped: status.Error(codes.ResourceExhausted, "operation failed"), + Details: []serviceerror.Detail{ + serviceerror.NewDetail(&common.ServiceError{ + Service: "compute", + Code: "NotEnoughResources", + Details: &common.ServiceError_NotEnoughResources{ + NotEnoughResources: &common.NotEnoughResources{ + Violations: []*common.NotEnoughResources_Violation{ + { + ResourceType: "virtualMachine", + Requested: "1gpu-16vcpu-64gb", + Message: "VM schedule timeout, most likely due to insufficient hardware resources", + }, + }, + }, + }, + }), + }, + } + + require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrInsufficientResources)) +} + +func TestHandleErrToCloudErrMapsQuotaFailureToOutOfQuota(t *testing.T) { + t.Parallel() + + err := &serviceerror.Error{ + Wrapped: status.Error(codes.ResourceExhausted, "operation failed"), + Details: []serviceerror.Detail{ + serviceerror.NewDetail(&common.ServiceError{ + Service: "compute", + Code: "QuotaFailure", + Details: &common.ServiceError_QuotaFailure{ + QuotaFailure: &common.QuotaFailure{ + Violations: []*common.QuotaFailure_Violation{ + { + Quota: "compute.instance.gpu.h100", + Limit: "0", + Requested: "1", + Message: "quota exceeded", + }, + }, + }, + }, + }), + }, + } + + require.True(t, errors.Is(handleErrToCloudErr(err), cloudv1.ErrOutOfQuota)) +} diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 6bc058e..ab76e2e 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -868,7 +868,6 @@ func matchesTagFilters(instanceTags map[string]string, tagFilters map[string][]s return true } -//nolint:dupl // StopInstance and StartInstance have similar structure but different operations func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error { c.logger.Debug(ctx, "initiating instance stop operation", v1.LogField("instanceID", instanceID)) @@ -906,7 +905,6 @@ func (c *NebiusClient) StopInstance(ctx context.Context, instanceID v1.CloudProv return nil } -//nolint:dupl // StartInstance and StopInstance have similar structure but different operations func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudProviderInstanceID) error { c.logger.Debug(ctx, "initiating instance start operation", v1.LogField("instanceID", instanceID)) @@ -916,17 +914,18 @@ func (c *NebiusClient) StartInstance(ctx context.Context, instanceID v1.CloudPro Id: string(instanceID), }) if err != nil { - return fmt.Errorf("failed to initiate instance start: %w", err) + return fmt.Errorf("failed to initiate instance start: %w", handleErrToCloudErr(err)) } // Wait for the start operation to complete finalOp, err := operation.Wait(ctx) if err != nil { - return fmt.Errorf("failed to wait for instance start: %w", err) + return fmt.Errorf("failed to wait for instance start: %w", handleErrToCloudErr(err)) } if !finalOp.Successful() { - return fmt.Errorf("instance start failed: %v", finalOp.Status()) + statusErr := fmt.Errorf("instance start failed: %v", finalOp.Status()) + return handleErrToCloudErr(statusErr) } c.logger.Debug(ctx, "start operation completed, waiting for instance to reach RUNNING state", From d684d75663f7f5d1f49b2021c01232c1bd900fba Mon Sep 17 00:00:00 2001 From: Pratik Patel Date: Fri, 8 May 2026 17:26:47 -0700 Subject: [PATCH 2/4] fix ufw issue --- v1/providers/nebius/instance.go | 97 +++++++++++++++++++++------- v1/providers/nebius/instance_test.go | 20 ++++++ 2 files changed, 93 insertions(+), 24 deletions(-) diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index ab76e2e..d2c05de 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1582,7 +1582,6 @@ func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) script := `#cloud-config packages: - ufw - - iptables-persistent ` // Add SSH key configuration if provided @@ -1594,33 +1593,19 @@ packages: var commands []string - // Fix a systemd race condition: ufw.service and netfilter-persistent.service - // both start in parallel (both are Before=network-pre.target with no mutual - // ordering). Both call iptables-restore concurrently, and with the iptables-nft - // backend the competing nftables transactions cause UFW to fail with - // "iptables-restore: line 4 failed". This drop-in forces UFW to wait for - // netfilter-persistent to finish first. - commands = append(commands, - "sudo mkdir -p /etc/systemd/system/ufw.service.d", - `printf '[Unit]\nAfter=netfilter-persistent.service\n' | sudo tee /etc/systemd/system/ufw.service.d/after-netfilter.conf > /dev/null`, - "sudo systemctl daemon-reload", - ) + // Install an idempotent Docker firewall hook before enabling UFW. Some + // Nebius images start Docker after cloud-init runcmd; Docker creates or + // resets DOCKER-USER during startup, so the rules need to be re-applied after + // docker.service starts instead of only once during runcmd. + commands = append(commands, generateDockerFirewallInstallCommands()...) // Generate UFW firewall commands (similar to Shadeform's approach) // UFW (Uncomplicated Firewall) is available on Ubuntu/Debian instances commands = append(commands, generateUFWCommands(firewallRules)...) - // Generate IPTables firewall commands to ensure docker ports are not made immediately - // accessible from the internet by default. - commands = append(commands, generateIPTablesCommands()...) - - // Save the complete iptables state (UFW chains + DOCKER-USER rules) so it - // survives instance stop/start cycles. Cloud-init runcmd only executes on - // first boot; on subsequent boots netfilter-persistent restores this snapshot, - // then UFW starts after it (due to the drop-in above) and re-applies its rules. - // This provides defense-in-depth: even if UFW fails for any reason, the - // netfilter-persistent snapshot ensures port 22 and DOCKER-USER rules persist. - commands = append(commands, "sudo netfilter-persistent save") + // Apply immediately for images where Docker is already running. The + // docker.service ExecStartPost hook handles images where Docker starts later. + commands = append(commands, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true") if len(commands) > 0 { // Use runcmd to execute firewall setup commands @@ -1662,11 +1647,75 @@ func generateUFWCommands(firewallRules v1.FirewallRules) []string { return commands } +const ( + // Keep these paths stable: they are useful operator touchpoints when + // debugging instance firewall state with systemctl/cat/iptables. + dockerFirewallScriptPath = "/usr/local/sbin/brev-apply-docker-firewall.sh" + dockerServiceDropInDir = "/etc/systemd/system/docker.service.d" + dockerFirewallDropInPath = dockerServiceDropInDir + "/10-brev-firewall.conf" +) + +func generateDockerFirewallInstallCommands() []string { + // Docker published ports are not governed by UFW's INPUT policy. Docker adds + // NAT/FORWARD rules that can make `docker run -p host:container` reachable + // from the public internet even when UFW says incoming traffic is denied. + // + // DOCKER-USER is Docker's documented filter hook for this traffic. The + // ordering is important: some Nebius images run cloud-init before Docker has + // created DOCKER-USER, and Docker may create/reset the chain during daemon + // startup. We therefore install both: + // - an immediate cloud-init run for images where Docker is already active + // - a docker.service ExecStartPost hook for images where Docker starts later + // + // The generated script exits successfully even if an iptables command fails + // because failing Docker startup would be worse operationally. Validation + // tests assert that the rule set is actually present and blocks published + // ports. + scriptLines := append([]string{ + "#!/bin/sh", + "set +e", + }, generateIPTablesCommands()...) + scriptLines = append(scriptLines, "exit 0") + + return []string{ + generatePrintfToFileCommand(scriptLines, dockerFirewallScriptPath), + "sudo chmod 0755 " + dockerFirewallScriptPath, + "sudo mkdir -p " + dockerServiceDropInDir, + generatePrintfToFileCommand([]string{ + "[Service]", + "ExecStartPost=" + dockerFirewallScriptPath, + }, dockerFirewallDropInPath), + "sudo systemctl daemon-reload", + } +} + +func generatePrintfToFileCommand(lines []string, path string) string { + quotedLines := make([]string, 0, len(lines)) + for _, line := range lines { + quotedLines = append(quotedLines, shellSingleQuote(line)) + } + + return fmt.Sprintf("printf '%%s\\n' %s | sudo tee %s > /dev/null", strings.Join(quotedLines, " "), path) +} + +func shellSingleQuote(value string) string { + return "'" + strings.ReplaceAll(value, "'", `'\''`) + "'" +} + // generateIPTablesCommands generates IPTables firewall commands to ensure docker ports are not made immediately // accessible from the internet by default. func generateIPTablesCommands() []string { commands := []string{ - "iptables -F DOCKER-USER", + // CPU images can run cloud-init before Docker has created DOCKER-USER. + // Create it first so the immediate cloud-init run succeeds; when Docker + // starts later, the systemd ExecStartPost hook above re-applies the same + // rules after Docker has finished creating/resetting its own chains. + "iptables -N DOCKER-USER 2>/dev/null || true", + "iptables -F DOCKER-USER || true", + + // Preserve already-established connections, then allow container bridge + // egress and intra-bridge traffic. The final DROP below is what prevents + // externally-published Docker ports from being reachable by default. "iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT", "iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT", "iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT", diff --git a/v1/providers/nebius/instance_test.go b/v1/providers/nebius/instance_test.go index 6550484..3fb4c46 100644 --- a/v1/providers/nebius/instance_test.go +++ b/v1/providers/nebius/instance_test.go @@ -84,6 +84,26 @@ func TestNebiusClient_MergeInstanceForUpdate(t *testing.T) { assert.Equal(t, newInstance.Status, merged.Status) } +func TestGenerateCloudInitUserDataInstallsDockerFirewallHook(t *testing.T) { + script := generateCloudInitUserData("ssh-rsa test", v1.FirewallRules{}) + + assert.NotContains(t, script, "iptables-persistent") + assert.NotContains(t, script, "netfilter-persistent") + assert.Contains(t, script, "/usr/local/sbin/brev-apply-docker-firewall.sh") + assert.Contains(t, script, "/etc/systemd/system/docker.service.d") + assert.Contains(t, script, "ExecStartPost=/usr/local/sbin/brev-apply-docker-firewall.sh") + assert.Contains(t, script, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true") +} + +func TestGenerateIPTablesCommandsCreateDockerUserChainBeforeFlush(t *testing.T) { + commands := generateIPTablesCommands() + + assert.GreaterOrEqual(t, len(commands), 2) + assert.Equal(t, "iptables -N DOCKER-USER 2>/dev/null || true", commands[0]) + assert.Equal(t, "iptables -F DOCKER-USER || true", commands[1]) + assert.Contains(t, strings.Join(commands, "\n"), "iptables -A DOCKER-USER -j DROP") +} + // BenchmarkCreateInstance benchmarks the CreateInstance method func BenchmarkCreateInstance(b *testing.B) { b.Skip("CreateInstance requires real SDK initialization - use integration tests instead") From 590764d30a328d40123f70878e8650c2f90746fa Mon Sep 17 00:00:00 2001 From: Pratik Patel Date: Fri, 8 May 2026 18:09:08 -0700 Subject: [PATCH 3/4] cleaner --- v1/providers/nebius/instance.go | 103 ++++++------------ v1/providers/nebius/instance_test.go | 24 ++-- .../nebius/scripts/10-brev-firewall.conf | 2 + .../scripts/brev-apply-docker-firewall.sh | 18 +++ 4 files changed, 72 insertions(+), 75 deletions(-) create mode 100644 v1/providers/nebius/scripts/10-brev-firewall.conf create mode 100644 v1/providers/nebius/scripts/brev-apply-docker-firewall.sh diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index d2c05de..322e32f 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -2,6 +2,8 @@ package v1 import ( "context" + _ "embed" + "encoding/base64" "fmt" "strings" "time" @@ -20,6 +22,12 @@ const ( nebiusCPUImageFamily = "ubuntu24.04-driverless" ) +//go:embed scripts/brev-apply-docker-firewall.sh +var dockerFirewallScript string + +//go:embed scripts/10-brev-firewall.conf +var dockerFirewallDropIn string + //nolint:gocyclo,funlen // Complex instance creation with resource management func (c *NebiusClient) CreateInstance(ctx context.Context, attrs v1.CreateInstanceAttrs) (*v1.Instance, error) { // Track created resources for automatic cleanup on failure @@ -1576,7 +1584,7 @@ func (c *NebiusClient) cleanupOrphanedBootDisks(ctx context.Context, testID stri } // generateCloudInitUserData generates a cloud-init user-data script for SSH key injection and firewall configuration -// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init instead of base64 scripts +// This is inspired by Shadeform's LaunchConfiguration approach but uses cloud-init directly. func generateCloudInitUserData(publicKey string, firewallRules v1.FirewallRules) string { // Start with cloud-init header script := `#cloud-config @@ -1591,13 +1599,11 @@ packages: `, publicKey) } + script += generateDockerFirewallWriteFiles() + var commands []string - // Install an idempotent Docker firewall hook before enabling UFW. Some - // Nebius images start Docker after cloud-init runcmd; Docker creates or - // resets DOCKER-USER during startup, so the rules need to be re-applied after - // docker.service starts instead of only once during runcmd. - commands = append(commands, generateDockerFirewallInstallCommands()...) + commands = append(commands, "sudo systemctl daemon-reload") // Generate UFW firewall commands (similar to Shadeform's approach) // UFW (Uncomplicated Firewall) is available on Ubuntu/Debian instances @@ -1648,14 +1654,18 @@ func generateUFWCommands(firewallRules v1.FirewallRules) []string { } const ( - // Keep these paths stable: they are useful operator touchpoints when - // debugging instance firewall state with systemctl/cat/iptables. + // Keep these generated paths stable: cloud-init, systemd, and validation + // tests all depend on this Docker firewall handoff. dockerFirewallScriptPath = "/usr/local/sbin/brev-apply-docker-firewall.sh" dockerServiceDropInDir = "/etc/systemd/system/docker.service.d" dockerFirewallDropInPath = dockerServiceDropInDir + "/10-brev-firewall.conf" ) -func generateDockerFirewallInstallCommands() []string { +func generateDockerFirewallWriteFiles() string { + // This function emits the only write_files block in this cloud-config. If + // another generated file is added later, merge it into this block instead of + // adding a second top-level write_files key. + // // Docker published ports are not governed by UFW's INPUT policy. Docker adds // NAT/FORWARD rules that can make `docker run -p host:container` reachable // from the public internet even when UFW says incoming traffic is denied. @@ -1671,65 +1681,22 @@ func generateDockerFirewallInstallCommands() []string { // because failing Docker startup would be worse operationally. Validation // tests assert that the rule set is actually present and blocks published // ports. - scriptLines := append([]string{ - "#!/bin/sh", - "set +e", - }, generateIPTablesCommands()...) - scriptLines = append(scriptLines, "exit 0") - - return []string{ - generatePrintfToFileCommand(scriptLines, dockerFirewallScriptPath), - "sudo chmod 0755 " + dockerFirewallScriptPath, - "sudo mkdir -p " + dockerServiceDropInDir, - generatePrintfToFileCommand([]string{ - "[Service]", - "ExecStartPost=" + dockerFirewallScriptPath, - }, dockerFirewallDropInPath), - "sudo systemctl daemon-reload", - } -} - -func generatePrintfToFileCommand(lines []string, path string) string { - quotedLines := make([]string, 0, len(lines)) - for _, line := range lines { - quotedLines = append(quotedLines, shellSingleQuote(line)) - } - - return fmt.Sprintf("printf '%%s\\n' %s | sudo tee %s > /dev/null", strings.Join(quotedLines, " "), path) -} - -func shellSingleQuote(value string) string { - return "'" + strings.ReplaceAll(value, "'", `'\''`) + "'" -} - -// generateIPTablesCommands generates IPTables firewall commands to ensure docker ports are not made immediately -// accessible from the internet by default. -func generateIPTablesCommands() []string { - commands := []string{ - // CPU images can run cloud-init before Docker has created DOCKER-USER. - // Create it first so the immediate cloud-init run succeeds; when Docker - // starts later, the systemd ExecStartPost hook above re-applies the same - // rules after Docker has finished creating/resetting its own chains. - "iptables -N DOCKER-USER 2>/dev/null || true", - "iptables -F DOCKER-USER || true", - - // Preserve already-established connections, then allow container bridge - // egress and intra-bridge traffic. The final DROP below is what prevents - // externally-published Docker ports from being reachable by default. - "iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT", - "iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT", - "iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT", - "iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT", // TODO: add these back in when we have a way to test it - "iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT", - "iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT", - "iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT", - "iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT", - "iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT", - "iptables -A DOCKER-USER -i lo -j ACCEPT", - "iptables -A DOCKER-USER -j DROP", - "iptables -A DOCKER-USER -j RETURN", // Expected by Docker - } - return commands + // + // UFW persists its own rules in /etc/ufw; only DOCKER-USER needed a Docker + // startup hook after removing netfilter-persistent. + return fmt.Sprintf(` +write_files: + - path: %s + owner: root:root + permissions: '0755' + encoding: b64 + content: %s + - path: %s + owner: root:root + permissions: '0644' + encoding: b64 + content: %s +`, dockerFirewallScriptPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallScript)), dockerFirewallDropInPath, base64.StdEncoding.EncodeToString([]byte(dockerFirewallDropIn))) } // convertIngressRuleToUFW converts an ingress firewall rule to UFW command(s) diff --git a/v1/providers/nebius/instance_test.go b/v1/providers/nebius/instance_test.go index 3fb4c46..7b05dbc 100644 --- a/v1/providers/nebius/instance_test.go +++ b/v1/providers/nebius/instance_test.go @@ -89,19 +89,29 @@ func TestGenerateCloudInitUserDataInstallsDockerFirewallHook(t *testing.T) { assert.NotContains(t, script, "iptables-persistent") assert.NotContains(t, script, "netfilter-persistent") + assert.Contains(t, script, "write_files:") + assert.Contains(t, script, "encoding: b64") assert.Contains(t, script, "/usr/local/sbin/brev-apply-docker-firewall.sh") assert.Contains(t, script, "/etc/systemd/system/docker.service.d") - assert.Contains(t, script, "ExecStartPost=/usr/local/sbin/brev-apply-docker-firewall.sh") assert.Contains(t, script, "sudo /usr/local/sbin/brev-apply-docker-firewall.sh || true") + assert.NotContains(t, script, "content: |") + assert.NotContains(t, script, " #!/bin/sh") + assert.NotContains(t, script, "ExecStartPost=/usr/local/sbin/brev-apply-docker-firewall.sh") + assert.NotContains(t, script, "printf '%s\\n'") + assert.NotContains(t, script, "| sudo tee") } -func TestGenerateIPTablesCommandsCreateDockerUserChainBeforeFlush(t *testing.T) { - commands := generateIPTablesCommands() +func TestDockerFirewallScriptCreatesDockerUserChainBeforeFlush(t *testing.T) { + createChainIndex := strings.Index(dockerFirewallScript, "iptables -N DOCKER-USER") + flushChainIndex := strings.Index(dockerFirewallScript, "iptables -F DOCKER-USER") - assert.GreaterOrEqual(t, len(commands), 2) - assert.Equal(t, "iptables -N DOCKER-USER 2>/dev/null || true", commands[0]) - assert.Equal(t, "iptables -F DOCKER-USER || true", commands[1]) - assert.Contains(t, strings.Join(commands, "\n"), "iptables -A DOCKER-USER -j DROP") + assert.Greater(t, createChainIndex, -1) + assert.Greater(t, flushChainIndex, createChainIndex) + assert.Contains(t, dockerFirewallScript, "iptables -A DOCKER-USER -j DROP") +} + +func TestDockerFirewallDropInIgnoresExecStartPostFailure(t *testing.T) { + assert.Contains(t, dockerFirewallDropIn, "ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh") } // BenchmarkCreateInstance benchmarks the CreateInstance method diff --git a/v1/providers/nebius/scripts/10-brev-firewall.conf b/v1/providers/nebius/scripts/10-brev-firewall.conf new file mode 100644 index 0000000..ac8134c --- /dev/null +++ b/v1/providers/nebius/scripts/10-brev-firewall.conf @@ -0,0 +1,2 @@ +[Service] +ExecStartPost=-/usr/local/sbin/brev-apply-docker-firewall.sh diff --git a/v1/providers/nebius/scripts/brev-apply-docker-firewall.sh b/v1/providers/nebius/scripts/brev-apply-docker-firewall.sh new file mode 100644 index 0000000..001797c --- /dev/null +++ b/v1/providers/nebius/scripts/brev-apply-docker-firewall.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +iptables -N DOCKER-USER 2>/dev/null || true +iptables -F DOCKER-USER || true +iptables -A DOCKER-USER -m conntrack --ctstate ESTABLISHED,RELATED -j ACCEPT +iptables -A DOCKER-USER -i docker0 ! -o docker0 -j ACCEPT +iptables -A DOCKER-USER -i br+ ! -o br+ -j ACCEPT +iptables -A DOCKER-USER -i cni+ ! -o cni+ -j ACCEPT +iptables -A DOCKER-USER -i cali+ ! -o cali+ -j ACCEPT +iptables -A DOCKER-USER -i docker0 -o docker0 -j ACCEPT +iptables -A DOCKER-USER -i br+ -o br+ -j ACCEPT +iptables -A DOCKER-USER -i cni+ -o cni+ -j ACCEPT +iptables -A DOCKER-USER -i cali+ -o cali+ -j ACCEPT +iptables -A DOCKER-USER -i lo -j ACCEPT +iptables -A DOCKER-USER -j DROP +iptables -A DOCKER-USER -j RETURN + +exit 0 From 1fbbb2beb73d640c26e873d64b62ec450b9ca122 Mon Sep 17 00:00:00 2001 From: Pratik Patel Date: Mon, 11 May 2026 09:26:02 -0700 Subject: [PATCH 4/4] review feedback --- v1/providers/nebius/instance.go | 19 +++++++++++-------- v1/providers/shadeform/firewall.go | 5 +++++ v1/providers/shadeform/firewall_test.go | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 v1/providers/shadeform/firewall_test.go diff --git a/v1/providers/nebius/instance.go b/v1/providers/nebius/instance.go index 322e32f..bfc5262 100644 --- a/v1/providers/nebius/instance.go +++ b/v1/providers/nebius/instance.go @@ -1657,6 +1657,12 @@ const ( // Keep these generated paths stable: cloud-init, systemd, and validation // tests all depend on this Docker firewall handoff. dockerFirewallScriptPath = "/usr/local/sbin/brev-apply-docker-firewall.sh" + + // This is a docker.service drop-in because the firewall rules must be + // re-applied immediately after Docker initializes or resets DOCKER-USER. If + // we need a separately inspectable status surface later, this can move to a + // named oneshot unit such as brev-docker-firewall.service; for now the + // execution is visible through docker.service journal/status output. dockerServiceDropInDir = "/etc/systemd/system/docker.service.d" dockerFirewallDropInPath = dockerServiceDropInDir + "/10-brev-firewall.conf" ) @@ -1670,20 +1676,17 @@ func generateDockerFirewallWriteFiles() string { // NAT/FORWARD rules that can make `docker run -p host:container` reachable // from the public internet even when UFW says incoming traffic is denied. // - // DOCKER-USER is Docker's documented filter hook for this traffic. The - // ordering is important: some Nebius images run cloud-init before Docker has - // created DOCKER-USER, and Docker may create/reset the chain during daemon - // startup. We therefore install both: - // - an immediate cloud-init run for images where Docker is already active - // - a docker.service ExecStartPost hook for images where Docker starts later + // DOCKER-USER is Docker's documented filter hook for this traffic. The script + // ensures the chain exists before configuring it. If Docker already created + // the chain, the create command fails harmlessly and the script continues. // // The generated script exits successfully even if an iptables command fails // because failing Docker startup would be worse operationally. Validation // tests assert that the rule set is actually present and blocks published // ports. // - // UFW persists its own rules in /etc/ufw; only DOCKER-USER needed a Docker - // startup hook after removing netfilter-persistent. + // UFW persists its own rules in /etc/ufw; Docker firewall rules are applied + // through cloud-init and the docker.service post-start hook. return fmt.Sprintf(` write_files: - path: %s diff --git a/v1/providers/shadeform/firewall.go b/v1/providers/shadeform/firewall.go index 7fc7a38..49bc4e4 100644 --- a/v1/providers/shadeform/firewall.go +++ b/v1/providers/shadeform/firewall.go @@ -15,6 +15,10 @@ const ( ufwDefaultAllowPort2222 = "ufw allow 2222/tcp" ufwForceEnable = "ufw --force enable" + // Ensure DOCKER-USER exists before clearing it. Docker normally creates this + // chain, but firewall setup can run before Docker has initialized iptables. + ipTablesCreateDockerUserChain = "iptables -N DOCKER-USER || true" + // Clear DOCKER-USER policy. ipTablesResetDockerUserChain = "iptables -F DOCKER-USER" @@ -83,6 +87,7 @@ func (c *ShadeformClient) getUFWCommands(firewallRules v1.FirewallRules) []strin func (c *ShadeformClient) getIPTablesCommands() []string { commands := []string{ + ipTablesCreateDockerUserChain, ipTablesResetDockerUserChain, ipTablesAllowDockerUserOutbound, ipTablesAllowDockerUserOutboundInit0, diff --git a/v1/providers/shadeform/firewall_test.go b/v1/providers/shadeform/firewall_test.go new file mode 100644 index 0000000..744416f --- /dev/null +++ b/v1/providers/shadeform/firewall_test.go @@ -0,0 +1,19 @@ +package v1 + +import ( + "strings" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestShadeformIPTablesCommandsCreateDockerUserChainBeforeFlush(t *testing.T) { + client := &ShadeformClient{} + commands := strings.Join(client.getIPTablesCommands(), "\n") + + createChainIndex := strings.Index(commands, "iptables -N DOCKER-USER") + flushChainIndex := strings.Index(commands, "iptables -F DOCKER-USER") + + assert.Greater(t, createChainIndex, -1) + assert.Greater(t, flushChainIndex, createChainIndex) +}