feat: run-on-login scheduling, reliability fixes, scheduler diagnostics#146
Conversation
| out := make([]byte, 0, 2+len(u16)*2) | ||
| out = append(out, 0xFF, 0xFE) // UTF-16LE BOM | ||
| for _, u := range u16 { | ||
| out = append(out, byte(u), byte(u>>8)) |
| if runtime.GOOS == "darwin" { | ||
| runHookStateReconcile(exec, log) |
There was a problem hiding this comment.
I think we can use constants that we have defined for Darwin (same for linux and windows) to keep the codebase consistent.
| // applyBatterySettings re-imports the named task's XML with the power / | ||
| // missed-run / retry settings that `schtasks /create` cannot set from the | ||
| // command line: DisallowStartIfOnBatteries=false, StopIfGoingOnBatteries=false, | ||
| // StartWhenAvailable=true, and RestartOnFailure (15 min, 3 attempts). The OS | ||
| // defaults (battery-restricted, no catch-up, 1-minute retry) otherwise leave a | ||
| // laptop on battery silently never firing the task and retry too aggressively. | ||
| // Flow: query the task's generated XML, force the Settings, and re-import via | ||
| // `schtasks /create /xml`. Pure schtasks.exe — no dependency on the PowerShell | ||
| // ScheduledTasks module (absent on Server Core). Best-effort: any failure | ||
| // leaves the task with default settings, which is still better than no task. | ||
| func applyBatterySettings(ctx context.Context, exec executor.Executor, log *progress.Logger, name string) { |
There was a problem hiding this comment.
I think we should log the xml configuration that is being applied (if we are not doing it yet) - even at debug level should be okay.
| case "darwin": | ||
| // `launchctl list <label>` prints a "PID" key only while the job runs. | ||
| out, _, code, err := exec.RunWithTimeout(context.Background(), jobStateProbeTimeout, "launchctl", "list", launchd.Label) | ||
| if err != nil || code != 0 { | ||
| return false, false | ||
| } | ||
| return !strings.Contains(out, `"PID"`), true | ||
| case "linux": | ||
| // is-active prints active/activating while the oneshot service runs; | ||
| // inactive/failed when idle. Fixed strings, not localized. | ||
| out, _, _, err := exec.RunWithTimeout(context.Background(), jobStateProbeTimeout, "systemctl", "--user", "is-active", systemd.ServiceUnitName()) | ||
| if err != nil { | ||
| return false, false | ||
| } | ||
| state := strings.TrimSpace(out) | ||
| if state == "" { | ||
| return false, false | ||
| } | ||
| running := state == "active" || state == "activating" | ||
| return !running, true | ||
| case "windows": |
There was a problem hiding this comment.
For all these cases for platform we can use the constants, that we already have. I think its defined in /internal/model.
| // schedulerJobIdle reports whether the scheduler job/task is CONFIRMED not | ||
| // currently running. The second return is false when the state can't be | ||
| // determined (probe failed / unsupported), so callers fail safe to "install". | ||
| // The signals are locale-independent: launchctl's "PID" key, systemctl's fixed | ||
| // active/activating states, and Get-ScheduledTask's State enum (NOT schtasks | ||
| // /query's localized Status field). | ||
| func schedulerJobIdle(exec executor.Executor) (idle, known bool) { |
There was a problem hiding this comment.
I think we should also log the exact command that was exec'd at debug level - in case something doesn't works properly we can just take the command as it is that DMG ran and then test it on our testing machines.
| // Scheduler info — first tracked phase. Gathers launchd/schtasks/systemd | ||
| // state (interval, last/next run, RunAtLoad, last exit, missed runs) for | ||
| // troubleshooting; see docs/launchd-troubleshooting.md. Best-effort — | ||
| // schedinfo.Gather uses short per-query timeouts and never errors. Like | ||
| // device_info it completes before the "started" post (so the first | ||
| // heartbeat includes it), but postPhase() is intentionally NOT called | ||
| // here: the backend has no run-status row to upsert into until that post, | ||
| // so this phase's completion ships in the first postPhase() below. | ||
| schedCtx, schedCancel := startPhase(ctx, tracker, "scheduler_info") | ||
| log.Progress("Gathering scheduler information...") | ||
| schedinfo.Log(schedinfo.Gather(schedCtx, exec), log) |
There was a problem hiding this comment.
So the first phase now will be "Scheduler Information Gathering" and then with the initial heartbeats it would be sent right? Will we be sure that even if the whole telemetry is not sent we would still have this information with us - so that we can at least debug by looking at how the scheduler was configured?
| // decodeTaskXML converts the bytes emitted by `schtasks /query /xml` to a UTF-8 | ||
| // string. That output is typically UTF-16LE with a BOM; UTF-8 (with or without | ||
| // BOM) and plain ASCII are handled too. | ||
| func decodeTaskXML(raw string) string { | ||
| b := []byte(raw) | ||
| switch { | ||
| case len(b) >= 2 && b[0] == 0xFF && b[1] == 0xFE: // UTF-16LE BOM | ||
| u16 := make([]uint16, 0, (len(b)-2)/2) | ||
| for i := 2; i+1 < len(b); i += 2 { | ||
| u16 = append(u16, uint16(b[i])|uint16(b[i+1])<<8) | ||
| } | ||
| return string(utf16.Decode(u16)) | ||
| case len(b) >= 3 && b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF: // UTF-8 BOM | ||
| return string(b[3:]) | ||
| default: | ||
| return raw | ||
| } | ||
| } |
There was a problem hiding this comment.
This function looks a bit complicated, I am not sure what it is doing, but if we can extend unit test to cover all the paths it would be better.
It also has a lot of arbitrary numbers and values so if we can comment on how this logic works it would be easy to understand a bit more.
What does this PR do?
Type of change
Testing
./stepsecurity-dev-machine-guard --verbose./stepsecurity-dev-machine-guard --json | python3 -m json.toolmake lintmake testRelated Issues