Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions testsuite/fleettest.json.example
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,23 @@
"this target mirrors), configure_flags. Optional (with defaults): make (\"make\"),",
"python (\"python3\"), rsync_bin (\"rsync\"; \"rsync.exe\" on Cygwin), privilege",
"(\"root\" | \"sudo\" | \"user\"), pipe_jobs/tcp_jobs (8), builddir (\"rsync-citest\",",
"relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols.",
"relative to the remote $HOME), env_prefix, configure_pre, nonroot, protocols,",
"max_retry.",
"",
"nonroot: true reruns -- as the non-root ssh user, after the sudo runs -- the",
"tests that declare `fleet_nonroot = True` at module level (so the set is",
"maintained in the test files, not here).",
"",
"protocols: [30, 29] adds one extra stdio-pipe test pass per listed version,",
"each run with runtests --protocol=N (the fleet analogue of a workflow's",
"check30/check29 steps) and shown as a protoNN column. Keys starting with",
"\"_\" are comments. See testsuite/README.md."
"check30/check29 steps) and shown as a protoNN column.",
"",
"max_retry: N (default 0) re-runs each failed test on its own up to N more",
"times and drops any that then pass (listed under RECOVERED, not hidden). Use",
"on a slow/loaded box where concurrency-sensitive tests occasionally flake,",
"instead of dropping the whole target to a lower pipe_jobs/tcp_jobs.",
"",
"Keys starting with \"_\" are comments. See testsuite/README.md."
],
"targets": [
{
Expand All @@ -40,12 +47,13 @@
"--disable-xxhash", "--disable-lz4"]
},
{
"_comment": "Nested-VM OpenBSD occasionally flakes a daemon/tcp test under load; max_retry re-runs just the failed test rather than throttling the whole box (tcp_jobs/pipe_jobs are still available if you prefer that).",
"name": "openbsd",
"ssh_host": "root@openbsd",
"workflow": "openbsd-build.yml",
"make": "gmake",
"configure_pre": "export AUTOCONF_VERSION=2.71 AUTOMAKE_VERSION=1.16;",
"tcp_jobs": 2,
"max_retry": 2,
"configure_flags": ["--with-rrsync", "--disable-zstd", "--disable-md2man",
"--disable-xxhash", "--disable-lz4"]
},
Expand Down
83 changes: 74 additions & 9 deletions testsuite/fleettest.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ class Target:
# stdio-pipe pass with runtests --protocol=N (the fleet analogue of a
# workflow's check30/check29 steps). e.g. [30, 29]. Empty => proto pass off.
protocols: list[int] = dataclasses.field(default_factory=list)
# Per-target retry budget for FLAKY tests: after a run, each failed test is
# re-run on its own up to max_retry more times, and any that then pass are
# dropped from the failure list (and reported as "recovered", never hidden).
# Use on a slow/loaded box where concurrency-sensitive tests occasionally
# flake, instead of dropping the whole target to a lower -j. 0 => no retry.
max_retry: int = 0


def load_fleet(path: Path) -> list[Target]:
Expand Down Expand Up @@ -283,17 +289,22 @@ def build_script(t: Target) -> str:


def test_script(t: Target, transport: str, skip_csv: str | None, jobs: int,
protocol: int | None = None) -> str:
protocol: int | None = None, only: list[str] | None = None) -> str:
rb = f'--rsync-bin="$PWD/{t.rsync_bin}"'
tcp = " --use-tcp" if transport == "tcp" else ""
# protocol forces an older wire version (mirrors `make check30`/`check29`).
proto = f" --protocol={protocol}" if protocol is not None else ""
# PYTHONDONTWRITEBYTECODE: don't drop root-owned __pycache__/*.pyc into the
# tree (a sudo run would, breaking the next non-root push --delete).
env = "PYTHONDONTWRITEBYTECODE=1 "
if skip_csv:
# Named tests (a max_retry re-run) make runtests full_run False, so the
# expected-skip list does not apply -- only the named tests' pass/fail matter.
names = ""
if only:
names = " " + " ".join(only)
elif skip_csv:
env += f"RSYNC_EXPECT_SKIPPED={skip_csv} "
runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}'
runtests = f'{t.python} runtests.py {rb}{tcp}{proto} -j {jobs}{names}'
# env_prefix (e.g. a brew PATH) must reach the test too: some tests build a
# helper binary on the fly (a test may invoke `make`, which needs gawk etc.),
# so the build tools must be on PATH at test time.
Expand Down Expand Up @@ -349,6 +360,10 @@ class TransportResult:
skip_expected: set[str]
skip_got: set[str]
raw: str
# Tests that failed the initial run but passed on a max_retry re-run, so they
# were dropped from `failed`. Surfaced in the report (a recovered flake is
# noted, never silently hidden).
recovered: list[str] = dataclasses.field(default_factory=list)

@property
def skip_mismatch(self) -> bool:
Expand Down Expand Up @@ -376,6 +391,35 @@ def parse_transport(transport: str, r: CmdResult, skip_checked: bool) -> Transpo
skip_checked, exp, got, r.out)


def retry_failed(t: Target, label: str, tr: TransportResult, rerun) -> None:
    """Re-run a transport's failed tests within the target's max_retry budget.

    The tests still failing are handed to ``rerun`` as one batch, up to
    ``t.max_retry`` times.  A test is dropped from ``tr.failed`` and appended
    to ``tr.recovered`` only when the re-run output carries a result line for
    it whose status is neither FAIL nor ERROR.  A test the re-run never
    reported on stays failed, so a re-run that died before reporting (timeout,
    lost connection, crash) cannot turn a real failure into a recovery.

    Args:
        t: target whose ``max_retry`` and ``name`` are read.
        label: column label used in the log line (transport / protoNN / nonroot).
        tr: result to update in place (``failed``, ``recovered``, ``exit_code``).
        rerun: callable taking the list of test names to run and returning an
            object whose ``.out`` holds the re-run's captured output.

    A no-op when max_retry is 0 or there were no failures.
    """
    if not t.max_retry or not tr.failed:
        return
    remaining = list(tr.failed)
    for attempt in range(1, t.max_retry + 1):
        r = rerun(remaining)
        # Collect every test the re-run reported on, and which of those failed.
        # NOTE(review): assumes RE_RESULT also matches the result lines of
        # passing tests (the status filter below implies it matches more than
        # FAIL/ERROR) -- confirm against its definition.
        reported: set[str] = set()
        failing: set[str] = set()
        for m in RE_RESULT.finditer(r.out):
            reported.add(m.group(2))
            if m.group(1) in ("FAIL", "ERROR"):
                failing.add(m.group(2))
        # Recovery needs positive evidence: a result line that is not a failure.
        recovered = [n for n in remaining
                     if n in reported and n not in failing]
        still = [n for n in remaining if n not in recovered]
        if recovered:
            tr.recovered.extend(recovered)
            log(f"[{t.name}] {label} retry {attempt}/{t.max_retry}: "
                f"recovered {','.join(recovered)}"
                + (f"; still failing {','.join(still)}" if still else ""))
        remaining = still
        if not remaining:
            break
    tr.failed = remaining
    # The initial run's non-zero exit was the now-recovered failures; once they
    # all pass on retry the cell is OK, so clear the stale exit code (only the
    # failed tests can make runtests exit non-zero on a no-skip-list re-run).
    if not remaining and tr.recovered and tr.exit_code != 0:
        tr.exit_code = 0


@dataclasses.dataclass
class TargetResult:
target: str
Expand Down Expand Up @@ -444,9 +488,12 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
t0 = time.monotonic()
r = run_on(t, cmd, timeout=2400)
res.timings[transport] = time.monotonic() - t0
res.transports[transport] = parse_transport(transport, r, skip_csv is not None)
tr = parse_transport(transport, r, skip_csv is not None)
retry_failed(t, transport, tr, lambda names, tp=transport: run_on(
t, test_script(t, tp, None, 1, only=names), timeout=1200))
res.transports[transport] = tr
log(f"[{t.name}] {transport} done "
f"({'ok' if res.transports[transport].ok else 'ISSUE'})")
f"({'ok' if tr.ok else 'ISSUE'})")

# Extra older-protocol passes (mirroring the workflow's check30/check29
# steps): same stdio-pipe transport and skip list as `make check`, but with
Expand All @@ -461,19 +508,26 @@ def run_target(t: Target, args, staging: str) -> TargetResult:
t0 = time.monotonic()
r = run_on(t, cmd, timeout=2400)
res.timings[label] = time.monotonic() - t0
res.transports[label] = parse_transport(label, r, skip_csv is not None)
tr = parse_transport(label, r, skip_csv is not None)
retry_failed(t, label, tr, lambda names, pr=proto: run_on(
t, test_script(t, "pipe", None, 1, protocol=pr, only=names),
timeout=1200))
res.transports[label] = tr
log(f"[{t.name}] {label} done "
f"({'ok' if res.transports[label].ok else 'ISSUE'})")
f"({'ok' if tr.ok else 'ISSUE'})")

# Extra non-root pass (after the sudo runs) for targets that opt in, running
# the tests that declare `fleet_nonroot = True` (discovered in main()).
if t.nonroot and args.nonroot_tests:
t0 = time.monotonic()
r = run_on(t, nonroot_test_script(t, args.nonroot_tests), timeout=2400)
res.timings["nonroot"] = time.monotonic() - t0
res.transports["nonroot"] = parse_transport("nonroot", r, skip_checked=False)
tr = parse_transport("nonroot", r, skip_checked=False)
retry_failed(t, "nonroot", tr, lambda names: run_on(
t, nonroot_test_script(t, names), timeout=1200))
res.transports["nonroot"] = tr
log(f"[{t.name}] nonroot done "
f"({'ok' if res.transports['nonroot'].ok else 'ISSUE'})")
f"({'ok' if tr.ok else 'ISSUE'})")
res.timings["total"] = time.monotonic() - started
return res

Expand Down Expand Up @@ -598,6 +652,17 @@ def print_report(results: list[TargetResult], args, fleet: list[Target]) -> bool
for d in details:
print(d)
print("=" * 64)
# Recovered flakes: tests that failed but passed within the target's
# max_retry budget. The cell counts as OK, but list them so a flaky test is
# never silently swallowed.
recovered = [f"{res.target} / {transport}: {','.join(tr.recovered)}"
for res in results for transport in transports
if (tr := res.transports.get(transport)) and tr.recovered]
if recovered:
print("==== RECOVERED (flaky -- failed, then passed on retry) ====")
for r in recovered:
print(f" {r}")
print("=" * 64)
print(f"{len(results)} targets x {len(transports)} transports = {cells} cells: "
f"{ok_cells} OK, {cells - ok_cells} not OK")
return all_ok
Expand Down
Loading