diff --git a/Makefile b/Makefile index ec0028e..35b9785 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ .PHONY: pre-commit fmt fmt-check clippy test build help .PHONY: docker-build-operator docker-build-console-web docker-build-all .PHONY: console-lint console-build console-fmt console-fmt-check -.PHONY: e2e-check e2e-live-create .e2e-live-install-cert-manager e2e-live-run e2e-live-faults e2e-live-update e2e-live-delete +.PHONY: e2e-check e2e-live-create .e2e-live-install-cert-manager e2e-live-run e2e-live-update e2e-live-delete # Default target IMAGE_REPO ?= rustfs/operator @@ -43,7 +43,6 @@ help: @echo " make e2e-check - Check Rust-native e2e harness (fmt + test + clippy)" @echo " make e2e-live-create - Clean dedicated storage, recreate live Kind environment, install cert-manager, and load e2e image" @echo " make e2e-live-run - Run all non-destructive live suites in the existing live environment" - @echo " make e2e-live-faults - Run destructive live fault suites with RUSTFS_E2E_DESTRUCTIVE=1" @echo " make e2e-live-update - Rebuild image and update the live environment (load + rollout)" @echo " make e2e-live-delete - Delete live Kind environment and clean dedicated storage" @@ -130,9 +129,6 @@ e2e-live-run: RUSTFS_E2E_LIVE=1 cargo test --manifest-path $(E2E_MANIFEST) --test cert_manager_tls -- --ignored --test-threads=$(E2E_TEST_THREADS) --nocapture @echo "configured live e2e suites passed." -e2e-live-faults: - RUSTFS_E2E_LIVE=1 RUSTFS_E2E_DESTRUCTIVE=1 cargo test --manifest-path $(E2E_MANIFEST) --test faults -- --ignored --test-threads=$(E2E_TEST_THREADS) --nocapture - e2e-live-update: docker build --network host -t rustfs/operator:e2e . docker build --network host -t rustfs/console-web:e2e -f console-web/Dockerfile console-web diff --git a/README.md b/README.md index f1d2246..1dbab1f 100755 --- a/README.md +++ b/README.md @@ -73,7 +73,6 @@ From the repo root: | `make e2e-check` | Validate the e2e harness without creating a live cluster. 
| | `make e2e-live-create` | Build e2e images, recreate the dedicated Kind cluster, install cert-manager, and load images. | | `make e2e-live-run` | Deploy the dev control plane and run all non-destructive live suites. | -| `make e2e-live-faults` | Run destructive live fault suites with `RUSTFS_E2E_DESTRUCTIVE=1`. | | `make e2e-live-update` | Rebuild images, reload them into Kind, and roll out control-plane deployments. | | `make e2e-live-delete` | Delete the dedicated Kind cluster and its local storage. | @@ -160,7 +159,7 @@ Then use `http://127.0.0.1:19000` for the Tenant S3 API and `http://127.0.0.1:19 - `deploy/rustfs-operator/` — Helm chart, templates, values, and packaged CRDs. - `deploy/k8s-dev/` — Development manifests used by the dev/e2e deployment flows. - `deploy/kind/` — Kind cluster configuration for local development. -- **e2e/** — Rust-native Kind e2e harness, live test suites, and dedicated manifests. +- **e2e/** — Rust-native Kind e2e harness plus shared implementation modules for the separate real-cluster fault-test runner. - **examples/** — Sample `Tenant` custom resources and usage notes. - **docs/** — Design notes, GA planning material, and supporting images. - **assets/** — README and documentation images. 
diff --git a/e2e/Cargo.lock b/e2e/Cargo.lock index fb8ae15..8ca0435 100644 --- a/e2e/Cargo.lock +++ b/e2e/Cargo.lock @@ -110,6 +110,15 @@ dependencies = [ "derive_arbitrary", ] +[[package]] +name = "arc-swap" +version = "1.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a3a1fd6f75306b68087b831f025c712524bcb19aad54e557b1129cfa0a2b207" +dependencies = [ + "rustversion", +] + [[package]] name = "async-broadcast" version = "0.7.2" @@ -179,6 +188,49 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" +[[package]] +name = "aws-config" +version = "1.8.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e33f815b73a3899c03b380d543532e5865f230dce9678d108dc10732a8682275" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-schema", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "sha1 0.10.6", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + [[package]] name = "aws-lc-rs" version = "1.17.0" @@ -201,6 +253,414 @@ dependencies = [ "fs_extra", ] +[[package]] +name = "aws-runtime" +version = "1.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c9b9de216a988dd54b754a82a7660cfe14cee4f6782ae4524470972fa0ccb39" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + 
"aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "bytes-utils", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.137.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd7213994e2ff9382ff100403b78c30d1b74cdfcd8fa9d0d1dc3a94a5c4874" +dependencies = [ + "arc-swap", + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac 0.13.0", + "http 0.2.12", + "http 1.4.0", + "http-body 1.0.1", + "lru", + "percent-encoding", + "regex-lite", + "sha2 0.11.0", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.102.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c82b3ac19f1431854f7ace3a7531674633e286bfdde21976893bfee36fd493b" +dependencies = [ + "arc-swap", + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.104.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "321000d2b4c5519ee573f73167f612efd7329322d9b26969ad1979f0427f1913" +dependencies = [ + "arc-swap", + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + 
"aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.107.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d0d328ba962af23ecfa3c9f23b98d3d35e325fa218d7f13d17a6bf522f8a560" +dependencies = [ + "arc-swap", + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bae38512beae0ffee7010fc24e7a8a123c53efdfef42a61e80fda4882418dc71" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "crypto-bigint", + "form_urlencoded", + "hex", + "hmac 0.13.0", + "http 0.2.12", + "http 1.4.0", + "p256", + "percent-encoding", + "sha2 0.11.0", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.64.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9e8e65f4f81fcccdeb6c3eca2af17ac21d421a1786a26a394aecf421d616d3a" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc-fast", + "hex", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "md-5", + "pin-project-lite", + "sha1 0.11.0", + "sha2 0.11.0", + 
"tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78d8391e65fcea47c586a22e1a41f173b38615b112b2c6b7a44e80cec3e6b706" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3ef8931ad1c98aa6a55b4256f847f3116090819844e0dd41ea682cac5dd2d3" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2 0.3.27", + "h2 0.4.14", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper 1.9.0", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.9", + "hyper-util", + "pin-project-lite", + "rustls 0.21.12", + "rustls 0.23.40", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "701a947f4797e52a911e114a898667c746c39feea467bbd1abd7b3721f702ffa" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-schema", + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" +dependencies = [ + 
"aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e6f5caf6fea86f8c2206541ab5857cfcda9013426cdbe8fa0098b9e2d32182" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-schema", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9db177daa6ba8afb9ee1aefcf548c907abcf52065e394ee11a92780057fe0e8c" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api-macros", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-runtime-api-macros" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d7396fd9500589e62e460e987ecb671bad374934e55ec3b5f498cc7a8a8a7b7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "aws-smithy-schema" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7442cb268338f0eb8278140a107c046756aa01093d8ef5e99628d34ae09c94f5" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-types", + "http 1.4.0", +] + +[[package]] +name = "aws-smithy-types" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "32b42fcf341259d85ca10fac9a2f6448a8ec691c6955a18e45bc3b71a85fab85" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d16bf10b03a3c01e6b3b7d47cd964e873ffe9e7d4e80fad16bd4c077cb068531" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-schema", + "aws-smithy-types", + "rustc_version", + "tracing", +] + [[package]] name = "axum" version = "0.7.9" @@ -212,10 +672,10 @@ dependencies = [ "axum-macros", "bytes", "futures-util", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", - "hyper", + "hyper 1.9.0", "hyper-util", "itoa", "matchit", @@ -245,8 +705,8 @@ dependencies = [ "async-trait", "bytes", "futures-util", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", "mime", "pin-project-lite", @@ -279,12 +739,34 @@ dependencies = [ "tokio", ] +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + [[package]] name = "bitflags" version = "2.11.1" @@ -300,6 +782,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block-buffer" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2f6c7dbe95a6ed67ad9f18e57daf93a2f034c524b99fd2b76d18fdfeb6660aa" +dependencies = [ + "hybrid-array", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -312,6 +803,16 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + [[package]] name = "cc" version = "1.2.62" @@ -399,6 +900,12 @@ dependencies = [ "cc", ] +[[package]] +name = "cmov" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9ea0ac24bc397ab3c98583a3c9ba74fa56b09a4449bbe172b9b1ddb016027a" + [[package]] name = "colorchoice" version = "1.0.5" @@ -431,6 +938,18 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-oid" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6ef517f0926dd24a1582492c791b6a4818a4d94e789a334894aa15b0d12f55c" + 
[[package]] name = "const-str" version = "1.1.0" @@ -512,6 +1031,25 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + +[[package]] +name = "crc-fast" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e75b2483e97a5a7da73ac68a05b629f9c53cff58d8ed1c77866079e18b00dba5" +dependencies = [ + "digest 0.10.7", + "spin", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -527,6 +1065,18 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + [[package]] name = "crypto-common" version = "0.1.7" @@ -537,6 +1087,24 @@ dependencies = [ "typenum", ] +[[package]] +name = "crypto-common" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce6e4c961d6cd6c9a86db418387425e8bdeaf05b3c8bc1411e6dca4c252f1453" +dependencies = [ + "hybrid-array", +] + +[[package]] +name = "ctutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5515a3834141de9eafb9717ad39eea8247b5674e6066c404e8c4b365d2a29e" +dependencies = [ + "cmov", +] + [[package]] name = "darling" version = "0.21.3" @@ -572,6 +1140,17 @@ dependencies = [ "syn", ] +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid 
0.9.6", + "pem-rfc7468", + "zeroize", +] + [[package]] name = "deranged" version = "0.5.8" @@ -619,11 +1198,24 @@ version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ - "block-buffer", - "crypto-common", + "block-buffer 0.10.4", + "const-oid 0.9.6", + "crypto-common 0.1.7", "subtle", ] +[[package]] +name = "digest" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1dd6dbb5841937940781866fa1281a1ff7bd3bf827091440879f9994983d5c2" +dependencies = [ + "block-buffer 0.12.1", + "const-oid 0.10.2", + "crypto-common 0.2.2", + "ctutils", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -656,6 +1248,20 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der", + "digest 0.10.7", + "elliptic-curve", + "rfc6979", + "signature", + "spki", +] + [[package]] name = "educe" version = "0.6.0" @@ -674,6 +1280,26 @@ version = "1.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct", + "crypto-bigint", + "digest 0.10.7", + "ff", + "generic-array", + "group", + "pem-rfc7468", + "pkcs8", + "rand_core 0.6.4", + "sec1", + "subtle", + "zeroize", +] + [[package]] name = "enum-ordinalize" version = "4.3.2" @@ -735,7 +1361,17 @@ dependencies = [ name = "fastrand" 
version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" +checksum = "9f1f227452a390804cdb637b74a86990f2a7d7ba4b7d5693aac9b4dd6defd8d6" + +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] [[package]] name = "find-msvc-tools" @@ -765,6 +1401,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -876,6 +1518,7 @@ checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" dependencies = [ "typenum", "version_check", + "zeroize", ] [[package]] @@ -943,6 +1586,36 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "h2" version = "0.4.14" @@ -954,7 +1627,7 @@ dependencies = [ "fnv", "futures-core", "futures-sink", - "http", + "http 1.4.0", "indexmap", "slab", "tokio", @@ 
-970,7 +1643,18 @@ checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2", "equivalent", - "foldhash", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", ] [[package]] @@ -997,7 +1681,16 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" dependencies = [ - "digest", + "digest 0.10.7", +] + +[[package]] +name = "hmac" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6303bc9732ae41b04cb554b844a762b4115a61bfaa81e3e83050991eeb56863f" +dependencies = [ + "digest 0.11.3", ] [[package]] @@ -1020,6 +1713,17 @@ dependencies = [ "windows-link", ] +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http" version = "1.4.0" @@ -1030,6 +1734,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + [[package]] name = "http-body" version = "1.0.1" @@ -1037,7 +1752,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http", + "http 1.4.0", ] [[package]] @@ -1048,8 +1763,8 @@ checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" 
dependencies = [ "bytes", "futures-core", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -1071,6 +1786,39 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hybrid-array" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9155a582abd142abc056962c29e3ce5ff2ad5469f4246b537ed42c5deba857da" +dependencies = [ + "typenum", +] + +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "hyper" version = "1.9.0" @@ -1081,9 +1829,9 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2", - "http", - "http-body", + "h2 0.4.14", + "http 1.4.0", + "http-body 1.0.1", "httparse", "httpdate", "itoa", @@ -1093,20 +1841,35 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "tokio", + "tokio-rustls 0.24.1", +] + [[package]] name = "hyper-rustls" version = "0.27.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" dependencies = [ - "http", - "hyper", + "http 1.4.0", + "hyper 1.9.0", "hyper-util", "log", - "rustls", + "rustls 0.23.40", 
"rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tower-service", "webpki-roots", ] @@ -1117,7 +1880,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper", + "hyper 1.9.0", "hyper-util", "pin-project-lite", "tokio", @@ -1134,14 +1897,14 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "http", - "http-body", - "hyper", + "http 1.4.0", + "http-body 1.0.1", + "hyper 1.9.0", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2", + "socket2 0.6.3", "tokio", "tower-service", "tracing", @@ -1432,18 +2195,18 @@ dependencies = [ "either", "futures", "home", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", - "hyper", - "hyper-rustls", + "hyper 1.9.0", + "hyper-rustls 0.27.9", "hyper-timeout", "hyper-util", "jsonpath-rust", "k8s-openapi", "kube-core", "pem", - "rustls", + "rustls 0.23.40", "secrecy", "serde", "serde_json", @@ -1465,7 +2228,7 @@ dependencies = [ "chrono", "derive_more", "form_urlencoded", - "http", + "http 1.4.0", "json-patch", "k8s-openapi", "schemars", @@ -1608,6 +2371,15 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru" +version = "0.16.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f66e8d5d03f609abc3a39e6f08e4164ebf1447a732906d39eb9b99b7919ef39" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -1629,6 +2401,16 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "md-5" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"69b6441f590336821bb897fb28fc622898ccceb1d6cea3fde5ea86b090c4de98" +dependencies = [ + "cfg-if", + "digest 0.11.3", +] + [[package]] name = "memchr" version = "2.8.0" @@ -1687,6 +2469,15 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c6673768db2d862beb9b39a78fdcb1a69439615d5794a1be50caa9bc92c81967" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -1735,10 +2526,10 @@ dependencies = [ "const-str", "futures", "hex", - "hmac", + "hmac 0.12.1", "hostname", - "http", - "hyper", + "http 1.4.0", + "hyper 1.9.0", "hyper-util", "k8s-openapi", "kube", @@ -1746,19 +2537,19 @@ dependencies = [ "rcgen", "reqwest", "ring", - "rustls", + "rustls 0.23.40", "rustls-pemfile", - "rustls-webpki", + "rustls-webpki 0.103.13", "schemars", "serde", "serde_json", "serde_yaml_ng", - "sha2", + "sha2 0.10.9", "shadow-rs", "snafu", "strum", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tokio-stream", "tokio-util", "tower", @@ -1779,6 +2570,24 @@ dependencies = [ "num-traits", ] +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa", + "elliptic-curve", + "primeorder", + "sha2 0.10.9", +] + [[package]] name = "parking" version = "2.2.1" @@ -1818,6 +2627,15 @@ dependencies = [ "serde_core", ] +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + [[package]] name = "percent-encoding" version = "2.3.2" @@ -1864,7 +2682,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" dependencies = [ "pest", - "sha2", + "sha2 0.10.9", ] [[package]] @@ -1893,6 +2711,22 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + [[package]] name = "pkg-config" version = "0.3.33" @@ -1933,6 +2767,15 @@ dependencies = [ "syn", ] +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -1970,8 +2813,8 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls", - "socket2", + "rustls 0.23.40", + "socket2 0.6.3", "thiserror", "tokio", "tracing", @@ -1990,7 +2833,7 @@ dependencies = [ "rand", "ring", "rustc-hash", - "rustls", + "rustls 0.23.40", "rustls-pki-types", "slab", "thiserror", @@ -2008,7 +2851,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.6.3", "tracing", "windows-sys 0.60.2", ] @@ -2041,7 +2884,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha", - "rand_core", + "rand_core 0.9.5", ] [[package]] @@ -2051,7 +2894,16 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", ] [[package]] @@ -2128,6 +2980,12 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + [[package]] name = "regex-syntax" version = "0.8.10" @@ -2146,25 +3004,25 @@ dependencies = [ "cookie_store", "futures-core", "futures-util", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", - "hyper", - "hyper-rustls", + "hyper 1.9.0", + "hyper-rustls 0.27.9", "hyper-util", "js-sys", "log", "percent-encoding", "pin-project-lite", "quinn", - "rustls", + "rustls 0.23.40", "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tokio-util", "tower", "tower-http", @@ -2177,6 +3035,16 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac 0.12.1", + "subtle", +] + [[package]] name = "ring" version = "0.17.14" @@ -2221,7 +3089,7 @@ version = "8.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"5bcdef0be6fe7f6fa333b1073c949729274b05f123a0ad7efcb8efd878e5c3b1" dependencies = [ - "sha2", + "sha2 0.10.9", "walkdir", ] @@ -2245,8 +3113,12 @@ name = "rustfs-operator-e2e" version = "0.1.0" dependencies = [ "anyhow", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", "axum", "futures", + "hex", "k8s-openapi", "kube", "operator", @@ -2254,6 +3126,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml_ng", + "sha2 0.10.9", "tempfile", "tokio", "tower", @@ -2273,6 +3146,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki 0.101.7", + "sct", +] + [[package]] name = "rustls" version = "0.23.40" @@ -2284,7 +3169,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.103.13", "subtle", "zeroize", ] @@ -2320,6 +3205,16 @@ dependencies = [ "zeroize", ] +[[package]] +name = "rustls-webpki" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustls-webpki" version = "0.103.13" @@ -2393,6 +3288,30 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + 
"base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + [[package]] name = "secrecy" version = "0.10.3" @@ -2544,6 +3463,28 @@ dependencies = [ "unsafe-libyaml", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha1" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aacc4cc499359472b4abe1bf11d0b12e688af9a805fa5e3016f9a386dc2d0214" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.3", +] + [[package]] name = "sha2" version = "0.10.9" @@ -2551,8 +3492,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", - "digest", + "cpufeatures 0.2.17", + "digest 0.10.7", +] + +[[package]] +name = "sha2" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "446ba717509524cb3f22f17ecc096f10f4822d76ab5c0b9822c5f9c284e825f4" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "digest 0.11.3", ] [[package]] @@ -2593,6 +3545,16 @@ dependencies = [ "libc", ] +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest 0.10.7", + "rand_core 0.6.4", +] + [[package]] name = "simd-adler32" version = "0.3.9" @@ -2634,6 +3596,16 @@ dependencies = [ "syn", ] +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] 
name = "socket2" version = "0.6.3" @@ -2644,6 +3616,22 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "stable_deref_trait" version = "1.2.1" @@ -2825,7 +3813,7 @@ dependencies = [ "mio", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.3", "tokio-macros", "windows-sys 0.61.2", ] @@ -2841,13 +3829,23 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.12", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls", + "rustls 0.23.40", "tokio", ] @@ -2908,8 +3906,8 @@ dependencies = [ "bytes", "futures-core", "futures-util", - "http", - "http-body", + "http 1.4.0", + "http-body 1.0.1", "http-body-util", "http-range-header", "httpdate", @@ -3086,6 +4084,12 @@ dependencies = [ "serde", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -3175,6 +4179,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" 
+[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "walkdir" version = "2.5.0" @@ -3673,6 +4683,12 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + [[package]] name = "yasna" version = "0.5.2" diff --git a/e2e/Cargo.toml b/e2e/Cargo.toml index d8e2c64..5793d7c 100644 --- a/e2e/Cargo.toml +++ b/e2e/Cargo.toml @@ -8,11 +8,16 @@ publish = false operator = { path = ".." } anyhow = "1" +aws-config = "1" +aws-credential-types = "1" +aws-sdk-s3 = "1" axum = { version = "0.7", features = ["macros"] } futures = "0.3.31" +hex = "0.4" serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.148" serde_yaml_ng = "0.10.0" +sha2 = "0.10" tempfile = "3" tokio = { version = "1.49.0", features = ["rt-multi-thread", "macros", "process", "time", "io-util"] } tower = "0.5" diff --git a/e2e/FAULT_TESTING.md b/e2e/FAULT_TESTING.md new file mode 100644 index 0000000..3440bd1 --- /dev/null +++ b/e2e/FAULT_TESTING.md @@ -0,0 +1,440 @@ + + +# RustFS Fault-Test Operations / RustFS 故障测试操作手册 + +本手册是 Agent 和开发人员使用 `e2e` package 故障测试工具的唯一操作入口。它说明执行步骤、步骤原因、安全边界、验收证据和清理方式。 + +This manual is the single operational entry point for agents and developers using the fault-test tooling in the `e2e` package. Fault-test commands, prerequisites, safety limits, evidence, and cleanup are intentionally kept here instead of duplicated in README files. + +## 1. 
Purpose And Safety / 目的与安全边界 + +故障测试只允许在专用真实 Kubernetes 测试集群执行。测试会创建并删除专用 Tenant、PVC、Pod、Service、StatefulSet 和 Chaos resources。禁止把测试 namespace、Tenant、StorageClass 或 DM 路径指向现有业务资源。 + +Run fault tests only in a dedicated real Kubernetes test cluster. The suite creates and removes a dedicated Tenant, PVCs, Pods, Services, StatefulSets, and Chaos resources. Never point its namespace, Tenant, StorageClass, or DM path at application resources. + +固定测试所有权: + +```text +namespace: rustfs-fault-test +tenant: fault-test-tenant +manager: app.kubernetes.io/managed-by=rustfs-operator-fault-test +annotation: rustfs.com/fault-test-tenant=fault-test-tenant +``` + +安全规则 / Safety rules: + +- 当前 context 必须与 `RUSTFS_FAULT_TEST_EXPECTED_CONTEXT` 完全一致,并且不能是 `kind-*`。 +- 四个 RustFS 测试 Pod 必须调度到至少四个 Ready 节点。 +- 常规场景使用独立动态 StorageClass;`dm-flakey` 使用独立静态 Local PV StorageClass。 +- Make 编排器会监控所有节点和运行前已有的非 fault Tenant;任一异常会撤销 managed Chaos 并停止测试。 +- `fault-cleanup` 只删除带正确所有权标记的 namespace 和 Chaos,不删除外部 StorageClass、PV 或主机设备。 +- The current context must exactly match `RUSTFS_FAULT_TEST_EXPECTED_CONTEXT` and must not be `kind-*`. +- The four RustFS test Pods require at least four Ready schedulable nodes. +- Regular scenarios use a dedicated dynamic StorageClass; `dm-flakey` uses a dedicated static Local PV StorageClass. +- The Make runner monitors every node and every pre-existing non-fault Tenant. It removes managed Chaos and stops on degradation. +- `fault-cleanup` removes only the owned namespace and managed Chaos. It never removes external StorageClasses, PVs, or host devices. + +## 2. Workload Profile / 工作负载 + +每个场景使用 seed 确定性生成对象内容和尺寸顺序。未设置 `RUSTFS_FAULT_TEST_SEED` 时自动生成 seed;所有重放信息写入 `workload-plan.json` 和 `history.jsonl`。 + +Each scenario deterministically generates object content and size order from a seed. A seed is generated when `RUSTFS_FAULT_TEST_SEED` is unset. Replay information is recorded in `workload-plan.json` and `history.jsonl`. 
+ +| Size | Weight | Objects | +| --- | ---: | ---: | +| 4KiB | 85% | 34,000 | +| 16KiB | 10% | 4,000 | +| 8MiB | 4% | 1,600 | +| 16MiB | 1% | 400 | + +```text +objects: 40,000 +concurrency: 80 +payload/scenario: 20,337,459,200 bytes (~18.94GiB) +PVCs: 4 × 100Gi +maximum fault TTL: 7,200 seconds +``` + +7,200 秒是故障资源的最大保护时间,不是固定等待时间。正常测试在 workload 完成后立即恢复故障。较长 TTL 防止 40,000 对象 workload 在完成前超过 Chaos duration。 + +The 7,200-second duration is a maximum fault-resource safety window, not a fixed wait. Successful runs recover immediately after the workload. The larger window prevents the 40,000-object workload from outliving Chaos. + +Tenant `Ready` 之后、注入故障之前,以及故障恢复之后,测试都会等待四个 RustFS Pod 连续 60 秒保持 `Running/Ready`,且 Pod UID 和容器重启数不变。这个稳定窗口避免把启动期 DNS 或 Pod 重启抖动误判为故障注入结果。 + +After Tenant `Ready`, both before injection and after recovery, the test requires all four RustFS Pods to remain `Running/Ready` for 60 seconds with unchanged Pod UIDs and container restart counts. This stability window prevents startup DNS or restart churn from being misclassified as a fault-injection result. + +## 3. Package Commands / Package 命令 + +所有公共入口都位于 `e2e/Makefile`。从仓库根目录执行: + +All public entry points are in `e2e/Makefile`. Run them from the repository root: + +```bash +make -C e2e help +make -C e2e fault-check +make -C e2e fault-preflight SCENARIO=io-eio +make -C e2e fault-run SCENARIO=io-eio +make -C e2e fault-run-regular +make -C e2e fault-run-dm +make -C e2e fault-cleanup +``` + +| Target | Behavior / 行为 | +| --- | --- | +| `fault-check` | 单 job Rust fmt/test/clippy 和 Bash 语法检查;不访问集群。 / Single-job Rust fmt, tests, clippy, and Bash syntax; no cluster mutation. | +| `fault-preflight` | 校验 context、CRD、StorageClass、Chaos、节点、namespace 所有权和现有 Tenant。 / Validates context, CRDs, storage, Chaos, nodes, ownership, and existing Tenants. | +| `fault-run` | 运行一个场景,持续健康守护并验收 artifacts。 / Runs one guarded scenario and validates artifacts. 
| +| `fault-run-regular` | 串行运行六个常规场景,首败停止。 / Runs six regular scenarios serially and stops on first failure. | +| `fault-run-dm` | 使用预先准备的静态 PV 和 DM 设备运行 `dm-flakey`。 / Runs `dm-flakey` with pre-provisioned static PVs and DM storage. | +| `fault-cleanup` | 安全删除 owned namespace 和 managed Chaos。 / Safely removes the owned namespace and managed Chaos. | + +`fault-run*` 会先用单 job、最低主机优先级预编译精确的 `faults` 测试二进制,再等待 60 秒并确认原有 RustFS Pod 的 UID、重启数和 Ready 状态没有变化。故障窗口直接运行该二进制,不再次调用 Cargo。预编译不计入故障窗口;如果编译影响现有 Tenant,runner 会在创建故障 Tenant 前停止。 + +Before creating a fault Tenant, every `fault-run*` target prebuilds the exact `faults` binary with one job and the lowest host priority. It then verifies for 60 seconds that every pre-existing RustFS Pod keeps the same UID, restart count, and Ready state. The fault window executes that binary directly without invoking Cargo again. Compilation is outside the fault window, and the runner stops if the build disturbs an existing Tenant. + +### 3.1 Recommended Flow / 推荐执行顺序 + +1. 运行 `make -C e2e fault-check`,先确认本地代码、脚本和普通测试可用。 / Run `make -C e2e fault-check` first to validate code, scripts, and non-live tests. +2. 准备真实测试集群、专用 StorageClass、Chaos Mesh 和固定 digest 的 RustFS image。 / Prepare the real test cluster, dedicated StorageClass, Chaos Mesh, and a pinned RustFS image digest. +3. 导出 `RUSTFS_FAULT_TEST_EXPECTED_CONTEXT`、`RUSTFS_FAULT_TEST_STORAGE_CLASS` 和 `RUSTFS_FAULT_TEST_SERVER_IMAGE`。 / Export the required context, StorageClass, and image variables. +4. 先执行 `make -C e2e fault-preflight SCENARIO=io-eio`,再单独跑 `io-eio`。 / Run `io-eio` preflight first, then run `io-eio` alone. +5. `io-eio` 通过后再执行 `make -C e2e fault-run-regular`。 / After `io-eio` passes, run the remaining regular scenarios with `fault-run-regular`. +6. 只有准备好静态 Local PV 和 Device Mapper 后,才执行 `make -C e2e fault-run-dm`。 / Run `fault-run-dm` only after static Local PVs and Device Mapper are ready. +7. 
结束后先收集 artifacts,再执行 `make -C e2e fault-cleanup`。 / Collect artifacts before running `fault-cleanup`. + +## 4. Cluster Preparation / 集群准备 + +### 4.1 Required Tools / 必需工具 + +```bash +rustc --version +cargo --version +kubectl version --client +jq --version +make -C e2e fault-check +``` + +`warp` v1.3.1 仅用于 `warp-under-chaos`。运行机必须能访问 Kubernetes API;如果设置 ClusterIP 直连,还必须能访问 Service ClusterIP。 + +`warp` v1.3.1 is required only for `warp-under-chaos`. The runner must reach the Kubernetes API and, when ClusterIP mode is enabled, Service ClusterIPs. + +### 4.2 Kubernetes And Storage / Kubernetes 与存储 + +```bash +kubectl config current-context +kubectl get nodes +kubectl get crd tenants.rustfs.com +kubectl get storageclass +kubectl get tenant -A +``` + +常规场景要求动态 StorageClass。每个承载测试 PVC 的节点应在实际 provisioner 路径上至少有 120Gi 可用空间。hostPath/local-path 的 PVC capacity 通常不执行真实配额,必须检查后端文件系统,而不能只看 `kubectl get pvc`。 + +Regular scenarios require a dynamic StorageClass. Every node that can host a test PVC should have at least 120Gi available on the actual provisioner filesystem. hostPath/local-path capacity is commonly not enforced, so inspect the backing filesystem instead of trusting only `kubectl get pvc`. + +```bash +kubectl -n kube-system get configmap local-path-config -o yaml +kubectl get pv -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.hostPath.path}{"\n"}{end}' +df -h +``` + +如果 K3s 默认 `/var/lib/rancher/k3s/storage` 位于小系统盘,应创建独立 provisioner/StorageClass,把 fault-test PVC 放到 `/data/rustfs/rustfs-fault-local-path` 等专用数据盘目录。不得修改现有业务 PVC 或默认 provisioner。 + +If K3s stores its default local-path data on a small system disk, create an independent provisioner and StorageClass backed by a dedicated data-disk path such as `/data/rustfs/rustfs-fault-local-path`. Do not modify existing application PVCs or the default provisioner. 
+ +### 4.3 Chaos Mesh / Chaos Mesh + +已验证版本为 Chaos Mesh v2.8.3: + +The validated version is Chaos Mesh v2.8.3: + +```bash +helm repo add chaos-mesh https://charts.chaos-mesh.org +helm repo update +helm upgrade --install chaos-mesh chaos-mesh/chaos-mesh \ + -n chaos-mesh --create-namespace --version 2.8.3 \ + --set chaosDaemon.runtime=containerd \ + --set chaosDaemon.socketPath=/run/k3s/containerd/containerd.sock \ + --set dashboard.create=false \ + --wait --timeout 10m + +kubectl -n chaos-mesh get deployment,daemonset +kubectl get crd iochaos.chaos-mesh.org podchaos.chaos-mesh.org networkchaos.chaos-mesh.org +``` + +非 K3s 集群必须使用实际 container runtime socket。 + +Non-K3s clusters must use their actual container runtime socket. + +## 5. Regular Scenarios / 常规场景 + +先固定 context、动态 StorageClass 和 RustFS image digest。测试机位于集群节点或 Pod 内时使用 ClusterIP,避免 80 并发经过 `kubectl port-forward`。 + +Pin the context, dynamic StorageClass, and RustFS image digest. Use ClusterIP when the runner is on a cluster node or in a Pod so 80 concurrent requests do not traverse `kubectl port-forward`. 
+ +```bash +export RUSTFS_FAULT_TEST_EXPECTED_CONTEXT=default +export RUSTFS_FAULT_TEST_STORAGE_CLASS=DEDICATED_DYNAMIC_STORAGE_CLASS +export RUSTFS_FAULT_TEST_SERVER_IMAGE='docker.io/rustfs/rustfs@sha256:IMAGE_DIGEST' +export RUSTFS_FAULT_TEST_USE_CLUSTER_IP=1 +export RUSTFS_FAULT_TEST_RUN_ROOT="$PWD/e2e/target/fault-tests/$(date -u +%Y%m%dT%H%M%SZ)" + +make -C e2e fault-preflight SCENARIO=io-eio +make -C e2e fault-run SCENARIO=io-eio +``` + +场景顺序 / Scenario order: + +```text +io-eio +pod-kill-one +network-partition-one +io-read-mistake +disk-full +warp-under-chaos +``` + +完整运行: + +Run all regular scenarios: + +```bash +make -C e2e fault-run-regular +``` + +分阶段验证时,可以先运行 `io-eio`,再通过 `RUSTFS_FAULT_TEST_SCENARIOS` 指定剩余场景: + +For staged validation, run `io-eio` first and then select the remaining scenarios with `RUSTFS_FAULT_TEST_SCENARIOS`: + +```bash +export RUSTFS_FAULT_TEST_SCENARIOS='pod-kill-one network-partition-one io-read-mistake disk-full warp-under-chaos' +make -C e2e fault-run-regular +unset RUSTFS_FAULT_TEST_SCENARIOS +``` + +测试可能持续数小时。不要并行运行场景。每个场景完成后编排脚本会校验 seed、尺寸分布、故障状态、40,000 committed PUT 和 checker verdict。 + +The suite can run for several hours. Do not run scenarios in parallel. After every scenario, the runner validates the seed, size distribution, fault state, 40,000 committed PUTs, and checker verdict. + +## 6. dm-flakey / dm-flakey + +`dm-flakey` 不需要重装 Kubernetes、Operator、Chaos Mesh 或 Rust。它只需要把 fault Tenant 的存储切换为四个专用静态 Local PV,其中一个 PV 由 Device Mapper 提供。 + +`dm-flakey` does not require reinstalling Kubernetes, the Operator, Chaos Mesh, or Rust. It only switches the fault Tenant to four dedicated static Local PVs, one backed by Device Mapper. + +### 6.1 Host Storage / 主机存储 + +真实专用块设备优先。loop 文件仅适用于实验室。每个 backing 至少 120Gi,并且路径必须只服务 fault-test。 + +Prefer dedicated block devices. Loop files are for lab use only. Each backing device must be at least 120Gi and serve only fault-test.
+ +DM 节点示例 / DM-node example: + +```bash +export LAB=/data/rustfs/rustfs-fault-lab +export DM_NAME=rustfs-fault-dm +sudo mkdir -p "$LAB/volume" +sudo truncate -s 120G "$LAB/disk.img" +export BACKING=$(sudo losetup --find --show "$LAB/disk.img") +export SECTORS=$(sudo blockdev --getsz "$BACKING") +sudo dmsetup create "$DM_NAME" --table "0 $SECTORS linear $BACKING 0" +sudo mkfs.ext4 -F "/dev/mapper/$DM_NAME" +sudo mount "/dev/mapper/$DM_NAME" "$LAB/volume" +sudo chmod 0777 "$LAB/volume" +``` + +其他三个节点 / Other three nodes: + +```bash +export LAB=/data/rustfs/rustfs-fault-lab +sudo mkdir -p "$LAB/volume" +sudo truncate -s 120G "$LAB/disk.img" +export BACKING=$(sudo losetup --find --show "$LAB/disk.img") +sudo mkfs.ext4 -F "$BACKING" +sudo mount "$BACKING" "$LAB/volume" +sudo chmod 0777 "$LAB/volume" +``` + +### 6.2 Static StorageClass And PVs / 静态 StorageClass 与 PV + +创建 `kubernetes.io/no-provisioner` StorageClass,并为四个节点各创建一个 `100Gi` Local PV。每个 PV 的 node affinity 必须匹配实际节点;`local.path` 必须是 `/data/rustfs/rustfs-fault-lab/volume`。 + +Create a `kubernetes.io/no-provisioner` StorageClass and one `100Gi` Local PV per node. Each PV must use the matching node affinity and `/data/rustfs/rustfs-fault-lab/volume` as `local.path`. 
+ +```yaml +apiVersion: storage.k8s.io/v1 +kind: StorageClass +metadata: + name: rustfs-fault-dm +provisioner: kubernetes.io/no-provisioner +volumeBindingMode: WaitForFirstConsumer +reclaimPolicy: Retain +--- +apiVersion: v1 +kind: PersistentVolume +metadata: + name: rustfs-fault-dm-NODE_NAME + labels: + app.kubernetes.io/managed-by: rustfs-operator-fault-test +spec: + capacity: + storage: 100Gi + volumeMode: Filesystem + accessModes: [ReadWriteOnce] + persistentVolumeReclaimPolicy: Retain + storageClassName: rustfs-fault-dm + local: + path: /data/rustfs/rustfs-fault-lab/volume + nodeAffinity: + required: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: [NODE_NAME] +``` + +验证四个 PV 为 `Available`: + +Verify all four PVs are `Available`: + +```bash +kubectl get storageclass rustfs-fault-dm +kubectl get pv -l app.kubernetes.io/managed-by=rustfs-operator-fault-test -o wide +``` + +helper Pod 需要 privileged Pod Security。复用常规场景创建的 namespace 时补充 label;如果 namespace 不存在,则预创建带完整所有权的 namespace: + +The helper Pod requires privileged Pod Security.
Label the namespace left by regular scenarios, or pre-create an owned namespace when it does not exist: + +```bash +if kubectl get namespace rustfs-fault-test >/dev/null 2>&1; then + kubectl label namespace rustfs-fault-test \ + pod-security.kubernetes.io/enforce=privileged --overwrite +else + kubectl create namespace rustfs-fault-test + kubectl label namespace rustfs-fault-test \ + app.kubernetes.io/managed-by=rustfs-operator-fault-test \ + pod-security.kubernetes.io/enforce=privileged + kubectl annotate namespace rustfs-fault-test \ + rustfs.com/fault-test-tenant=fault-test-tenant +fi +``` + +### 6.3 Run / 执行 + +```bash +export RUSTFS_FAULT_TEST_STORAGE_CLASS=rustfs-fault-dm +export RUSTFS_FAULT_TEST_DM_NAME=rustfs-fault-dm +export RUSTFS_FAULT_TEST_DM_NODE=DM_NODE_NAME +export RUSTFS_FAULT_TEST_DM_MOUNT_PATH=/data/rustfs/rustfs-fault-lab/volume +export RUSTFS_FAULT_TEST_DM_FAULT_TABLE="0 $SECTORS flakey $BACKING 0 1 15" + +make -C e2e fault-preflight SCENARIO=dm-flakey +make -C e2e fault-run-dm +``` + +## 7. Evidence And Acceptance / 证据与验收 + +每个场景目录至少包含: + +Each scenario directory contains at least: + +```text +test.log +health-watch.log +workload-plan.json +history.jsonl +workload-summary.json +checker-report.json +fault-evidence.json +nodes-before.txt / nodes-after.txt +tenants-before.txt / tenants-after.txt +pods-before.txt / pods-after.txt +Chaos or DM snapshots +``` + +通过条件 / Pass criteria: + +- 测试退出码为 0。 +- `fault-evidence.json` 的 `injected`、`active_during_workload`、`recovered` 都为 `true`。 +- `workload-plan.json` 精确记录 40,000 对象、80 并发和四档尺寸分布。 +- `checker-report.json` 的 `committed_puts=40000`,并且 missing、hash mismatch、successful corrupted read、LIST warning 均为空。 +- fault Tenant 恢复 Ready;所有原有非 fault Tenant 和节点保持 Ready。 +- The test exits with zero. +- `fault-evidence.json` reports `injected`, `active_during_workload`, and `recovered` as `true`. +- `workload-plan.json` reports exactly 40,000 objects, concurrency 80, and the four size classes.
+- `checker-report.json` reports `committed_puts=40000` with no missing object, hash mismatch, successful corrupted read, or LIST warning. +- The fault Tenant recovers Ready while every pre-existing non-fault Tenant and node remains Ready. + +客户端没有看到错误并不表示故障无效。故障是否生效由 Chaos/DM 后端证据判断;客户端 disruption 单独记录。 + +The absence of client-visible errors does not mean the fault was inactive. Chaos/DM backend evidence proves injection; client disruption is reported separately. + +## 8. Cleanup And Recovery / 清理与恢复 + +先运行安全清理: + +Start with owned-resource cleanup: + +```bash +make -C e2e fault-cleanup +``` + +然后由运维删除本次创建的外部 StorageClass、静态 PV、独立 provisioner 和主机设备。DM 实验室清理示例: + +Operators must then remove the external StorageClass, static PVs, independent provisioner, and host devices created for the run. Lab DM cleanup example: + +```bash +sudo umount /data/rustfs/rustfs-fault-lab/volume +sudo dmsetup remove rustfs-fault-dm # DM node only +sudo losetup -d LOOP_DEVICE +sudo rm -rf /data/rustfs/rustfs-fault-lab +kubectl delete pv -l app.kubernetes.io/managed-by=rustfs-operator-fault-test +kubectl delete storageclass rustfs-fault-dm +``` + +最终确认 / Final checks: + +```bash +kubectl get nodes +kubectl get tenant -A +kubectl -n chaos-mesh get deployment,daemonset +kubectl get iochaos,podchaos,networkchaos -A +kubectl get namespace rustfs-fault-test +``` + +## 9. Runtime Variables / 运行参数 + +| Variable | Default | Purpose / 用途 | +| --- | --- | --- | +| `RUSTFS_FAULT_TEST_EXPECTED_CONTEXT` | required | 防止在错误 context 执行。 / Prevents execution against the wrong context. | +| `RUSTFS_FAULT_TEST_STORAGE_CLASS` | required | 常规动态 SC 或 DM 静态 SC。 / Dynamic regular SC or static DM SC. | +| `RUSTFS_FAULT_TEST_SERVER_IMAGE` | required by Make | 建议固定 digest。 / Pin an image digest. | +| `RUSTFS_FAULT_TEST_RUN_ROOT` | timestamp directory | 整次运行的 artifacts 根目录。 / Artifact root for the run. | +| `RUSTFS_FAULT_TEST_SCENARIOS` | six regular scenarios | `fault-run-regular` 的空格分隔场景列表。 / Space-separated regular scenario list.
| +| `RUSTFS_FAULT_TEST_SEED` | generated | 固定后可重放相同对象。 / Replays the same objects when set. | +| `RUSTFS_FAULT_TEST_USE_CLUSTER_IP` | `false` | 集群节点/Pod 内建议设为 `1`。 / Set to `1` on a node or in-cluster runner. | +| `RUSTFS_FAULT_TEST_BUILD_JOBS` | `1` | 预编译并行度;小型控制面保持为 1。 / Prebuild parallelism; keep at 1 on small control planes. | +| `RUSTFS_FAULT_TEST_BUILD_SETTLE_SECONDS` | `60` | 预编译后原有 RustFS Pod 的稳定校验时间。 / Existing-Pod stability check after prebuild. | +| `RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS` | `40000` | Make runner 强制验收该值。 / Required object count. | +| `RUSTFS_FAULT_TEST_WORKLOAD_CONCURRENCY` | `80` | Make runner 强制验收该值。 / Required concurrency. | +| `RUSTFS_FAULT_TEST_DURATION_SECONDS` | `7200` | 最大故障 TTL。 / Maximum fault TTL. | +| `RUSTFS_FAULT_TEST_REQUEST_TIMEOUT_SECONDS` | `30` | 单次 S3 请求超时。 / Per-request S3 timeout. | +| `RUSTFS_FAULT_TEST_REQUIRE_CLIENT_DISRUPTION` | `false` | 是否要求客户端可见错误。 / Whether client-visible disruption is mandatory. | +| `RUSTFS_FAULT_TEST_CHAOS_NAMESPACE` | `chaos-mesh` | Chaos resource namespace。 | +| `RUSTFS_FAULT_TEST_DM_*` | unset | `dm-flakey` 专用映射参数。 / DM mapping parameters. | diff --git a/e2e/Makefile b/e2e/Makefile new file mode 100644 index 0000000..cff152e --- /dev/null +++ b/e2e/Makefile @@ -0,0 +1,61 @@ +# Copyright 2025 RustFS Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SHELL := /bin/bash + +FAULT_SCRIPT := $(CURDIR)/scripts/fault-test.sh +MANIFEST := $(CURDIR)/Cargo.toml +FAULT_BUILD_JOBS ?= 1 + +.PHONY: help fault-check fault-preflight fault-run fault-run-regular fault-run-dm fault-cleanup + +help: + @echo "RustFS e2e fault-test package" + @echo "" + @echo "Usage:" + @echo " make -C e2e fault-check" + @echo " make -C e2e fault-preflight [SCENARIO=io-eio]" + @echo " make -C e2e fault-run SCENARIO=io-eio" + @echo " make -C e2e fault-run-regular" + @echo " make -C e2e fault-run-dm" + @echo " make -C e2e fault-cleanup" + @echo "" + @echo "Required runtime environment:" + @echo " RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" + @echo " RUSTFS_FAULT_TEST_STORAGE_CLASS" + @echo " RUSTFS_FAULT_TEST_SERVER_IMAGE" + @echo "" + @echo "See e2e/FAULT_TESTING.md for cluster preparation and safety requirements." + +fault-check: + bash -n $(FAULT_SCRIPT) + CARGO_BUILD_JOBS=$(FAULT_BUILD_JOBS) cargo fmt --manifest-path $(MANIFEST) --all --check + CARGO_BUILD_JOBS=$(FAULT_BUILD_JOBS) cargo test --manifest-path $(MANIFEST) + CARGO_BUILD_JOBS=$(FAULT_BUILD_JOBS) cargo clippy --manifest-path $(MANIFEST) --all-targets -- -D warnings + +fault-preflight: + @bash $(FAULT_SCRIPT) preflight "$(or $(SCENARIO),io-eio)" + +fault-run: + @test -n "$(SCENARIO)" || (echo "SCENARIO is required" >&2; exit 2) + @bash $(FAULT_SCRIPT) run "$(SCENARIO)" + +fault-run-regular: + @bash $(FAULT_SCRIPT) run-regular + +fault-run-dm: + @bash $(FAULT_SCRIPT) run dm-flakey + +fault-cleanup: + @bash $(FAULT_SCRIPT) cleanup diff --git a/e2e/README.md b/e2e/README.md index b8f797c..837a1b7 100644 --- a/e2e/README.md +++ b/e2e/README.md @@ -1,6 +1,6 @@ # RustFS Operator E2E Harness -This crate is the Rust-native integration-test harness for release-grade validation of the RustFS Operator and its Console API. +This crate provides the Rust-native Kind e2e harness and shared primitives used by the separate real-cluster fault-test runner. 
The harness is intentionally separated from the main operator crate so e2e-only dependencies stay scoped to the `e2e/` manifest while still being validated by `make e2e-check` and the default `make pre-commit` path. It is driven through the reduced live entrypoints `e2e-live-create`, `e2e-live-run`, `e2e-live-update`, and `e2e-live-delete`. @@ -16,13 +16,17 @@ The harness is split into four top-level domains: ```text e2e/ Cargo.toml + FAULT_TESTING.md package-local fault-test operations manual + scripts/ + fault-test.sh guarded real-cluster fault-test orchestration manifests/ kind-rustfs-e2e.yaml dedicated 1 control-plane + 3 worker Kind cluster src/ lib.rs bin/rustfs-e2e.rs Makefile-internal helper for live workflow steps framework/ - config.rs environment and CI knobs + config.rs dedicated Kind e2e configuration + fault_config.rs real-cluster fault-test configuration and safety checks command.rs safe subprocess wrapper for kind/docker/kubectl kind.rs Kind cluster lifecycle and host mount preparation kubectl.rs kubectl command construction boundary @@ -37,7 +41,7 @@ e2e/ resources.rs namespace/Secret/Tenant apply boundary storage.rs local StorageClass/PV preparation boundary assertions.rs Kubernetes and Tenant status assertions - tenant_factory.rs reusable Tenant manifests for e2e + tenant_factory.rs Kind-local and real-cluster Tenant templates cases/ smoke.rs install and readiness checks operator.rs Tenant status and observed-generation checks @@ -46,7 +50,7 @@ e2e/ smoke.rs ignored live smoke entrypoints operator.rs ignored live Operator assertion console.rs ignored live Console API assertion - faults.rs non-live destructive opt-in guard + faults.rs real-cluster destructive fault-injection suite with scenario-selected runners; not part of e2e case inventory ``` ## Boundary rules @@ -55,10 +59,11 @@ e2e/ 2. `framework::kubectl` is the shell/Kubernetes YAML boundary and must always pin `--context`. 3. `framework::kube_client` is the typed Kubernetes API boundary. 
4. `framework::console_client` is the HTTP boundary for Console API tests. -5. `framework::storage` owns e2e local PV setup; `framework::resources` owns e2e namespace/Secret/Tenant setup. +5. `framework::storage` owns Kind local PV setup; `framework::resources` owns shared namespace/Secret/Tenant lifecycle. 6. `framework::live` owns live-run opt-in and dedicated-context checks. 7. `cases/*` should describe behavior and call framework helpers; avoid shell details there. -8. Destructive tests must use dedicated e2e namespaces and must never run against an arbitrary current context. +8. Kind e2e cases remain in `cases/*`; real-cluster fault tests are intentionally excluded from that inventory. +9. Destructive real-cluster fault tests are documented only in [`FAULT_TESTING.md`](FAULT_TESTING.md). ## Safety defaults diff --git a/e2e/scripts/fault-test.sh b/e2e/scripts/fault-test.sh new file mode 100644 index 0000000..27e59ef --- /dev/null +++ b/e2e/scripts/fault-test.sh @@ -0,0 +1,529 @@ +#!/usr/bin/env bash +# Copyright 2025 RustFS Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -Eeuo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PACKAGE_DIR="$(cd "$SCRIPT_DIR/.." 
&& pwd)" +MANIFEST="$PACKAGE_DIR/Cargo.toml" +MANAGER="rustfs-operator-fault-test" +MANAGER_SELECTOR="app.kubernetes.io/managed-by=$MANAGER" +DEFAULT_SCENARIOS="io-eio pod-kill-one network-partition-one io-read-mistake disk-full warp-under-chaos" +EXPECTED_OBJECTS=40000 +EXPECTED_CONCURRENCY=80 +EXPECTED_PAYLOAD_BYTES=20337459200 +BUILD_JOBS="${RUSTFS_FAULT_TEST_BUILD_JOBS:-1}" +BUILD_SETTLE_SECONDS="${RUSTFS_FAULT_TEST_BUILD_SETTLE_SECONDS:-60}" + +FAULT_NAMESPACE="${RUSTFS_FAULT_TEST_NAMESPACE:-rustfs-fault-test}" +FAULT_TENANT="${RUSTFS_FAULT_TEST_TENANT:-fault-test-tenant}" +CHAOS_NAMESPACE="${RUSTFS_FAULT_TEST_CHAOS_NAMESPACE:-chaos-mesh}" +ACTIVE_PID="" +ACTIVE_ARTIFACTS="" +FAULT_TEST_BINARY="" + +usage() { + cat <<'EOF' +Usage: fault-test.sh [scenario] + +Commands: + preflight [scenario] Validate the current real-cluster environment. + run Run one destructive scenario with health guards. + run-regular Run the six regular scenarios serially. + cleanup Remove managed Chaos and the owned fault namespace. + +Run through the package Make targets documented in e2e/FAULT_TESTING.md. +EOF +} + +die() { + echo "fault-test: $*" >&2 + exit 1 +} + +require_command() { + command -v "$1" >/dev/null 2>&1 || die "required command not found: $1" +} + +kubectl_context() { + kubectl config current-context +} + +kubectl_ns() { + kubectl --context "$RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" -n "$1" "${@:2}" +} + +kubectl_cluster() { + kubectl --context "$RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" "$@" +} + +is_supported_scenario() { + case "$1" in + io-eio|pod-kill-one|network-partition-one|io-read-mistake|disk-full|warp-under-chaos|dm-flakey) + return 0 + ;; + *) + return 1 + ;; + esac +} + +scenario_crd() { + case "$1" in + pod-kill-one) echo "podchaos.chaos-mesh.org" ;; + network-partition-one) echo "networkchaos.chaos-mesh.org" ;; + dm-flakey) echo "" ;; + *) echo "iochaos.chaos-mesh.org" ;; + esac +} + +require_namespace_ownership() { + if ! 
kubectl_cluster get namespace "$FAULT_NAMESPACE" >/dev/null 2>&1; then + return 0 + fi + + local manager tenant + manager="$(kubectl_cluster get namespace "$FAULT_NAMESPACE" -o jsonpath='{.metadata.labels.app\.kubernetes\.io/managed-by}')" + tenant="$(kubectl_cluster get namespace "$FAULT_NAMESPACE" -o jsonpath='{.metadata.annotations.rustfs\.com/fault-test-tenant}')" + [[ "$manager" == "$MANAGER" ]] || die "namespace $FAULT_NAMESPACE is not managed by $MANAGER" + [[ "$tenant" == "$FAULT_TENANT" ]] || die "namespace $FAULT_NAMESPACE is not owned by tenant $FAULT_TENANT" +} + +require_non_fault_tenants_ready() { + local unhealthy + unhealthy="$(kubectl_cluster get tenants -A -o json | jq -r --arg namespace "$FAULT_NAMESPACE" ' + .items[] + | select(.metadata.namespace != $namespace) + | select((.status.currentState // "") != "Ready") + | "\(.metadata.namespace)/\(.metadata.name)=\(.status.currentState // "missing")" + ')" + [[ -z "$unhealthy" ]] || die "non-fault Tenant is not Ready: $unhealthy" +} + +snapshot_non_fault_rustfs_pods() { + kubectl_cluster get pods -A -o json | jq -r --arg namespace "$FAULT_NAMESPACE" ' + .items[] + | select(.metadata.namespace != $namespace) + | select(.metadata.labels["rustfs.tenant"] != null) + | [ + .metadata.namespace, + .metadata.name, + .metadata.uid, + ([.status.containerStatuses[]?.restartCount] | add // 0), + ((.status.phase == "Running") and ([.status.containerStatuses[]?.ready] | all)) + ] + | @tsv + ' | sort +} + +prepare_fault_binary() { + local scenario="$1" run_root="$2" + local before="$run_root/build-pods-before.tsv" + local current="$run_root/build-pods-current.tsv" + local changes="$run_root/build-pod-changes.diff" + local build_messages="$run_root/fault-build.jsonl" + local elapsed=0 interval=10 + local -a build_command=( + cargo test --manifest-path "$MANIFEST" --test faults --no-run + --message-format=json-render-diagnostics + ) + + [[ "$BUILD_JOBS" =~ ^[1-9][0-9]*$ ]] || die "RUSTFS_FAULT_TEST_BUILD_JOBS must be 
a positive integer" + [[ "$BUILD_SETTLE_SECONDS" =~ ^[0-9]+$ ]] || die "RUSTFS_FAULT_TEST_BUILD_SETTLE_SECONDS must be a non-negative integer" + preflight "$scenario" + snapshot_non_fault_rustfs_pods >"$before" + echo "preparing fault-test binary with jobs=$BUILD_JOBS and lowest host priority" + if command -v ionice >/dev/null 2>&1; then + CARGO_BUILD_JOBS="$BUILD_JOBS" nice -n 19 ionice -c3 "${build_command[@]}" \ + >"$build_messages" 2>"$run_root/fault-build.log" + else + CARGO_BUILD_JOBS="$BUILD_JOBS" nice -n 19 "${build_command[@]}" \ + >"$build_messages" 2>"$run_root/fault-build.log" + fi + FAULT_TEST_BINARY="$(jq -r ' + select( + .reason == "compiler-artifact" + and .target.name == "faults" + and (.target.kind | index("test")) + ) + | .executable // empty + ' "$build_messages" | tail -n 1)" + [[ -x "$FAULT_TEST_BINARY" ]] || die "faults test binary was not produced; see $run_root/fault-build.log" + printf '%s\n' "$FAULT_TEST_BINARY" >"$run_root/fault-test-binary.path" + + while (( elapsed <= BUILD_SETTLE_SECONDS )); do + snapshot_non_fault_rustfs_pods >"$current" + if ! 
cmp -s "$before" "$current"; then + diff -u "$before" "$current" >"$changes" || true + die "fault-test build changed a pre-existing RustFS Pod; see $changes" + fi + require_non_fault_tenants_ready + (( elapsed == BUILD_SETTLE_SECONDS )) && break + sleep "$interval" + elapsed=$((elapsed + interval)) + (( elapsed > BUILD_SETTLE_SECONDS )) && elapsed="$BUILD_SETTLE_SECONDS" + done + preflight "$scenario" + echo "fault-test binary ready; pre-existing RustFS Pods remained unchanged for ${BUILD_SETTLE_SECONDS}s" +} + +require_chaos_ready() { + local deployment_ready daemon_ready + deployment_ready="$(kubectl_ns "$CHAOS_NAMESPACE" get deployment chaos-controller-manager -o json | jq -r ' + (.status.readyReplicas // 0) == (.spec.replicas // 0) and (.spec.replicas // 0) > 0 + ')" + daemon_ready="$(kubectl_ns "$CHAOS_NAMESPACE" get daemonset chaos-daemon -o json | jq -r ' + (.status.numberReady // 0) == (.status.desiredNumberScheduled // 0) and (.status.desiredNumberScheduled // 0) > 0 + ')" + [[ "$deployment_ready" == "true" ]] || die "Chaos Mesh controller-manager is not fully Ready" + [[ "$daemon_ready" == "true" ]] || die "Chaos Mesh chaos-daemon is not fully Ready" +} + +require_storage_class() { + local scenario="$1" + local storage_class provisioner pv_count + storage_class="${RUSTFS_FAULT_TEST_STORAGE_CLASS:-}" + [[ -n "$storage_class" ]] || die "RUSTFS_FAULT_TEST_STORAGE_CLASS is required" + provisioner="$(kubectl_cluster get storageclass "$storage_class" -o json | jq -r '.provisioner // ""')" + [[ -n "$provisioner" ]] || die "StorageClass $storage_class has no provisioner" + + if [[ "$scenario" == "dm-flakey" ]]; then + [[ "$provisioner" == "kubernetes.io/no-provisioner" ]] || die "dm-flakey requires a no-provisioner StorageClass" + pv_count="$(kubectl_cluster get pv -o json | jq -r --arg storage_class "$storage_class" ' + [.items[] + | select(.spec.storageClassName == $storage_class) + | select(.status.phase == "Available" or .status.phase == "Bound") + | 
select(.spec.capacity.storage == "100Gi")] + | length + ')" + [[ "$pv_count" -eq 4 ]] || die "dm-flakey requires exactly four Available/Bound 100Gi PVs, found $pv_count" + else + [[ "$provisioner" != "kubernetes.io/no-provisioner" ]] || die "regular scenarios require dynamic provisioning" + fi +} + +preflight() { + local scenario="${1:-io-eio}" + local current_context ready_nodes crd + is_supported_scenario "$scenario" || die "unsupported scenario: $scenario" + + require_command cargo + require_command jq + require_command kubectl + require_command nice + require_command pgrep + [[ -n "${RUSTFS_FAULT_TEST_EXPECTED_CONTEXT:-}" ]] || die "RUSTFS_FAULT_TEST_EXPECTED_CONTEXT is required" + [[ -n "${RUSTFS_FAULT_TEST_SERVER_IMAGE:-}" ]] || die "RUSTFS_FAULT_TEST_SERVER_IMAGE is required" + + current_context="$(kubectl_context)" + [[ "$current_context" == "$RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" ]] || die "current context $current_context does not match expected context $RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" + [[ "$current_context" != kind-* ]] || die "fault tests require a real Kubernetes cluster, got $current_context" + + kubectl_cluster get crd tenants.rustfs.com >/dev/null + ready_nodes="$(kubectl_cluster get nodes -o json | jq -r '[.items[] + | select(.spec.unschedulable != true) + | select(any(.status.conditions[]; .type == "Ready" and .status == "True"))] | length')" + [[ "$ready_nodes" -ge 4 ]] || die "at least four schedulable Ready nodes are required, found $ready_nodes" + + require_storage_class "$scenario" + require_namespace_ownership + require_non_fault_tenants_ready + + if [[ "$scenario" != "dm-flakey" ]]; then + crd="$(scenario_crd "$scenario")" + kubectl_cluster get crd "$crd" >/dev/null + require_chaos_ready + fi + if [[ "$scenario" == "warp-under-chaos" ]]; then + require_command warp + fi + if [[ "$scenario" == "dm-flakey" ]]; then + [[ -n "${RUSTFS_FAULT_TEST_DM_NAME:-}" ]] || die "RUSTFS_FAULT_TEST_DM_NAME is required" + [[ -n 
"${RUSTFS_FAULT_TEST_DM_NODE:-}" ]] || die "RUSTFS_FAULT_TEST_DM_NODE is required" + [[ -n "${RUSTFS_FAULT_TEST_DM_MOUNT_PATH:-}" ]] || die "RUSTFS_FAULT_TEST_DM_MOUNT_PATH is required" + [[ -n "${RUSTFS_FAULT_TEST_DM_FAULT_TABLE:-}" ]] || die "RUSTFS_FAULT_TEST_DM_FAULT_TABLE is required" + kubectl_cluster get namespace "$FAULT_NAMESPACE" >/dev/null 2>&1 || die "dm-flakey requires a pre-created owned fault namespace with privileged Pod Security" + [[ "$(kubectl_cluster get namespace "$FAULT_NAMESPACE" -o jsonpath='{.metadata.labels.pod-security\.kubernetes\.io/enforce}')" == "privileged" ]] || die "dm-flakey requires pod-security.kubernetes.io/enforce=privileged on $FAULT_NAMESPACE" + fi + + echo "preflight passed: context=$current_context scenario=$scenario nodes=$ready_nodes storageClass=${RUSTFS_FAULT_TEST_STORAGE_CLASS}" +} + +preflight_cleanup() { + local current_context + require_command jq + require_command kubectl + [[ -n "${RUSTFS_FAULT_TEST_EXPECTED_CONTEXT:-}" ]] || die "RUSTFS_FAULT_TEST_EXPECTED_CONTEXT is required" + current_context="$(kubectl_context)" + [[ "$current_context" == "$RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" ]] || die "current context $current_context does not match expected context $RUSTFS_FAULT_TEST_EXPECTED_CONTEXT" + [[ "$current_context" != kind-* ]] || die "fault cleanup requires a real Kubernetes cluster, got $current_context" + require_namespace_ownership +} + +cleanup_managed_chaos() { + kubectl_ns "$CHAOS_NAMESPACE" delete iochaos,podchaos,networkchaos \ + -l "$MANAGER_SELECTOR" --ignore-not-found=true --wait=false >/dev/null 2>&1 || true +} + +terminate_process_tree() { + local parent="$1" + local child + for child in $(pgrep -P "$parent" 2>/dev/null || true); do + terminate_process_tree "$child" + done + kill -TERM "$parent" 2>/dev/null || true +} + +handle_signal() { + cleanup_managed_chaos + if [[ -n "$ACTIVE_PID" ]]; then + terminate_process_tree "$ACTIVE_PID" + fi + if [[ -n "$ACTIVE_ARTIFACTS" ]]; then + touch 
"$ACTIVE_ARTIFACTS/interrupted" + echo 130 >"$ACTIVE_ARTIFACTS/exit-code" + capture_cluster_snapshot "$ACTIVE_ARTIFACTS" interrupted + capture_fault_logs "$ACTIVE_ARTIFACTS" + fi + exit 130 +} + +capture_cluster_snapshot() { + local artifacts="$1" stage="$2" + kubectl_cluster get nodes -o wide >"$artifacts/nodes-$stage.txt" 2>&1 || true + kubectl_cluster get tenants -A -o wide >"$artifacts/tenants-$stage.txt" 2>&1 || true + kubectl_cluster get pods -A -o wide >"$artifacts/pods-$stage.txt" 2>&1 || true + kubectl_cluster get pv,pvc -A -o wide >"$artifacts/volumes-$stage.txt" 2>&1 || true + kubectl_ns "$CHAOS_NAMESPACE" get iochaos,podchaos,networkchaos -o yaml >"$artifacts/chaos-$stage.yaml" 2>&1 || true + kubectl_ns "$FAULT_NAMESPACE" get events --sort-by=.lastTimestamp >"$artifacts/events-$stage.txt" 2>&1 || true +} + +capture_fault_logs() { + local artifacts="$1" pod name + for pod in $(kubectl_ns "$FAULT_NAMESPACE" get pods -l "rustfs.tenant=$FAULT_TENANT" -o name 2>/dev/null || true); do + name="${pod#pod/}" + kubectl_ns "$FAULT_NAMESPACE" logs "$pod" >"$artifacts/$name.log" 2>&1 || true + kubectl_ns "$FAULT_NAMESPACE" logs "$pod" --previous >"$artifacts/$name-previous.log" 2>&1 || true + done +} + +health_is_safe() { + local baseline_nodes="$1" baseline_tenants="$2" + local current_nodes namespace tenant state + current_nodes="$(kubectl_cluster get nodes -o json 2>/dev/null | jq -r '[.items[] | select(any(.status.conditions[]; .type == "Ready" and .status == "True"))] | length' 2>/dev/null || echo 0)" + [[ "$current_nodes" -eq "$baseline_nodes" ]] || return 1 + + while IFS=$'\t' read -r namespace tenant; do + [[ -n "$namespace" ]] || continue + state="$(kubectl_ns "$namespace" get tenant "$tenant" -o jsonpath='{.status.currentState}' 2>/dev/null || true)" + [[ "$state" == "Ready" ]] || return 1 + done <"$baseline_tenants" + return 0 +} + +find_artifact() { + find "$1" -name "$2" -type f -print -quit +} + +validate_scenario_artifacts() { + local scenario="$1" 
artifacts="$2" run_root="$3" + local plan evidence checker summary seed disruptions recommitted committed + plan="$(find_artifact "$artifacts" workload-plan.json)" + evidence="$(find_artifact "$artifacts" fault-evidence.json)" + checker="$(find_artifact "$artifacts" checker-report.json)" + summary="$(find_artifact "$artifacts" workload-summary.json)" + [[ -f "$plan" ]] || die "$scenario did not produce workload-plan.json" + [[ -f "$evidence" ]] || die "$scenario did not produce fault-evidence.json" + [[ -f "$checker" ]] || die "$scenario did not produce checker-report.json" + [[ -f "$summary" ]] || die "$scenario did not produce workload-summary.json" + + jq -e --argjson objects "$EXPECTED_OBJECTS" --argjson concurrency "$EXPECTED_CONCURRENCY" --argjson payload "$EXPECTED_PAYLOAD_BYTES" ' + .object_count == $objects + and .concurrency == $concurrency + and .total_payload_bytes == $payload + and .size_distribution == [ + {"size_bytes":4096,"object_count":34000}, + {"size_bytes":16384,"object_count":4000}, + {"size_bytes":8388608,"object_count":1600}, + {"size_bytes":16777216,"object_count":400} + ] + ' "$plan" >/dev/null || die "$scenario workload plan does not match the required profile" + jq -e '.injected == true and .active_during_workload == true and .recovered == true' "$evidence" >/dev/null || die "$scenario fault evidence is incomplete" + jq -e --argjson objects "$EXPECTED_OBJECTS" ' + .committed_puts == $objects + and (.missing_committed_objects | length) == 0 + and (.hash_mismatches | length) == 0 + and (.successful_corrupted_reads | length) == 0 + and (.list_warnings | length) == 0 + and .tenant_recovered == true + and .passed == true + ' "$checker" >/dev/null || die "$scenario checker verdict failed" + + seed="$(jq -r '.seed' "$plan")" + disruptions="$(jq -r '.client_disruptions' "$evidence")" + recommitted="$(jq -r '.recommitted_after_recovery' "$summary")" + committed="$(jq -r '.committed_puts' "$checker")" + printf 
'%s\t%s\t0\t%s\t%s\t%s\t0\t0\t0\t0\ttrue\n' \ + "$scenario" "$seed" "$disruptions" "$recommitted" "$committed" >>"$run_root/validation-summary.tsv" +} + +run_scenario() { + local scenario="$1" run_root="$2" + local artifacts="$run_root/$scenario" + local baseline_nodes baseline_tenants test_pid rc current_time health_checks + preflight "$scenario" + mkdir -p "$artifacts" + baseline_nodes="$(kubectl_cluster get nodes -o json | jq -r '.items | length')" + baseline_tenants="$artifacts/baseline-tenants.tsv" + kubectl_cluster get tenants -A -o json | jq -r --arg namespace "$FAULT_NAMESPACE" ' + .items[] | select(.metadata.namespace != $namespace) | [.metadata.namespace,.metadata.name] | @tsv + ' >"$baseline_tenants" + capture_cluster_snapshot "$artifacts" before + + echo "starting scenario=$scenario artifacts=$artifacts" + ( + set +e + RUSTFS_FAULT_TEST_DESTRUCTIVE=1 \ + RUSTFS_FAULT_TEST_SCENARIO="$scenario" \ + RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS="$EXPECTED_OBJECTS" \ + RUSTFS_FAULT_TEST_WORKLOAD_CONCURRENCY="$EXPECTED_CONCURRENCY" \ + RUSTFS_FAULT_TEST_DURATION_SECONDS="${RUSTFS_FAULT_TEST_DURATION_SECONDS:-7200}" \ + RUSTFS_FAULT_TEST_ARTIFACTS="$artifacts" \ + "$FAULT_TEST_BINARY" --ignored --test-threads=1 --nocapture \ + >"$artifacts/test.log" 2>&1 + echo "$?" >"$artifacts/test-exit-code.tmp" + ) & + test_pid=$! 
+ ACTIVE_PID="$test_pid" + ACTIVE_ARTIFACTS="$artifacts" + health_checks=0 + + while kill -0 "$test_pid" 2>/dev/null; do + current_time="$(date -u +%FT%TZ)" + health_checks=$((health_checks + 1)) + if health_is_safe "$baseline_nodes" "$baseline_tenants"; then + echo "$current_time safe=true" >>"$artifacts/health-watch.log" + if (( health_checks % 6 == 0 )); then + echo "scenario=$scenario running safe=true time=$current_time" + fi + else + echo "$current_time safe=false" >>"$artifacts/health-watch.log" + touch "$artifacts/health-guard-failed" + cleanup_managed_chaos + terminate_process_tree "$test_pid" + break + fi + sleep 10 + done + + wait "$test_pid" 2>/dev/null || true + ACTIVE_PID="" + ACTIVE_ARTIFACTS="" + rc=125 + [[ -f "$artifacts/test-exit-code.tmp" ]] && rc="$(cat "$artifacts/test-exit-code.tmp")" + [[ ! -f "$artifacts/health-guard-failed" ]] || rc=90 + echo "$rc" >"$artifacts/exit-code" + capture_cluster_snapshot "$artifacts" after + capture_fault_logs "$artifacts" + + if [[ "$rc" -ne 0 ]]; then + cleanup_managed_chaos + echo "scenario failed: $scenario rc=$rc log=$artifacts/test.log" >&2 + return "$rc" + fi + validate_scenario_artifacts "$scenario" "$artifacts" "$run_root" + echo "scenario passed: $scenario" +} + +new_run_root() { + if [[ -n "${RUSTFS_FAULT_TEST_RUN_ROOT:-}" ]]; then + echo "$RUSTFS_FAULT_TEST_RUN_ROOT" + else + echo "$PACKAGE_DIR/target/fault-tests/$(date -u +%Y%m%dT%H%M%SZ)" + fi +} + +initialize_summary() { + local run_root="$1" + mkdir -p "$run_root" + if [[ ! 
-f "$run_root/validation-summary.tsv" ]]; then + printf 'scenario\tseed\texit\tdisruptions\trecommitted\tcommitted\tmissing\thash_mismatch\tcorrupt_read\tlist_warning\trecovered\n' \ + >"$run_root/validation-summary.tsv" + fi +} + +run_one() { + local scenario="$1" run_root + is_supported_scenario "$scenario" || die "unsupported scenario: $scenario" + run_root="$(new_run_root)" + initialize_summary "$run_root" + prepare_fault_binary "$scenario" "$run_root" + run_scenario "$scenario" "$run_root" + echo "run artifacts: $run_root" +} + +run_regular() { + local run_root scenario prepared=false + local scenarios="${RUSTFS_FAULT_TEST_SCENARIOS:-$DEFAULT_SCENARIOS}" + run_root="$(new_run_root)" + initialize_summary "$run_root" + for scenario in $scenarios; do + [[ "$scenario" != "dm-flakey" ]] || die "run-regular cannot include dm-flakey" + if [[ "$prepared" == "false" ]]; then + prepare_fault_binary "$scenario" "$run_root" + prepared=true + fi + run_scenario "$scenario" "$run_root" || return $? 
+ done + echo "regular scenario artifacts: $run_root" +} + +cleanup() { + cleanup_managed_chaos + if kubectl_cluster get namespace "$FAULT_NAMESPACE" >/dev/null 2>&1; then + require_namespace_ownership + kubectl_cluster delete namespace "$FAULT_NAMESPACE" --wait=true + fi + if kubectl_ns "$CHAOS_NAMESPACE" get iochaos,podchaos,networkchaos -l "$MANAGER_SELECTOR" -o name 2>/dev/null | grep -q .; then + die "managed Chaos resources remain after cleanup" + fi + echo "managed fault-test resources cleaned; external StorageClasses, PVs, and host devices were not changed" +} + +trap handle_signal INT TERM HUP + +case "${1:-help}" in + help|-h|--help) + usage + ;; + preflight) + preflight "${2:-io-eio}" + ;; + run) + [[ -n "${2:-}" ]] || die "scenario is required" + run_one "$2" + ;; + run-regular) + run_regular + ;; + cleanup) + preflight_cleanup + cleanup + ;; + *) + usage >&2 + die "unknown command: $1" + ;; +esac diff --git a/e2e/src/bin/rustfs-e2e.rs b/e2e/src/bin/rustfs-e2e.rs index a3dc28b..b662ac4 100644 --- a/e2e/src/bin/rustfs-e2e.rs +++ b/e2e/src/bin/rustfs-e2e.rs @@ -110,7 +110,7 @@ fn sanitize_live_storage(config: &E2eConfig) -> Result<()> { fn reset_live_fixtures(config: &E2eConfig) -> Result<()> { live::require_live_enabled(config)?; live::ensure_dedicated_context(config)?; - resources::reset_smoke_tenant_resources(config)?; + resources::reset_tenant_resources(config)?; storage::reset_default_local_storage(config)?; cert_manager_tls::reset_positive_case_resources(config)?; Ok(()) diff --git a/e2e/src/cases/mod.rs b/e2e/src/cases/mod.rs index 04933f3..51a68dc 100644 --- a/e2e/src/cases/mod.rs +++ b/e2e/src/cases/mod.rs @@ -27,6 +27,12 @@ pub enum Suite { CertManagerTls, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CaseStatus { + Executable, + Planned, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct CaseSpec { pub suite: Suite, @@ -34,6 +40,7 @@ pub struct CaseSpec { pub description: &'static str, pub boundary: &'static str, pub 
ci_phase: &'static str, + pub status: CaseStatus, } impl CaseSpec { @@ -50,6 +57,24 @@ impl CaseSpec { description, boundary, ci_phase, + status: CaseStatus::Executable, + } + } + + pub const fn planned( + suite: Suite, + name: &'static str, + description: &'static str, + boundary: &'static str, + ci_phase: &'static str, + ) -> Self { + Self { + suite, + name, + description, + boundary, + ci_phase, + status: CaseStatus::Planned, } } } @@ -66,7 +91,7 @@ pub fn all_cases() -> Vec { #[cfg(test)] mod tests { - use super::{Suite, all_cases}; + use super::{CaseStatus, Suite, all_cases}; use std::collections::{HashMap, HashSet}; #[test] @@ -97,13 +122,15 @@ mod tests { fn cases_are_mapped_to_ci_phases_and_architecture_boundaries() { let missing = all_cases() .into_iter() - .filter(|case| case.boundary.is_empty() || case.ci_phase.is_empty()) + .filter(|case| { + case.description.is_empty() || case.boundary.is_empty() || case.ci_phase.is_empty() + }) .map(|case| case.name) .collect::>(); assert!( missing.is_empty(), - "cases missing boundary/ci phase: {missing:?}" + "cases missing description/boundary/ci phase: {missing:?}" ); } @@ -111,6 +138,7 @@ mod tests { fn executable_cases_are_present_for_each_suite() { let counts = all_cases() .into_iter() + .filter(|case| case.status == CaseStatus::Executable) .fold(HashMap::new(), |mut acc, case| { *acc.entry(case.suite).or_insert(0usize) += 1; acc diff --git a/e2e/src/framework/artifacts.rs b/e2e/src/framework/artifacts.rs index a7942f4..25455d5 100644 --- a/e2e/src/framework/artifacts.rs +++ b/e2e/src/framework/artifacts.rs @@ -16,7 +16,7 @@ use anyhow::Result; use std::fs; use std::path::{Path, PathBuf}; -use crate::framework::{command::CommandSpec, config::E2eConfig, kubectl::Kubectl}; +use crate::framework::{command::CommandSpec, config::ClusterTestConfig, kubectl::Kubectl}; const ERASURE_READ_QUORUM: &str = "erasure read quorum"; const DNS_LOOKUP_FAILURE: &str = "failed to lookup address information"; @@ -62,7 +62,7 @@ impl 
ArtifactCollector { pub fn collect_kubernetes_snapshot( &self, case_name: &str, - config: &E2eConfig, + config: &ClusterTestConfig, ) -> Result { let mut combined_output = String::new(); @@ -90,7 +90,7 @@ impl ArtifactCollector { } } -fn kubernetes_snapshot_commands(config: &E2eConfig) -> Vec { +fn kubernetes_snapshot_commands(config: &ClusterTestConfig) -> Vec { let kubectl = Kubectl::new(config); let operator_kubectl = Kubectl::new(config).namespaced(&config.operator_namespace); let test_kubectl = Kubectl::new(config).namespaced(&config.test_namespace); @@ -193,7 +193,7 @@ fn kubernetes_snapshot_commands(config: &E2eConfig) -> Vec { fn diagnose_snapshot(snapshot: &str) -> String { let mut lines = vec![ - "RustFS Operator e2e diagnostic summary".to_string(), + "RustFS Operator test diagnostic summary".to_string(), String::new(), ]; let mut matched = false; @@ -203,9 +203,8 @@ fn diagnose_snapshot(snapshot: &str) -> String { lines.extend([ format!("Detected `{ERASURE_READ_QUORUM}` in RustFS pod logs."), "Meaning: RustFS ECStore could not read a majority of matching erasure format metadata during startup.".to_string(), - "Most likely live-e2e causes: stale or partially initialized data in dedicated local PV host paths, peer startup/DNS timing, or a RustFS bootstrap retry window that ended before quorum converged.".to_string(), + "Most likely test causes: stale or partially initialized volumes, peer startup/DNS timing, or a RustFS bootstrap retry window that ended before quorum converged.".to_string(), "Inspect: rustfs-pods-current.log, rustfs-pods-previous.log, tenant-describe.txt, rustfs-pods-describe.txt, and pv-paths.txt.".to_string(), - "Recovery for the dedicated e2e cluster: RUSTFS_E2E_LIVE=1 make e2e-live-delete && RUSTFS_E2E_LIVE=1 make e2e-live-create && RUSTFS_E2E_LIVE=1 make e2e-live-run".to_string(), String::new(), ]); } @@ -293,7 +292,7 @@ mod tests { assert!(diagnosis.contains("Detected `erasure read quorum`")); assert!(diagnosis.contains("ECStore 
could not read a majority")); - assert!(diagnosis.contains("e2e-live-delete")); + assert!(diagnosis.contains("stale or partially initialized volumes")); } #[test] diff --git a/e2e/src/framework/cert_manager_tls.rs b/e2e/src/framework/cert_manager_tls.rs index 15caa66..78acfab 100644 --- a/e2e/src/framework/cert_manager_tls.rs +++ b/e2e/src/framework/cert_manager_tls.rs @@ -101,7 +101,7 @@ pub fn external_secret_storage_layout(config: &E2eConfig) -> storage::LocalStora pub fn reset_positive_case_resources(config: &E2eConfig) -> Result<()> { let managed = managed_certificate_case_config(config); - resources::reset_smoke_tenant_resources(&managed)?; + resources::reset_tenant_resources(&managed)?; storage::reset_local_storage_for_layout( &managed, &managed_certificate_storage_layout(&managed), @@ -109,7 +109,7 @@ pub fn reset_positive_case_resources(config: &E2eConfig) -> Result<()> { .context("reset managed cert-manager TLS e2e storage")?; let external = external_secret_case_config(config); - resources::reset_smoke_tenant_resources(&external)?; + resources::reset_tenant_resources(&external)?; storage::reset_local_storage_for_layout(&external, &external_secret_storage_layout(&external)) .context("reset external Secret cert-manager TLS e2e storage")?; diff --git a/e2e/src/framework/chaos_mesh.rs b/e2e/src/framework/chaos_mesh.rs new file mode 100644 index 0000000..a0c1083 --- /dev/null +++ b/e2e/src/framework/chaos_mesh.rs @@ -0,0 +1,730 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Context, Result, bail, ensure}; +use serde_json::Value; +use std::thread::sleep; +use std::time::{Duration, Instant}; + +use crate::framework::{config::ClusterTestConfig, kubectl::Kubectl}; + +const IOCHAOS_CRD: &str = "iochaos.chaos-mesh.org"; +const PODCHAOS_CRD: &str = "podchaos.chaos-mesh.org"; +const NETWORKCHAOS_CRD: &str = "networkchaos.chaos-mesh.org"; +const RUN_ID_LABEL: &str = "rustfs-fault-test/run-id"; +const SCENARIO_LABEL: &str = "rustfs-fault-test/scenario"; +const MANAGED_BY_LABEL: &str = "app.kubernetes.io/managed-by"; +const MANAGED_BY_VALUE: &str = "rustfs-operator-fault-test"; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum IoChaosAction { + Fault { + errno: u8, + }, + Mistake { + filling: String, + max_occurrences: u8, + max_length: usize, + }, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct IoChaosSpec { + pub name: String, + pub namespace: String, + pub run_id: String, + pub scenario: String, + pub target_namespace: String, + pub tenant_name: String, + pub container_name: String, + pub volume_path: String, + pub methods: Vec, + pub action: IoChaosAction, + pub percent: u8, + pub duration: Duration, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PodChaosSpec { + pub name: String, + pub namespace: String, + pub run_id: String, + pub scenario: String, + pub target_namespace: String, + pub tenant_name: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct NetworkChaosSpec { + pub name: String, + pub namespace: String, + pub run_id: String, + pub scenario: String, + pub target_namespace: String, + pub tenant_name: String, + pub duration: Duration, +} + +#[derive(Debug, Clone)] +pub struct ChaosGuard { + config: ClusterTestConfig, + kind: &'static str, + namespace: String, + name: String, + deleted: bool, +} + +impl IoChaosSpec { + pub fn eio_on_rustfs_volume( + config: &ClusterTestConfig, + 
chaos_namespace: impl Into, + run_id: impl Into, + scenario: impl Into, + volume_path: impl Into, + percent: u8, + duration: Duration, + ) -> Result { + ensure!( + (1..=100).contains(&percent), + "IOChaos percent must be in 1..=100, got {percent}" + ); + ensure!( + duration > Duration::ZERO, + "IOChaos duration must be positive" + ); + + let run_id = run_id.into(); + let short_run_id = run_id.chars().take(12).collect::(); + let scenario = scenario.into(); + + Ok(Self { + name: format!("rustfs-fault-io-eio-{short_run_id}"), + namespace: chaos_namespace.into(), + run_id, + scenario, + target_namespace: config.test_namespace.clone(), + tenant_name: config.tenant_name.clone(), + container_name: "rustfs".to_string(), + volume_path: volume_path.into(), + methods: vec!["READ".to_string(), "WRITE".to_string()], + action: IoChaosAction::Fault { errno: 5 }, + percent, + duration, + }) + } + + pub fn read_mistake_on_rustfs_volume( + config: &ClusterTestConfig, + chaos_namespace: impl Into, + run_id: impl Into, + scenario: impl Into, + volume_path: impl Into, + percent: u8, + duration: Duration, + ) -> Result { + ensure!( + (1..=100).contains(&percent), + "IOChaos percent must be in 1..=100, got {percent}" + ); + ensure!( + duration > Duration::ZERO, + "IOChaos duration must be positive" + ); + + let run_id = run_id.into(); + let short_run_id = run_id.chars().take(12).collect::(); + let scenario = scenario.into(); + + Ok(Self { + name: format!("rustfs-fault-io-mistake-{short_run_id}"), + namespace: chaos_namespace.into(), + run_id, + scenario, + target_namespace: config.test_namespace.clone(), + tenant_name: config.tenant_name.clone(), + container_name: "rustfs".to_string(), + volume_path: volume_path.into(), + methods: vec!["READ".to_string()], + action: IoChaosAction::Mistake { + filling: "random".to_string(), + max_occurrences: 1, + max_length: 4096, + }, + percent, + duration, + }) + } + + pub fn enospc_on_rustfs_volume( + config: &ClusterTestConfig, + chaos_namespace: 
impl Into, + run_id: impl Into, + scenario: impl Into, + volume_path: impl Into, + percent: u8, + duration: Duration, + ) -> Result { + ensure!( + (1..=100).contains(&percent), + "IOChaos percent must be in 1..=100, got {percent}" + ); + ensure!( + duration > Duration::ZERO, + "IOChaos duration must be positive" + ); + + let run_id = run_id.into(); + let short_run_id = run_id.chars().take(12).collect::(); + let scenario = scenario.into(); + + Ok(Self { + name: format!("rustfs-fault-enospc-{short_run_id}"), + namespace: chaos_namespace.into(), + run_id, + scenario, + target_namespace: config.test_namespace.clone(), + tenant_name: config.tenant_name.clone(), + container_name: "rustfs".to_string(), + volume_path: volume_path.into(), + methods: vec!["WRITE".to_string()], + action: IoChaosAction::Fault { errno: 28 }, + percent, + duration, + }) + } + + pub fn manifest(&self) -> String { + let methods = self + .methods + .iter() + .map(|method| format!(" - {method}")) + .collect::>() + .join("\n"); + let seconds = self.duration.as_secs(); + let action = self.action_manifest(); + + format!( + r#"apiVersion: chaos-mesh.org/v1alpha1 +kind: IOChaos +metadata: + name: {name} + namespace: {namespace} + labels: + {run_id_label}: "{run_id}" + {scenario_label}: "{scenario}" + {managed_by_label}: {managed_by_value} +spec: +{action} + mode: one + selector: + namespaces: + - {target_namespace} + labelSelectors: + rustfs.tenant: {tenant_name} + containerNames: + - {container_name} + volumePath: {volume_path} + path: {volume_path}/**/* + methods: +{methods} + percent: {percent} + duration: "{seconds}s" +"#, + name = self.name, + namespace = self.namespace, + run_id_label = RUN_ID_LABEL, + run_id = self.run_id, + scenario_label = SCENARIO_LABEL, + scenario = self.scenario, + managed_by_label = MANAGED_BY_LABEL, + managed_by_value = MANAGED_BY_VALUE, + target_namespace = self.target_namespace, + tenant_name = self.tenant_name, + container_name = self.container_name, + volume_path = 
self.volume_path, + methods = methods, + percent = self.percent, + action = action, + ) + } + + fn action_manifest(&self) -> String { + match &self.action { + IoChaosAction::Fault { errno } => { + format!(" action: fault\n errno: {errno}") + } + IoChaosAction::Mistake { + filling, + max_occurrences, + max_length, + } => format!( + r#" action: mistake + mistake: + filling: {filling} + maxOccurrences: {max_occurrences} + maxLength: {max_length}"# + ), + } + } +} + +impl PodChaosSpec { + pub fn kill_one_rustfs_pod( + config: &ClusterTestConfig, + chaos_namespace: impl Into, + run_id: impl Into, + scenario: impl Into, + ) -> Self { + let run_id = run_id.into(); + let short_run_id = run_id.chars().take(12).collect::(); + Self { + name: format!("rustfs-fault-pod-kill-{short_run_id}"), + namespace: chaos_namespace.into(), + run_id, + scenario: scenario.into(), + target_namespace: config.test_namespace.clone(), + tenant_name: config.tenant_name.clone(), + } + } + + pub fn manifest(&self) -> String { + format!( + r#"apiVersion: chaos-mesh.org/v1alpha1 +kind: PodChaos +metadata: + name: {name} + namespace: {namespace} + labels: + {run_id_label}: "{run_id}" + {scenario_label}: "{scenario}" + {managed_by_label}: {managed_by_value} +spec: + action: pod-kill + mode: one + selector: + namespaces: + - {target_namespace} + labelSelectors: + rustfs.tenant: {tenant_name} +"#, + name = self.name, + namespace = self.namespace, + run_id_label = RUN_ID_LABEL, + run_id = self.run_id, + scenario_label = SCENARIO_LABEL, + scenario = self.scenario, + managed_by_label = MANAGED_BY_LABEL, + managed_by_value = MANAGED_BY_VALUE, + target_namespace = self.target_namespace, + tenant_name = self.tenant_name, + ) + } +} + +impl NetworkChaosSpec { + pub fn partition_one_rustfs_pod( + config: &ClusterTestConfig, + chaos_namespace: impl Into, + run_id: impl Into, + scenario: impl Into, + duration: Duration, + ) -> Result { + ensure!( + duration > Duration::ZERO, + "NetworkChaos duration must be 
positive" + ); + + let run_id = run_id.into(); + let short_run_id = run_id.chars().take(12).collect::(); + Ok(Self { + name: format!("rustfs-fault-net-partition-{short_run_id}"), + namespace: chaos_namespace.into(), + run_id, + scenario: scenario.into(), + target_namespace: config.test_namespace.clone(), + tenant_name: config.tenant_name.clone(), + duration, + }) + } + + pub fn manifest(&self) -> String { + let seconds = self.duration.as_secs(); + format!( + r#"apiVersion: chaos-mesh.org/v1alpha1 +kind: NetworkChaos +metadata: + name: {name} + namespace: {namespace} + labels: + {run_id_label}: "{run_id}" + {scenario_label}: "{scenario}" + {managed_by_label}: {managed_by_value} +spec: + action: partition + mode: one + selector: + namespaces: + - {target_namespace} + labelSelectors: + rustfs.tenant: {tenant_name} + direction: both + target: + mode: all + selector: + namespaces: + - {target_namespace} + labelSelectors: + rustfs.tenant: {tenant_name} + duration: "{seconds}s" +"#, + name = self.name, + namespace = self.namespace, + run_id_label = RUN_ID_LABEL, + run_id = self.run_id, + scenario_label = SCENARIO_LABEL, + scenario = self.scenario, + managed_by_label = MANAGED_BY_LABEL, + managed_by_value = MANAGED_BY_VALUE, + target_namespace = self.target_namespace, + tenant_name = self.tenant_name, + ) + } +} + +pub fn require_iochaos_crd(config: &ClusterTestConfig) -> Result<()> { + require_crd(config, IOCHAOS_CRD, "Chaos Mesh IOChaos") +} + +pub fn require_podchaos_crd(config: &ClusterTestConfig) -> Result<()> { + require_crd(config, PODCHAOS_CRD, "Chaos Mesh PodChaos") +} + +pub fn require_networkchaos_crd(config: &ClusterTestConfig) -> Result<()> { + require_crd(config, NETWORKCHAOS_CRD, "Chaos Mesh NetworkChaos") +} + +fn require_crd(config: &ClusterTestConfig, crd: &str, description: &str) -> Result<()> { + let output = Kubectl::new(config).command(["get", "crd", crd]).run()?; + ensure!( + output.code == Some(0), + "{description} CRD {crd} is required for fault 
tests; install Chaos Mesh before running faults\nstdout:\n{}\nstderr:\n{}", + output.stdout, + output.stderr + ); + Ok(()) +} + +pub fn cleanup_run(config: &ClusterTestConfig, namespace: &str, run_id: &str) -> Result<()> { + let selector = format!("{RUN_ID_LABEL}={run_id}"); + for kind in ["iochaos", "podchaos", "networkchaos"] { + Kubectl::new(config) + .namespaced(namespace) + .command(["delete", kind, "-l", &selector, "--ignore-not-found"]) + .run_checked()?; + } + Ok(()) +} + +pub fn cleanup_run_kind( + config: &ClusterTestConfig, + namespace: &str, + run_id: &str, + kind: &str, +) -> Result<()> { + let selector = format!("{RUN_ID_LABEL}={run_id}"); + Kubectl::new(config) + .namespaced(namespace) + .command(["delete", kind, "-l", &selector, "--ignore-not-found"]) + .run_checked()?; + Ok(()) +} + +pub fn cleanup_managed_iochaos(config: &ClusterTestConfig, namespace: &str) -> Result<()> { + cleanup_managed_kind(config, namespace, "iochaos") +} + +pub fn cleanup_managed_podchaos(config: &ClusterTestConfig, namespace: &str) -> Result<()> { + cleanup_managed_kind(config, namespace, "podchaos") +} + +pub fn cleanup_managed_networkchaos(config: &ClusterTestConfig, namespace: &str) -> Result<()> { + cleanup_managed_kind(config, namespace, "networkchaos") +} + +fn cleanup_managed_kind(config: &ClusterTestConfig, namespace: &str, kind: &str) -> Result<()> { + let selector = format!("{MANAGED_BY_LABEL}={MANAGED_BY_VALUE}"); + Kubectl::new(config) + .namespaced(namespace) + .command(["delete", kind, "-l", &selector, "--ignore-not-found"]) + .run_checked()?; + Ok(()) +} + +pub fn apply_iochaos(config: &ClusterTestConfig, spec: &IoChaosSpec) -> Result { + cleanup_run_kind(config, &spec.namespace, &spec.run_id, "iochaos")?; + Kubectl::new(config) + .namespaced(&spec.namespace) + .apply_yaml_command(spec.manifest()) + .run_checked()?; + + Ok(ChaosGuard { + config: config.clone(), + kind: "iochaos", + namespace: spec.namespace.clone(), + name: spec.name.clone(), + deleted: 
false, + }) +} + +pub fn apply_podchaos(config: &ClusterTestConfig, spec: &PodChaosSpec) -> Result { + cleanup_run_kind(config, &spec.namespace, &spec.run_id, "podchaos")?; + Kubectl::new(config) + .namespaced(&spec.namespace) + .apply_yaml_command(spec.manifest()) + .run_checked()?; + + Ok(ChaosGuard { + config: config.clone(), + kind: "podchaos", + namespace: spec.namespace.clone(), + name: spec.name.clone(), + deleted: false, + }) +} + +pub fn apply_networkchaos( + config: &ClusterTestConfig, + spec: &NetworkChaosSpec, +) -> Result { + cleanup_run_kind(config, &spec.namespace, &spec.run_id, "networkchaos")?; + Kubectl::new(config) + .namespaced(&spec.namespace) + .apply_yaml_command(spec.manifest()) + .run_checked()?; + + Ok(ChaosGuard { + config: config.clone(), + kind: "networkchaos", + namespace: spec.namespace.clone(), + name: spec.name.clone(), + deleted: false, + }) +} + +impl ChaosGuard { + pub fn kind(&self) -> &'static str { + self.kind + } + + pub fn name(&self) -> &str { + &self.name + } + + pub fn wait_active(&self, timeout: Duration) -> Result<()> { + let deadline = Instant::now() + timeout; + + loop { + let status_snapshot = match self.json() { + Ok(status) => { + if chaos_experiment_is_active(&status)? 
{ + return Ok(()); + } + status + } + Err(error) => { + format!("failed to read {kind} status: {error}", kind = self.kind) + } + }; + + if Instant::now() >= deadline { + let describe = self.describe().unwrap_or_else(|error| { + format!( + "failed to describe {kind}/{name}: {error}", + kind = self.kind, + name = self.name + ) + }); + bail!( + "timed out waiting for {kind}/{name} to become active after {timeout:?}\nlast status:\n{status_snapshot}\n\ndescribe:\n{describe}", + kind = self.kind, + name = self.name, + ); + } + + sleep(Duration::from_secs(1)); + } + } + + pub fn ensure_active(&self, stage: &str) -> Result<()> { + let status = self.json()?; + ensure!( + chaos_experiment_is_active(&status)?, + "{kind}/{name} is not active at {stage}; status:\n{status}", + kind = self.kind, + name = self.name + ); + Ok(()) + } + + pub fn describe(&self) -> Result { + let output = Kubectl::new(&self.config) + .namespaced(&self.namespace) + .command(["describe", self.kind, &self.name]) + .run_checked()?; + Ok(output.stdout) + } + + pub fn yaml(&self) -> Result { + let output = Kubectl::new(&self.config) + .namespaced(&self.namespace) + .command(["get", self.kind, &self.name, "-o", "yaml"]) + .run_checked()?; + Ok(output.stdout) + } + + pub fn delete(&mut self) -> Result<()> { + self.delete_inner()?; + self.deleted = true; + Ok(()) + } + + pub fn json(&self) -> Result { + let output = Kubectl::new(&self.config) + .namespaced(&self.namespace) + .command(["get", self.kind, &self.name, "-o", "json"]) + .run_checked()?; + Ok(output.stdout) + } + + fn delete_inner(&self) -> Result<()> { + Kubectl::new(&self.config) + .namespaced(&self.namespace) + .command(["delete", self.kind, &self.name, "--ignore-not-found"]) + .run_checked()?; + Ok(()) + } +} + +fn chaos_experiment_is_active(raw: &str) -> Result { + let value = serde_json::from_str::(raw).context("parse Chaos Mesh status json")?; + let selected = condition_status(&value, "Selected").is_some_and(|status| status == "True"); + let 
injected = condition_status(&value, "AllInjected") + .or_else(|| condition_status(&value, "Injected")) + .is_some_and(|status| status == "True"); + let recovered = condition_status(&value, "AllRecovered").is_some_and(|status| status == "True"); + + Ok(selected && injected && !recovered) +} + +fn condition_status(value: &Value, condition_type: &str) -> Option { + value + .pointer("/status/conditions")? + .as_array()? + .iter() + .find(|condition| condition.get("type").and_then(Value::as_str) == Some(condition_type))? + .get("status")? + .as_str() + .map(str::to_string) +} + +impl Drop for ChaosGuard { + fn drop(&mut self) { + if !self.deleted { + let _ = self.delete_inner(); + } + } +} + +#[cfg(test)] +mod tests { + use super::{IoChaosSpec, chaos_experiment_is_active}; + use crate::framework::fault_config::FaultTestConfig; + use std::time::Duration; + + #[test] + fn iochaos_manifest_targets_rustfs_workload_only() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let spec = IoChaosSpec::eio_on_rustfs_volume( + &config.cluster, + "chaos-mesh", + "run-1234567890", + "io-eio", + "/data/rustfs0", + 20, + Duration::from_secs(60), + ) + .expect("valid io chaos"); + let manifest = spec.manifest(); + + assert!(manifest.contains("kind: IOChaos")); + assert!(manifest.contains("namespace: chaos-mesh")); + assert!(manifest.contains("rustfs.tenant: fault-test-tenant")); + assert!(manifest.contains("rustfs-fault-test/run-id")); + assert!(manifest.contains("rustfs-operator-fault-test")); + assert!(manifest.contains("containerNames:\n - rustfs")); + assert!(manifest.contains("volumePath: /data/rustfs0")); + assert!(manifest.contains("errno: 5")); + assert!(manifest.contains("percent: 20")); + } + + #[test] + fn enospc_manifest_targets_only_volume_writes() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let spec = IoChaosSpec::enospc_on_rustfs_volume( + &config.cluster, + "chaos-mesh", + "run-1234567890", + "disk-full", + 
"/data/rustfs0", + 100, + Duration::from_secs(60), + ) + .expect("valid enospc chaos"); + let manifest = spec.manifest(); + + assert!(manifest.contains("errno: 28")); + assert!(manifest.contains("methods:\n - WRITE")); + assert!(manifest.contains("percent: 100")); + assert!(!manifest.contains(" - READ")); + } + + #[test] + fn iochaos_active_requires_selected_and_injected_not_recovered() { + let status = r#"{ + "status": { + "conditions": [ + {"type": "Selected", "status": "True"}, + {"type": "AllInjected", "status": "True"}, + {"type": "AllRecovered", "status": "False"} + ] + } + }"#; + + assert!(chaos_experiment_is_active(status).expect("valid status")); + } + + #[test] + fn chaos_experiment_active_rejects_unselected_experiment() { + let status = r#"{ + "status": { + "conditions": [ + {"type": "Selected", "status": "False"}, + {"type": "AllInjected", "status": "True"} + ] + } + }"#; + + assert!(!chaos_experiment_is_active(status).expect("valid status")); + } +} diff --git a/e2e/src/framework/checker.rs b/e2e/src/framework/checker.rs new file mode 100644 index 0000000..7f72b5f --- /dev/null +++ b/e2e/src/framework/checker.rs @@ -0,0 +1,248 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, ensure}; +use futures::{StreamExt, stream}; +use serde::{Deserialize, Serialize}; +use std::collections::{BTreeMap, BTreeSet}; + +use crate::framework::{ + history::{OperationKind, OperationOutcome, OperationRecord, Recorder}, + s3_workload::{ObjectSpec, S3WorkloadClient, sha256_hex}, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct CheckerReport { + pub scenario: String, + pub run_id: String, + pub committed_puts: usize, + pub missing_committed_objects: Vec, + pub hash_mismatches: Vec, + pub successful_corrupted_reads: Vec, + pub unknown_writes_materialized: Vec, + pub list_warnings: Vec, + pub tenant_recovered: bool, + pub passed: bool, +} + +impl CheckerReport { + pub fn require_success(&self) -> Result<()> { + ensure!( + self.passed, + "fault checker failed for scenario {} run {}: {}", + self.scenario, + self.run_id, + serde_json::to_string_pretty(self)? + ); + Ok(()) + } +} + +pub async fn check_s3_history( + s3: &S3WorkloadClient, + recorder: &Recorder, + tenant_recovered: bool, + concurrency: usize, +) -> Result { + let initial_records = recorder.records(); + let committed = committed_puts(&initial_records); + let unknown_writes = unknown_puts(&initial_records); + let mut report = CheckerReport { + scenario: recorder.scenario(), + run_id: recorder.run_id(), + committed_puts: committed.len(), + missing_committed_objects: Vec::new(), + hash_mismatches: Vec::new(), + successful_corrupted_reads: successful_corrupted_reads(&initial_records, &committed), + unknown_writes_materialized: Vec::new(), + list_warnings: Vec::new(), + tenant_recovered, + passed: false, + }; + + let mut committed_results = + stream::iter(committed.clone().into_iter().map(|(key, expected_hash)| { + let s3 = s3.clone(); + let recorder = recorder.clone(); + async move { + let body = s3.get_object(&key, &recorder).await?; + Ok::<_, anyhow::Error>((key, expected_hash, body)) + } + })) + .buffer_unordered(concurrency); + while let 
Some(result) = committed_results.next().await { + let (key, expected_hash, body) = result?; + match body { + Some(body) => { + let actual_hash = sha256_hex(&body); + if actual_hash != expected_hash { + report.hash_mismatches.push(format!( + "{key}: expected {expected_hash}, got {actual_hash}" + )); + } + } + None => report.missing_committed_objects.push(key), + } + } + + let mut unknown_results = + stream::iter(unknown_writes.into_iter().map(|(key, attempted_hash)| { + let s3 = s3.clone(); + let recorder = recorder.clone(); + async move { + let body = s3.get_object(&key, &recorder).await?; + Ok::<_, anyhow::Error>((key, attempted_hash, body)) + } + })) + .buffer_unordered(concurrency); + while let Some(result) = unknown_results.next().await { + let (key, attempted_hash, body) = result?; + if let Some(body) = body { + let actual_hash = sha256_hex(&body); + report.unknown_writes_materialized.push(format!( + "{key}: attempted {attempted_hash}, got {actual_hash}" + )); + } + } + + let run_id = recorder.run_id(); + let prefix = ObjectSpec::key_prefix(&run_id); + match s3.list_prefix(&prefix, recorder).await? 
{ + Some(keys) => { + let listed = keys.into_iter().collect::>(); + for key in committed.keys() { + if !listed.contains(key) { + report.list_warnings.push(format!( + "LIST prefix {prefix} did not include committed key {key}" + )); + } + } + } + None => report + .list_warnings + .push(format!("LIST prefix {prefix} did not complete")), + } + + report.missing_committed_objects.sort(); + report.hash_mismatches.sort(); + report.unknown_writes_materialized.sort(); + report.list_warnings.sort(); + report.passed = report.tenant_recovered + && report.missing_committed_objects.is_empty() + && report.hash_mismatches.is_empty() + && report.successful_corrupted_reads.is_empty() + && report.list_warnings.is_empty(); + + Ok(report) +} + +fn committed_puts(records: &[OperationRecord]) -> BTreeMap { + records + .iter() + .filter(|record| { + record.kind == OperationKind::Put && record.outcome == OperationOutcome::Ok + }) + .filter_map(|record| Some((record.key.clone()?, record.value_sha256.clone()?))) + .collect() +} + +fn unknown_puts(records: &[OperationRecord]) -> BTreeMap { + records + .iter() + .filter(|record| { + record.kind == OperationKind::Put + && matches!( + record.outcome, + OperationOutcome::Timeout | OperationOutcome::Unknown + ) + }) + .filter_map(|record| Some((record.key.clone()?, record.value_sha256.clone()?))) + .collect() +} + +fn successful_corrupted_reads( + records: &[OperationRecord], + committed: &BTreeMap, +) -> Vec { + records + .iter() + .filter(|record| { + record.kind == OperationKind::Get && record.outcome == OperationOutcome::Ok + }) + .filter_map(|record| { + let key = record.key.as_ref()?; + let expected_hash = committed.get(key)?; + let actual_hash = record.value_sha256.as_ref()?; + (expected_hash != actual_hash) + .then(|| format!("{key}: expected {expected_hash}, got {actual_hash}")) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::{CheckerReport, successful_corrupted_reads}; + use crate::framework::history::{OperationKind, 
OperationOutcome, OperationRecord}; + use std::collections::BTreeMap; + + fn record( + kind: OperationKind, + key: &str, + hash: &str, + outcome: OperationOutcome, + ) -> OperationRecord { + OperationRecord { + id: "op-1".to_string(), + scenario: "io-eio".to_string(), + kind, + bucket: "bucket".to_string(), + key: Some(key.to_string()), + value_sha256: Some(hash.to_string()), + size_bytes: Some(1), + started_at_ms: 1, + ended_at_ms: 2, + outcome, + http_status: Some(200), + error: None, + } + } + + #[test] + fn corrupted_successful_get_is_hard_failure_input() { + let records = vec![record(OperationKind::Get, "k", "bad", OperationOutcome::Ok)]; + let committed = BTreeMap::from([("k".to_string(), "good".to_string())]); + + let corrupted = successful_corrupted_reads(&records, &committed); + + assert_eq!(corrupted, vec!["k: expected good, got bad"]); + } + + #[test] + fn report_requires_clean_correctness_verdict() { + let report = CheckerReport { + scenario: "io-eio".to_string(), + run_id: "run-1".to_string(), + committed_puts: 1, + missing_committed_objects: Vec::new(), + hash_mismatches: Vec::new(), + successful_corrupted_reads: Vec::new(), + unknown_writes_materialized: Vec::new(), + list_warnings: Vec::new(), + tenant_recovered: true, + passed: true, + }; + + assert!(report.require_success().is_ok()); + } +} diff --git a/e2e/src/framework/config.rs b/e2e/src/framework/config.rs index c3c4cfe..c0c079f 100644 --- a/e2e/src/framework/config.rs +++ b/e2e/src/framework/config.rs @@ -13,6 +13,7 @@ // limitations under the License. 
use operator::types::v1alpha1::k8s::PodManagementPolicy; +use std::ops::{Deref, DerefMut}; use std::path::PathBuf; use std::time::Duration; @@ -23,25 +24,43 @@ pub const DEFAULT_CERT_MANAGER_VERSION: &str = "v1.16.2"; pub const KIND_WORKER_COUNT: usize = 3; #[derive(Debug, Clone)] -pub struct E2eConfig { - pub cluster_name: String, +pub struct ClusterTestConfig { pub context: String, pub operator_namespace: String, pub test_namespace_prefix: String, pub test_namespace: String, pub tenant_name: String, pub storage_class: String, + pub rustfs_image: String, + pub pod_management_policy: Option, + pub artifacts_dir: PathBuf, + pub timeout: Duration, +} + +#[derive(Debug, Clone)] +pub struct E2eConfig { + pub cluster: ClusterTestConfig, + pub cluster_name: String, pub pv_count: usize, pub operator_image: String, pub console_web_image: String, - pub rustfs_image: String, pub cert_manager_version: String, - pub pod_management_policy: Option, pub kind_config: PathBuf, - pub artifacts_dir: PathBuf, pub live_enabled: bool, - pub destructive_enabled: bool, - pub timeout: Duration, +} + +impl Deref for E2eConfig { + type Target = ClusterTestConfig; + + fn deref(&self) -> &Self::Target { + &self.cluster + } +} + +impl DerefMut for E2eConfig { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.cluster + } } impl E2eConfig { @@ -64,17 +83,30 @@ impl E2eConfig { let test_namespace = env_or(&get_env, "RUSTFS_E2E_NAMESPACE", &test_namespace_default); Self { + cluster: ClusterTestConfig { + context, + operator_namespace: env_or( + &get_env, + "RUSTFS_E2E_OPERATOR_NAMESPACE", + "rustfs-system", + ), + test_namespace_prefix, + test_namespace, + tenant_name: env_or(&get_env, "RUSTFS_E2E_TENANT", "e2e-tenant"), + storage_class: env_or(&get_env, "RUSTFS_E2E_STORAGE_CLASS", "local-storage"), + rustfs_image: env_or(&get_env, "RUSTFS_E2E_SERVER_IMAGE", DEFAULT_RUSTFS_IMAGE), + artifacts_dir: PathBuf::from(env_or( + &get_env, + "RUSTFS_E2E_ARTIFACTS", + "target/e2e/artifacts", + )), 
+ pod_management_policy: parse_pod_management_policy(&get_env), + timeout: Duration::from_secs(env_u64(&get_env, "RUSTFS_E2E_TIMEOUT_SECONDS", 300)), + }, cluster_name, - context, - operator_namespace: env_or(&get_env, "RUSTFS_E2E_OPERATOR_NAMESPACE", "rustfs-system"), - test_namespace_prefix, - test_namespace, - tenant_name: env_or(&get_env, "RUSTFS_E2E_TENANT", "e2e-tenant"), - storage_class: env_or(&get_env, "RUSTFS_E2E_STORAGE_CLASS", "local-storage"), pv_count: env_usize(&get_env, "RUSTFS_E2E_PV_COUNT", 12), operator_image: "rustfs/operator:e2e".to_string(), console_web_image: "rustfs/console-web:e2e".to_string(), - rustfs_image: env_or(&get_env, "RUSTFS_E2E_SERVER_IMAGE", DEFAULT_RUSTFS_IMAGE), cert_manager_version: env_or( &get_env, "RUSTFS_E2E_CERT_MANAGER_VERSION", @@ -85,15 +117,7 @@ impl E2eConfig { "RUSTFS_E2E_KIND_CONFIG", "e2e/manifests/kind-rustfs-e2e.yaml", )), - artifacts_dir: PathBuf::from(env_or( - &get_env, - "RUSTFS_E2E_ARTIFACTS", - "target/e2e/artifacts", - )), - pod_management_policy: parse_pod_management_policy(&get_env), live_enabled: env_bool(&get_env, "RUSTFS_E2E_LIVE"), - destructive_enabled: env_bool(&get_env, "RUSTFS_E2E_DESTRUCTIVE"), - timeout: Duration::from_secs(env_u64(&get_env, "RUSTFS_E2E_TIMEOUT_SECONDS", 300)), } } diff --git a/e2e/src/framework/fault_config.rs b/e2e/src/framework/fault_config.rs new file mode 100644 index 0000000..ab018d6 --- /dev/null +++ b/e2e/src/framework/fault_config.rs @@ -0,0 +1,430 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Context, Result, ensure}; +use serde_json::Value; +use std::path::PathBuf; +use std::time::Duration; + +use crate::framework::{command::CommandSpec, config::ClusterTestConfig, kubectl::Kubectl}; + +#[derive(Debug, Clone)] +pub struct FaultTestConfig { + pub cluster: ClusterTestConfig, + pub destructive_enabled: bool, + pub scenario: String, + pub duration: Duration, + pub percent: u8, + pub workload_objects: usize, + pub workload_concurrency: usize, + pub workload_seed: Option, + pub request_timeout: Duration, + pub use_cluster_ip: bool, + pub require_client_disruption: bool, + pub dm_name: Option, + pub dm_node: Option, + pub dm_mount_path: Option, + pub dm_fault_table: Option, + pub dm_recovery_table: Option, + pub dm_helper_image: String, + pub warp_duration: Duration, + pub chaos_namespace: String, +} + +impl FaultTestConfig { + pub fn from_env() -> Result { + let context = current_context()?; + Self::from_env_with(|name| std::env::var(name).ok(), context) + } + + fn from_env_with(get_env: F, context: String) -> Result + where + F: Fn(&str) -> Option, + { + ensure!( + !context.starts_with("kind-"), + "fault tests require a real Kubernetes cluster; current context {context:?} is a Kind context" + ); + + let storage_class = required_env(&get_env, "RUSTFS_FAULT_TEST_STORAGE_CLASS")?; + let namespace = env_or(&get_env, "RUSTFS_FAULT_TEST_NAMESPACE", "rustfs-fault-test"); + let scenario = env_or(&get_env, "RUSTFS_FAULT_TEST_SCENARIO", "io-eio"); + let default_percent = if scenario == "disk-full" { 100 } else { 20 }; + let cluster = ClusterTestConfig { + context, + operator_namespace: env_or( + &get_env, + "RUSTFS_FAULT_TEST_OPERATOR_NAMESPACE", + "rustfs-system", + ), + test_namespace_prefix: namespace.clone(), + test_namespace: namespace, + tenant_name: env_or(&get_env, "RUSTFS_FAULT_TEST_TENANT", "fault-test-tenant"), + storage_class, + 
rustfs_image: env_or( + &get_env, + "RUSTFS_FAULT_TEST_SERVER_IMAGE", + "rustfs/rustfs:latest", + ), + artifacts_dir: PathBuf::from(env_or( + &get_env, + "RUSTFS_FAULT_TEST_ARTIFACTS", + "target/fault-tests/artifacts", + )), + pod_management_policy: None, + timeout: Duration::from_secs(env_u64( + &get_env, + "RUSTFS_FAULT_TEST_TIMEOUT_SECONDS", + 300, + )), + }; + + Ok(Self { + cluster, + destructive_enabled: env_bool(&get_env, "RUSTFS_FAULT_TEST_DESTRUCTIVE"), + scenario, + duration: Duration::from_secs(env_u64( + &get_env, + "RUSTFS_FAULT_TEST_DURATION_SECONDS", + 7200, + )), + percent: env_u8(&get_env, "RUSTFS_FAULT_TEST_PERCENT", default_percent), + workload_objects: env_usize(&get_env, "RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS", 40000), + workload_concurrency: env_usize(&get_env, "RUSTFS_FAULT_TEST_WORKLOAD_CONCURRENCY", 80), + workload_seed: env_optional_u64(&get_env, "RUSTFS_FAULT_TEST_SEED")?, + request_timeout: Duration::from_secs(env_u64( + &get_env, + "RUSTFS_FAULT_TEST_REQUEST_TIMEOUT_SECONDS", + 30, + )), + use_cluster_ip: env_bool(&get_env, "RUSTFS_FAULT_TEST_USE_CLUSTER_IP"), + require_client_disruption: env_bool( + &get_env, + "RUSTFS_FAULT_TEST_REQUIRE_CLIENT_DISRUPTION", + ), + dm_name: env_optional(&get_env, "RUSTFS_FAULT_TEST_DM_NAME"), + dm_node: env_optional(&get_env, "RUSTFS_FAULT_TEST_DM_NODE"), + dm_mount_path: env_optional(&get_env, "RUSTFS_FAULT_TEST_DM_MOUNT_PATH"), + dm_fault_table: env_optional(&get_env, "RUSTFS_FAULT_TEST_DM_FAULT_TABLE"), + dm_recovery_table: env_optional(&get_env, "RUSTFS_FAULT_TEST_DM_RECOVERY_TABLE"), + dm_helper_image: env_or( + &get_env, + "RUSTFS_FAULT_TEST_DM_HELPER_IMAGE", + "rancher/mirrored-library-busybox:1.37.0", + ), + warp_duration: Duration::from_secs(env_u64( + &get_env, + "RUSTFS_FAULT_TEST_WARP_DURATION_SECONDS", + 60, + )), + chaos_namespace: env_or(&get_env, "RUSTFS_FAULT_TEST_CHAOS_NAMESPACE", "chaos-mesh"), + }) + } + + pub fn require_destructive_enabled(&self) -> Result<()> { + ensure!( + 
self.destructive_enabled, + "destructive fault tests are disabled; run through an e2e package fault Make target or set RUSTFS_FAULT_TEST_DESTRUCTIVE=1 explicitly" + ); + Ok(()) + } + + pub fn validate_cluster(&self, allow_static_storage: bool) -> Result<()> { + Kubectl::new(&self.cluster) + .command(["get", "crd", "tenants.rustfs.com"]) + .run_checked() + .context("RustFS Tenant CRD tenants.rustfs.com is required")?; + + let output = Kubectl::new(&self.cluster) + .command([ + "get", + "storageclass", + &self.cluster.storage_class, + "-o", + "json", + ]) + .run_checked() + .with_context(|| { + format!( + "fault-test StorageClass {:?} is required", + self.cluster.storage_class + ) + })?; + validate_storage_class(&output.stdout, allow_static_storage) + } + + #[cfg(test)] + pub(crate) fn for_test(context: &str, storage_class: &str) -> Self { + Self::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some(storage_class.to_string()), + _ => None, + }, + context.to_string(), + ) + .expect("fault test config") + } +} + +fn validate_storage_class(raw: &str, allow_static: bool) -> Result<()> { + let value = serde_json::from_str::(raw).context("parse StorageClass json")?; + let provisioner = value + .get("provisioner") + .and_then(Value::as_str) + .unwrap_or_default(); + ensure!( + !provisioner.is_empty(), + "StorageClass provisioner is missing" + ); + ensure!( + allow_static || provisioner != "kubernetes.io/no-provisioner", + "fault tests require a dynamically provisioned StorageClass unless the selected scenario explicitly requires dedicated static local PVs, got {provisioner}" + ); + Ok(()) +} + +fn current_context() -> Result { + let output = CommandSpec::new("kubectl") + .args(["config", "current-context"]) + .run_checked()?; + Ok(output.stdout.trim().to_string()) +} + +fn required_env(get_env: &F, name: &str) -> Result +where + F: Fn(&str) -> Option, +{ + let value = get_env(name).unwrap_or_default(); + ensure!(!value.trim().is_empty(), "{name} 
is required"); + Ok(value) +} + +fn env_or(get_env: &F, name: &str, default: &str) -> String +where + F: Fn(&str) -> Option, +{ + get_env(name).unwrap_or_else(|| default.to_string()) +} + +fn env_optional(get_env: &F, name: &str) -> Option +where + F: Fn(&str) -> Option, +{ + get_env(name).filter(|value| !value.trim().is_empty()) +} + +fn env_bool(get_env: &F, name: &str) -> bool +where + F: Fn(&str) -> Option, +{ + get_env(name) + .map(|value| matches!(value.as_str(), "1" | "true" | "TRUE" | "yes" | "YES")) + .unwrap_or(false) +} + +fn env_u64(get_env: &F, name: &str, default: u64) -> u64 +where + F: Fn(&str) -> Option, +{ + get_env(name) + .and_then(|value| value.parse::().ok()) + .unwrap_or(default) +} + +fn env_optional_u64(get_env: &F, name: &str) -> Result> +where + F: Fn(&str) -> Option, +{ + get_env(name) + .map(|value| { + value + .parse::() + .with_context(|| format!("{name} must be an unsigned 64-bit integer")) + }) + .transpose() +} + +fn env_usize(get_env: &F, name: &str, default: usize) -> usize +where + F: Fn(&str) -> Option, +{ + get_env(name) + .and_then(|value| value.parse::().ok()) + .unwrap_or(default) +} + +fn env_u8(get_env: &F, name: &str, default: u8) -> u8 +where + F: Fn(&str) -> Option, +{ + get_env(name) + .and_then(|value| value.parse::().ok()) + .unwrap_or(default) +} + +#[cfg(test)] +mod tests { + use super::{FaultTestConfig, validate_storage_class}; + + #[test] + fn real_cluster_fault_defaults_are_isolated() { + let config = FaultTestConfig::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some("fast-csi".to_string()), + _ => None, + }, + "production-test-cluster".to_string(), + ) + .expect("fault config"); + + assert_eq!(config.cluster.context, "production-test-cluster"); + assert_eq!(config.cluster.test_namespace, "rustfs-fault-test"); + assert_eq!(config.cluster.tenant_name, "fault-test-tenant"); + assert_eq!(config.cluster.storage_class, "fast-csi"); + assert_eq!( + config.cluster.artifacts_dir, + 
std::path::PathBuf::from("target/fault-tests/artifacts") + ); + assert_eq!(config.scenario, "io-eio"); + assert_eq!(config.duration, std::time::Duration::from_secs(7200)); + assert_eq!(config.percent, 20); + assert_eq!(config.workload_objects, 40000); + assert_eq!(config.workload_concurrency, 80); + assert_eq!(config.workload_seed, None); + assert_eq!(config.request_timeout, std::time::Duration::from_secs(30)); + assert!(!config.use_cluster_ip); + assert!(config.dm_name.is_none()); + assert!(config.dm_node.is_none()); + assert!(config.dm_mount_path.is_none()); + assert!(config.dm_fault_table.is_none()); + assert!(config.dm_recovery_table.is_none()); + assert_eq!( + config.dm_helper_image, + "rancher/mirrored-library-busybox:1.37.0" + ); + assert_eq!(config.warp_duration, std::time::Duration::from_secs(60)); + assert!(!config.destructive_enabled); + assert!(config.require_destructive_enabled().is_err()); + } + + #[test] + fn fault_scenario_env_overrides_are_parsed() { + let config = FaultTestConfig::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some("fast-csi".to_string()), + "RUSTFS_FAULT_TEST_SCENARIO" => Some("dm-flakey".to_string()), + "RUSTFS_FAULT_TEST_DURATION_SECONDS" => Some("45".to_string()), + "RUSTFS_FAULT_TEST_PERCENT" => Some("35".to_string()), + "RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS" => Some("64".to_string()), + "RUSTFS_FAULT_TEST_WORKLOAD_CONCURRENCY" => Some("8".to_string()), + "RUSTFS_FAULT_TEST_SEED" => Some("4242".to_string()), + "RUSTFS_FAULT_TEST_REQUEST_TIMEOUT_SECONDS" => Some("7".to_string()), + "RUSTFS_FAULT_TEST_USE_CLUSTER_IP" => Some("true".to_string()), + "RUSTFS_FAULT_TEST_REQUIRE_CLIENT_DISRUPTION" => Some("true".to_string()), + "RUSTFS_FAULT_TEST_DM_NAME" => Some("rustfs-test".to_string()), + "RUSTFS_FAULT_TEST_DM_NODE" => Some("worker-a".to_string()), + "RUSTFS_FAULT_TEST_DM_MOUNT_PATH" => { + Some("/data/rustfs-fault/dm-volume".to_string()) + } + "RUSTFS_FAULT_TEST_DM_FAULT_TABLE" => Some("0 1024 
error".to_string()), + "RUSTFS_FAULT_TEST_DM_RECOVERY_TABLE" => { + Some("0 1024 linear /dev/loop0 0".to_string()) + } + "RUSTFS_FAULT_TEST_WARP_DURATION_SECONDS" => Some("30".to_string()), + "RUSTFS_FAULT_TEST_DM_HELPER_IMAGE" => Some("busybox:test".to_string()), + _ => None, + }, + "production-test-cluster".to_string(), + ) + .expect("fault config"); + + assert_eq!(config.scenario, "dm-flakey"); + assert_eq!(config.duration, std::time::Duration::from_secs(45)); + assert_eq!(config.percent, 35); + assert_eq!(config.workload_objects, 64); + assert_eq!(config.workload_concurrency, 8); + assert_eq!(config.workload_seed, Some(4242)); + assert_eq!(config.request_timeout, std::time::Duration::from_secs(7)); + assert!(config.use_cluster_ip); + assert!(config.require_client_disruption); + assert_eq!(config.dm_name.as_deref(), Some("rustfs-test")); + assert_eq!(config.dm_node.as_deref(), Some("worker-a")); + assert_eq!( + config.dm_mount_path.as_deref(), + Some("/data/rustfs-fault/dm-volume") + ); + assert_eq!(config.dm_fault_table.as_deref(), Some("0 1024 error")); + assert_eq!( + config.dm_recovery_table.as_deref(), + Some("0 1024 linear /dev/loop0 0") + ); + assert_eq!(config.warp_duration, std::time::Duration::from_secs(30)); + assert_eq!(config.dm_helper_image, "busybox:test"); + } + + #[test] + fn kind_context_is_rejected_for_fault_tests() { + let result = FaultTestConfig::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some("local-storage".to_string()), + _ => None, + }, + "kind-rustfs-e2e".to_string(), + ); + + assert!(result.is_err()); + } + + #[test] + fn invalid_workload_seed_is_rejected() { + let result = FaultTestConfig::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some("fast-csi".to_string()), + "RUSTFS_FAULT_TEST_SEED" => Some("not-a-number".to_string()), + _ => None, + }, + "production-test-cluster".to_string(), + ); + + assert!(result.is_err()); + } + + #[test] + fn 
dynamic_storage_class_is_required() { + assert!(validate_storage_class(r#"{"provisioner":"ebs.csi.aws.com"}"#, false).is_ok()); + assert!( + validate_storage_class(r#"{"provisioner":"kubernetes.io/no-provisioner"}"#, false) + .is_err() + ); + assert!( + validate_storage_class(r#"{"provisioner":"kubernetes.io/no-provisioner"}"#, true) + .is_ok() + ); + } + + #[test] + fn disk_full_defaults_to_full_enospc_injection() { + let config = FaultTestConfig::from_env_with( + |name| match name { + "RUSTFS_FAULT_TEST_STORAGE_CLASS" => Some("fast-csi".to_string()), + "RUSTFS_FAULT_TEST_SCENARIO" => Some("disk-full".to_string()), + _ => None, + }, + "production-test-cluster".to_string(), + ) + .expect("fault config"); + + assert_eq!(config.percent, 100); + } +} diff --git a/e2e/src/framework/fault_scenarios.rs b/e2e/src/framework/fault_scenarios.rs new file mode 100644 index 0000000..7f83c03 --- /dev/null +++ b/e2e/src/framework/fault_scenarios.rs @@ -0,0 +1,332 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Result, ensure}; +use std::time::Duration; + +use crate::framework::fault_config::FaultTestConfig; + +pub const IO_EIO_SCENARIO: &str = "io-eio"; +pub const POD_KILL_ONE_SCENARIO: &str = "pod-kill-one"; +pub const NETWORK_PARTITION_ONE_SCENARIO: &str = "network-partition-one"; +pub const IO_READ_MISTAKE_SCENARIO: &str = "io-read-mistake"; +pub const DISK_FULL_SCENARIO: &str = "disk-full"; +pub const DM_FLAKEY_SCENARIO: &str = "dm-flakey"; +pub const WARP_UNDER_CHAOS_SCENARIO: &str = "warp-under-chaos"; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FaultScenarioStatus { + Executable, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FaultPriority { + P0, + P1, + P2, + P3, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FaultBackend { + ChaosMeshIoChaos, + ChaosMeshPodChaos, + ChaosMeshNetworkChaos, + DeviceMapper, + MinioWarpWithChaos, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FaultIsolation { + FreshTenant, + ReusableTenant, + DedicatedLinuxBlockDevice, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FaultScenarioSpec { + pub scenario: &'static str, + pub case_name: &'static str, + pub description: &'static str, + pub priority: FaultPriority, + pub backend: FaultBackend, + pub status: FaultScenarioStatus, + pub isolation: FaultIsolation, + pub boundary: &'static str, + pub ci_phase: &'static str, + pub target: &'static str, + pub validation: &'static str, + pub observability: &'static str, + pub conflict_domain: &'static str, +} + +pub const FAULT_SCENARIO_CATALOG: &[FaultScenarioSpec] = &[ + FaultScenarioSpec { + scenario: IO_EIO_SCENARIO, + case_name: "fault_io_eio_preserves_committed_objects", + description: "Inject Chaos Mesh IOChaos EIO into one RustFS data volume and verify committed S3 objects remain readable with matching hashes after recovery.", + priority: FaultPriority::P0, + backend: FaultBackend::ChaosMeshIoChaos, + status: FaultScenarioStatus::Executable, + 
isolation: FaultIsolation::FreshTenant, + boundary: "rustfs-workload/fault-injection", + ci_phase: "faults", + target: "one RustFS container data volume selected by tenant label and /data/rustfs0 path", + validation: "prefill succeeds before injection, mixed PUT/GET workload runs while IOChaos is active, committed PUTs are GET+sha256 verified after recovery, and successful GETs cannot return corrupt bytes", + observability: "history.jsonl, workload-summary.json, checker-report.json, chaos-manifest.yaml, chaos-describe*.txt, Kubernetes snapshot artifacts", + conflict_domain: "fresh Tenant/PVC/PV fixture and run-scoped IOChaos cleanup", + }, + FaultScenarioSpec { + scenario: POD_KILL_ONE_SCENARIO, + case_name: "fault_pod_kill_one_preserves_committed_objects", + description: "Inject Chaos Mesh PodChaos against one RustFS Pod and verify StatefulSet recovery preserves committed S3 objects.", + priority: FaultPriority::P0, + backend: FaultBackend::ChaosMeshPodChaos, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::ReusableTenant, + boundary: "rustfs-workload/pod-recovery", + ci_phase: "faults", + target: "one RustFS Pod selected by tenant label", + validation: "the killed Pod is recreated, Tenant returns Ready, committed PUTs remain readable with matching hashes, and failed or unknown operations are recorded without becoming correctness failures", + observability: "history.jsonl, workload-summary.json, checker-report.json, podchaos manifest/describe/yaml, Pod restart counts, current and previous RustFS logs", + conflict_domain: "run-scoped PodChaos resource and one target Pod; can reuse a ready Tenant after the prior scenario has cleaned up", + }, + FaultScenarioSpec { + scenario: NETWORK_PARTITION_ONE_SCENARIO, + case_name: "fault_network_partition_one_preserves_committed_objects", + description: "Inject Chaos Mesh NetworkChaos that partitions one RustFS Pod from its peers and verify recovery does not lose or corrupt committed objects.", + 
priority: FaultPriority::P1, + backend: FaultBackend::ChaosMeshNetworkChaos, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::ReusableTenant, + boundary: "rustfs-workload/network-partition", + ci_phase: "faults", + target: "one RustFS Pod selected by tenant label with peer traffic disrupted inside the e2e namespace", + validation: "network disruption is active during workload, successful reads never return wrong hashes, committed PUTs remain readable after heal, and Tenant recovers Ready", + observability: "history.jsonl, workload-summary.json, checker-report.json, networkchaos manifest/describe/yaml, endpoints, events, and RustFS logs", + conflict_domain: "run-scoped NetworkChaos resource; must not overlap with PodChaos or IOChaos in the same Tenant", + }, + FaultScenarioSpec { + scenario: IO_READ_MISTAKE_SCENARIO, + case_name: "fault_io_read_mistake_rejects_corrupt_reads", + description: "Inject Chaos Mesh IOChaos mistake on RustFS read paths and verify RustFS never returns corrupt object bytes as successful S3 reads.", + priority: FaultPriority::P1, + backend: FaultBackend::ChaosMeshIoChaos, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::FreshTenant, + boundary: "rustfs-workload/data-integrity", + ci_phase: "faults", + target: "one RustFS data volume read path selected by tenant label and /data/rustfs0 path", + validation: "successful GET responses must match the committed hash; RustFS may fail or repair reads but must not return wrong bytes with a successful status", + observability: "history.jsonl, checker-report.json with successful_corrupted_reads, iochaos manifest/describe/yaml, RustFS logs, events", + conflict_domain: "fresh Tenant/PVC/PV fixture and run-scoped IOChaos mistake resource", + }, + FaultScenarioSpec { + scenario: DISK_FULL_SCENARIO, + case_name: "fault_disk_full_preserves_committed_objects", + description: "Inject ENOSPC on writes to one RustFS data volume and verify committed objects survive 
storage pressure and recovery.", + priority: FaultPriority::P1, + backend: FaultBackend::ChaosMeshIoChaos, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::FreshTenant, + boundary: "rustfs-workload/storage-pressure", + ci_phase: "faults", + target: "one RustFS data volume selected by tenant label with WRITE operations returning ENOSPC", + validation: "new writes may fail with ENOSPC, but previously committed PUTs remain readable after IOChaos recovery", + observability: "history.jsonl, checker-report.json, fault-evidence.json, IOChaos manifest/status, events, RustFS logs", + conflict_domain: "fresh Tenant/PVC/PV fixture and run-scoped IOChaos cleanup without consuming node disk capacity", + }, + FaultScenarioSpec { + scenario: DM_FLAKEY_SCENARIO, + case_name: "fault_dm_flakey_preserves_committed_objects", + description: "Use a device-mapper flakey or error target for a dedicated test volume and verify RustFS handles block-device instability without data corruption.", + priority: FaultPriority::P3, + backend: FaultBackend::DeviceMapper, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::DedicatedLinuxBlockDevice, + boundary: "rustfs-workload/block-device-fault", + ci_phase: "faults", + target: "one dedicated Linux block-device-backed PV used only by the e2e Tenant", + validation: "committed objects remain readable after the device fault is removed, and successful reads never return corrupt bytes", + observability: "history.jsonl, checker-report.json, dmsetup table/status, kernel logs, PV mapping, events, RustFS logs", + conflict_domain: "dedicated Linux runner or lab host with an explicitly assigned block device; never part of shared test storage", + }, + FaultScenarioSpec { + scenario: WARP_UNDER_CHAOS_SCENARIO, + case_name: "fault_warp_under_chaos_reports_performance_separately", + description: "Run MinIO Warp during a selected chaos scenario while keeping performance output separate from the correctness verdict.", + 
priority: FaultPriority::P3, + backend: FaultBackend::MinioWarpWithChaos, + status: FaultScenarioStatus::Executable, + isolation: FaultIsolation::FreshTenant, + boundary: "rustfs-workload/performance-under-chaos", + ci_phase: "faults", + target: "RustFS S3 endpoint under an explicitly selected fault backend", + validation: "Warp throughput or latency changes are reported separately; correctness still comes only from history and checker reports", + observability: "warp report, history.jsonl, checker-report.json, selected chaos manifest/describe/yaml, RustFS logs", + conflict_domain: "performance-only run with isolated bucket prefix and no shared correctness threshold", + }, +]; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FaultScenario { + pub name: String, + pub case_name: &'static str, + pub duration: Duration, + pub percent: u8, + pub object_count: usize, +} + +impl FaultScenario { + pub fn from_config(config: &FaultTestConfig) -> Result { + let spec = scenario_spec(&config.scenario)?; + ensure!( + spec.status == FaultScenarioStatus::Executable, + "fault scenario {:?} is cataloged as {:?} but is not executable yet; case {}, backend {:?}, validation: {}", + config.scenario, + spec.status, + spec.case_name, + spec.backend, + spec.validation + ); + ensure!( + (1..=100).contains(&config.percent), + "RUSTFS_FAULT_TEST_PERCENT must be in 1..=100, got {}", + config.percent + ); + ensure!( + config.duration > Duration::ZERO, + "RUSTFS_FAULT_TEST_DURATION_SECONDS must be greater than zero" + ); + ensure!( + config.workload_objects >= 4, + "RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS must be at least 4" + ); + ensure!( + (1..=config.workload_objects).contains(&config.workload_concurrency), + "RUSTFS_FAULT_TEST_WORKLOAD_CONCURRENCY must be between 1 and RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS ({})", + config.workload_objects + ); + + Ok(Self { + name: spec.scenario.to_string(), + case_name: spec.case_name, + duration: config.duration, + percent: config.percent, + object_count: 
config.workload_objects, + }) + } + + pub fn prefill_count(&self) -> usize { + self.object_count / 2 + } + + pub fn mixed_workload_count(&self) -> usize { + self.object_count - self.prefill_count() + } +} + +pub fn scenario_catalog() -> &'static [FaultScenarioSpec] { + FAULT_SCENARIO_CATALOG +} + +pub fn scenario_spec(name: &str) -> Result<&'static FaultScenarioSpec> { + FAULT_SCENARIO_CATALOG + .iter() + .find(|scenario| scenario.scenario == name) + .ok_or_else(|| { + let supported = FAULT_SCENARIO_CATALOG + .iter() + .map(|scenario| scenario.scenario) + .collect::>() + .join(", "); + anyhow::anyhow!("unsupported fault scenario {name:?}; catalog contains: {supported}") + }) +} + +#[cfg(test)] +mod tests { + use super::{FaultScenario, FaultScenarioStatus, IO_EIO_SCENARIO, scenario_catalog}; + use crate::framework::fault_config::FaultTestConfig; + use std::time::Duration; + + #[test] + fn default_fault_scenario_is_io_eio_with_split_workload() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let scenario = FaultScenario::from_config(&config).expect("valid scenario"); + + assert_eq!(scenario.name, IO_EIO_SCENARIO); + assert_eq!( + scenario.case_name, + "fault_io_eio_preserves_committed_objects" + ); + assert_eq!(scenario.duration, Duration::from_secs(7200)); + assert_eq!(scenario.percent, 20); + assert_eq!(scenario.prefill_count(), 20000); + assert_eq!(scenario.mixed_workload_count(), 20000); + } + + #[test] + fn unsupported_fault_scenario_is_rejected() { + let mut config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + config.scenario = "operator-restart".to_string(); + + assert!(FaultScenario::from_config(&config).is_err()); + } + + #[test] + fn workload_concurrency_must_fit_the_object_count() { + let mut config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + config.workload_objects = 4; + config.workload_concurrency = 5; + + assert!(FaultScenario::from_config(&config).is_err()); + } + + #[test] + fn 
all_cataloged_fault_scenarios_are_executable() { + let mut config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + + for spec in scenario_catalog() { + config.scenario = spec.scenario.to_string(); + + assert_eq!(spec.status, FaultScenarioStatus::Executable); + assert!( + FaultScenario::from_config(&config).is_ok(), + "{} should be selectable through the real-cluster fault-test entrypoint", + spec.scenario + ); + } + + assert_eq!(scenario_catalog().len(), 7); + } + + #[test] + fn fault_scenario_catalog_has_unique_clear_and_observable_cases() { + let mut names = std::collections::HashSet::new(); + let mut case_names = std::collections::HashSet::new(); + + for scenario in scenario_catalog() { + assert!(names.insert(scenario.scenario)); + assert!(case_names.insert(scenario.case_name)); + assert!(!scenario.description.is_empty()); + assert!(!scenario.boundary.is_empty()); + assert!(!scenario.ci_phase.is_empty()); + assert!(!scenario.target.is_empty()); + assert!(!scenario.validation.is_empty()); + assert!(!scenario.observability.is_empty()); + assert!(!scenario.conflict_domain.is_empty()); + } + } +} diff --git a/e2e/src/framework/history.rs b/e2e/src/framework/history.rs new file mode 100644 index 0000000..99dc105 --- /dev/null +++ b/e2e/src/framework/history.rs @@ -0,0 +1,250 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::Result; +use serde::{Deserialize, Serialize}; +use std::fs::{self, File}; +use std::io::{BufWriter, Write}; +use std::path::PathBuf; +use std::sync::{Arc, Mutex, MutexGuard}; +use std::time::{SystemTime, UNIX_EPOCH}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OperationKind { + CreateBucket, + Put, + Get, + Head, + List, + Delete, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum OperationOutcome { + Ok, + Failed, + Timeout, + Unknown, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct OperationRecord { + pub id: String, + pub scenario: String, + pub kind: OperationKind, + pub bucket: String, + pub key: Option, + pub value_sha256: Option, + pub size_bytes: Option, + pub started_at_ms: u64, + pub ended_at_ms: u64, + pub outcome: OperationOutcome, + pub http_status: Option, + pub error: Option, +} + +#[derive(Debug, Clone)] +pub struct Recorder { + inner: Arc>, +} + +#[derive(Debug)] +struct RecorderState { + path: PathBuf, + scenario: String, + run_id: String, + next_id: usize, + records: Vec, + writer: BufWriter, +} + +impl Recorder { + pub fn create( + path: impl Into, + scenario: impl Into, + run_id: impl Into, + ) -> Result { + let path = path.into(); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + let writer = BufWriter::new(File::create(&path)?); + Ok(Self { + inner: Arc::new(Mutex::new(RecorderState { + path, + scenario: scenario.into(), + run_id: run_id.into(), + next_id: 1, + records: Vec::new(), + writer, + })), + }) + } + + pub fn begin( + &self, + kind: OperationKind, + bucket: impl Into, + key: Option, + value_sha256: Option, + size_bytes: Option, + ) -> OperationRecord { + let mut state = self.state(); + let id = format!("op-{:06}", state.next_id); + state.next_id += 1; + let started_at_ms = now_ms(); + + OperationRecord { + id, + 
/// Caps an error message so a single history record stays small.
///
/// Messages of at most `MAX_ERROR_LEN` bytes are returned unchanged. Longer
/// messages are cut to at most `MAX_ERROR_LEN` bytes and suffixed with `...`.
/// The cut never splits a UTF-8 character: if the byte limit falls inside a
/// multi-byte sequence, the cut moves back to the previous character boundary.
fn truncate_error(message: &str) -> String {
    const MAX_ERROR_LEN: usize = 300;

    if message.len() <= MAX_ERROR_LEN {
        return message.to_string();
    }

    // Slicing a `str` at a byte offset that is not a char boundary panics, and
    // error text from S3 clients or the kernel may contain non-ASCII bytes.
    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=MAX_ERROR_LEN)
        .rev()
        .find(|&index| message.is_char_boundary(index))
        .unwrap_or(0);
    format!("{}...", &message[..cut])
}
recorder + .finish(record, OperationOutcome::Ok, Some(200), None) + .expect("finish"); + + let content = std::fs::read_to_string(&path).expect("history"); + assert!(content.contains("\"scenario\":\"io-eio\"")); + assert!(content.contains("\"kind\":\"put\"")); + assert_eq!(recorder.records().len(), 1); + assert_eq!(recorder.path(), path); + } + + #[test] + fn recorder_assigns_unique_ids_across_concurrent_writers() { + let dir = tempfile::tempdir().expect("tempdir"); + let recorder = Recorder::create(dir.path().join("history.jsonl"), "io-eio", "run-1") + .expect("recorder"); + let writers = (0..8) + .map(|writer| { + let recorder = recorder.clone(); + std::thread::spawn(move || { + for operation in 0..25 { + let record = recorder.begin( + OperationKind::Put, + "bucket", + Some(format!("{writer}-{operation}")), + Some("hash".to_string()), + Some(4), + ); + recorder + .finish(record, OperationOutcome::Ok, Some(200), None) + .expect("finish"); + } + }) + }) + .collect::>(); + for writer in writers { + writer.join().expect("writer thread"); + } + + let records = recorder.records(); + let ids = records + .iter() + .map(|record| record.id.as_str()) + .collect::>(); + assert_eq!(records.len(), 200); + assert_eq!(ids.len(), 200); + } +} diff --git a/e2e/src/framework/host_faults.rs b/e2e/src/framework/host_faults.rs new file mode 100644 index 0000000..641420d --- /dev/null +++ b/e2e/src/framework/host_faults.rs @@ -0,0 +1,597 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +use anyhow::{Context, Result, ensure}; +use serde::Serialize; +use serde_json::Value; +use std::time::Duration; + +use crate::framework::{ + artifacts::ArtifactCollector, command::CommandOutput, command::CommandSpec, + config::ClusterTestConfig, kubectl::Kubectl, +}; + +const MANAGED_BY_LABEL: &str = "app.kubernetes.io/managed-by"; +const MANAGED_BY_VALUE: &str = "rustfs-operator-fault-test"; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct DmVolumeMapping { + pub node: String, + pub pod: String, + pub pvc: String, + pub pv: String, + pub mount_path: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct DmStatusSnapshot { + pub stage: String, + pub helper_pod: String, + pub mapping: DmVolumeMapping, + pub table: String, + pub status: String, +} + +#[derive(Debug)] +pub struct DmFlakeyGuard { + config: ClusterTestConfig, + helper_pod: String, + dm_name: String, + fault_table: String, + recovery_table: String, + mapping: DmVolumeMapping, + recovery_snapshot: Option, + restored: bool, +} + +#[derive(Debug)] +pub struct DmFlakeySpec<'a> { + pub node: &'a str, + pub mount_path: &'a str, + pub helper_image: &'a str, + pub name: &'a str, + pub fault_table: &'a str, + pub recovery_table: Option<&'a str>, + pub run_id: &'a str, +} + +pub fn apply_dm_flakey( + config: &ClusterTestConfig, + spec: &DmFlakeySpec<'_>, + collector: &ArtifactCollector, + case_name: &str, +) -> Result { + validate_dm_spec(spec)?; + let mapping = verify_dm_volume_mapping(config, spec.node, spec.mount_path)?; + let helper_pod = helper_pod_name(spec.run_id); + let manifest = dm_helper_manifest(config, &helper_pod, spec.node, spec.helper_image); + collector.write_text(case_name, "dm-helper-manifest.yaml", &manifest)?; + + let kubectl = Kubectl::new(config).namespaced(&config.test_namespace); + kubectl + .command([ + "delete", + "pod", + &helper_pod, + 
"--ignore-not-found", + "--wait=true", + ]) + .run_checked()?; + kubectl.create_yaml_command(manifest).run_checked()?; + + let mut guard = DmFlakeyGuard { + config: config.clone(), + helper_pod, + dm_name: spec.name.to_string(), + fault_table: spec.fault_table.to_string(), + recovery_table: String::new(), + mapping, + recovery_snapshot: None, + restored: false, + }; + guard.wait_helper_ready()?; + guard.verify_mount_source()?; + + let original_table = guard.dmsetup(["table", spec.name])?.stdout; + guard.recovery_table = spec + .recovery_table + .map(str::to_string) + .unwrap_or_else(|| original_table.trim().to_string()); + ensure!( + !guard.recovery_table.trim().is_empty(), + "dmsetup returned an empty recovery table for {:?}", + spec.name + ); + + guard.load_table(spec.fault_table, false)?; + let active = guard.snapshot("active")?; + ensure!( + active.table.split_whitespace().nth(2) == spec.fault_table.split_whitespace().nth(2), + "device-mapper target did not switch to the requested fault table; requested {:?}, active {:?}", + spec.fault_table, + active.table + ); + collector.write_text( + case_name, + "dm-flakey-active.json", + &serde_json::to_string_pretty(&active)?, + )?; + + Ok(guard) +} + +pub fn run_warp_mixed( + duration: Duration, + collector: &ArtifactCollector, + case_name: &str, + endpoint: &str, + bucket: &str, + access_key: &str, + secret_key: &str, +) -> Result<()> { + let host = endpoint + .strip_prefix("http://") + .or_else(|| endpoint.strip_prefix("https://")) + .unwrap_or(endpoint); + let duration = format!("{}s", duration.as_secs()); + let command = CommandSpec::new("warp").args([ + "mixed".to_string(), + format!("--host={host}"), + format!("--access-key={access_key}"), + format!("--secret-key={secret_key}"), + format!("--bucket={bucket}"), + format!("--duration={duration}"), + "--obj.size=4KiB".to_string(), + "--tls=false".to_string(), + "--autoterm".to_string(), + ]); + let output = command.run()?; + let display = command.display().replace( + 
&format!("--secret-key={secret_key}"), + "--secret-key=", + ); + collector.write_text( + case_name, + "warp-mixed.txt", + &format!( + "$ {}\nexit: {:?}\nstdout:\n{}\nstderr:\n{}", + display, output.code, output.stdout, output.stderr + ), + )?; + ensure!( + output.code == Some(0), + "warp mixed command failed with exit {:?}", + output.code + ); + Ok(()) +} + +impl DmFlakeyGuard { + pub fn ensure_active(&self, stage: &str) -> Result { + let snapshot = self.snapshot(stage)?; + ensure!( + snapshot.table.split_whitespace().nth(2) == self.fault_table.split_whitespace().nth(2), + "device-mapper target {:?} is no longer using the requested fault table at {stage}; expected {:?}, active {:?}", + self.dm_name, + self.fault_table, + snapshot.table + ); + Ok(snapshot) + } + + pub fn snapshot(&self, stage: &str) -> Result { + Ok(DmStatusSnapshot { + stage: stage.to_string(), + helper_pod: self.helper_pod.clone(), + mapping: self.mapping.clone(), + table: self.dmsetup(["table", self.dm_name.as_str()])?.stdout, + status: self.dmsetup(["status", self.dm_name.as_str()])?.stdout, + }) + } + + pub fn restore(&mut self) -> Result<()> { + let recovery_table = self.recovery_table.clone(); + self.load_table(&recovery_table, true)?; + self.recovery_snapshot = Some(self.snapshot("recovered")?); + self.delete_helper()?; + self.restored = true; + Ok(()) + } + + pub fn recovery_snapshot(&self) -> Option<&DmStatusSnapshot> { + self.recovery_snapshot.as_ref() + } + + fn wait_helper_ready(&self) -> Result<()> { + Kubectl::new(&self.config) + .namespaced(&self.config.test_namespace) + .command([ + "wait", + "--for=condition=Ready", + "pod", + &self.helper_pod, + "--timeout=60s", + ]) + .run_checked()?; + Ok(()) + } + + fn verify_mount_source(&self) -> Result<()> { + let source = self + .host_command([ + "/usr/bin/findmnt", + "-n", + "-o", + "SOURCE", + "--target", + self.mapping.mount_path.as_str(), + ])? 
+ .stdout; + let mapper = self + .host_command([ + "/usr/bin/readlink", + "-f", + &format!("/dev/mapper/{}", self.dm_name), + ])? + .stdout; + let source = source.trim(); + let canonical_source = self + .host_command(["/usr/bin/readlink", "-f", source])? + .stdout; + ensure!( + canonical_source.trim() == mapper.trim(), + "fault-test PV mount {:?} on node {:?} is backed by {:?}, not device-mapper target {:?}", + self.mapping.mount_path, + self.mapping.node, + source, + self.dm_name + ); + Ok(()) + } + + fn load_table(&self, table: &str, noflush: bool) -> Result<()> { + self.dmsetup(dm_suspend_args(&self.dm_name, noflush))?; + let load = self.dmsetup(["load", self.dm_name.as_str(), "--table", table]); + let resume = self.dmsetup(dm_resume_args(&self.dm_name)); + load?; + resume?; + Ok(()) + } + + fn dmsetup(&self, args: I) -> Result + where + I: IntoIterator, + S: Into, + { + let mut command = vec!["/usr/sbin/dmsetup".to_string()]; + command.extend(args.into_iter().map(Into::into)); + self.host_command(command) + } + + fn host_command(&self, args: I) -> Result + where + I: IntoIterator, + S: Into, + { + let mut command = vec![ + "exec".to_string(), + self.helper_pod.clone(), + "--".to_string(), + "chroot".to_string(), + "/host".to_string(), + ]; + command.extend(args.into_iter().map(Into::into)); + Kubectl::new(&self.config) + .namespaced(&self.config.test_namespace) + .command(command) + .run_checked() + } + + fn delete_helper(&self) -> Result<()> { + Kubectl::new(&self.config) + .namespaced(&self.config.test_namespace) + .command([ + "delete", + "pod", + &self.helper_pod, + "--ignore-not-found", + "--wait=true", + ]) + .run_checked()?; + Ok(()) + } +} + +impl Drop for DmFlakeyGuard { + fn drop(&mut self) { + if !self.restored { + let recovery_table = self.recovery_table.clone(); + if !recovery_table.is_empty() { + let _ = self.load_table(&recovery_table, true); + } + let _ = self.delete_helper(); + } + } +} + +fn validate_dm_spec(spec: &DmFlakeySpec<'_>) -> 
Result<()> { + ensure!( + !spec.node.is_empty() + && spec + .node + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '.' | '-')), + "RUSTFS_FAULT_TEST_DM_NODE must be a valid node name" + ); + ensure!( + spec.mount_path.starts_with('/') && spec.mount_path != "/", + "RUSTFS_FAULT_TEST_DM_MOUNT_PATH must be an absolute non-root path" + ); + ensure!( + !spec.mount_path.contains(['\n', '\r']), + "RUSTFS_FAULT_TEST_DM_MOUNT_PATH must not contain newlines" + ); + ensure!( + !spec.name.is_empty() + && spec + .name + .chars() + .all(|ch| ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-' | '+')), + "RUSTFS_FAULT_TEST_DM_NAME contains unsupported characters" + ); + ensure!( + !spec.fault_table.trim().is_empty(), + "RUSTFS_FAULT_TEST_DM_FAULT_TABLE is required" + ); + ensure!( + !spec.helper_image.trim().is_empty() + && !spec.helper_image.contains(['\n', '\r', ' ', '\t']), + "RUSTFS_FAULT_TEST_DM_HELPER_IMAGE must be a non-empty image reference" + ); + Ok(()) +} + +fn dm_resume_args(name: &str) -> [&str; 3] { + ["resume", "--noudevsync", name] +} + +fn dm_suspend_args(name: &str, noflush: bool) -> Vec<&str> { + if noflush { + vec!["suspend", "--noflush", name] + } else { + vec!["suspend", name] + } +} + +fn verify_dm_volume_mapping( + config: &ClusterTestConfig, + node: &str, + expected_mount_path: &str, +) -> Result { + let selector = format!("rustfs.tenant={}", config.tenant_name); + let pods = Kubectl::new(config) + .namespaced(&config.test_namespace) + .command(["get", "pod", "-l", &selector, "-o", "json"]) + .run_checked()?; + let pods = serde_json::from_str::(&pods.stdout).context("parse RustFS pod list")?; + let pod = pods + .pointer("/items") + .and_then(Value::as_array) + .and_then(|items| { + items + .iter() + .find(|item| item.pointer("/spec/nodeName").and_then(Value::as_str) == Some(node)) + }) + .with_context(|| format!("no RustFS fault-test Pod is scheduled on DM node {node:?}"))?; + let pod_name = pod + .pointer("/metadata/name") + 
.and_then(Value::as_str) + .context("DM target Pod is missing metadata.name")?; + let pvc = pod + .pointer("/spec/volumes") + .and_then(Value::as_array) + .and_then(|volumes| { + volumes.iter().find_map(|volume| { + volume + .pointer("/persistentVolumeClaim/claimName") + .and_then(Value::as_str) + }) + }) + .context("DM target Pod does not mount a PVC")?; + + let pvc_json = Kubectl::new(config) + .namespaced(&config.test_namespace) + .command(["get", "pvc", pvc, "-o", "json"]) + .run_checked()?; + let pvc_json = + serde_json::from_str::(&pvc_json.stdout).context("parse DM target PVC")?; + let pv = pvc_json + .pointer("/spec/volumeName") + .and_then(Value::as_str) + .context("DM target PVC is not bound")?; + + let pv_json = Kubectl::new(config) + .command(["get", "pv", pv, "-o", "json"]) + .run_checked()?; + let pv_json = serde_json::from_str::(&pv_json.stdout).context("parse DM target PV")?; + let local_path = pv_json + .pointer("/spec/local/path") + .and_then(Value::as_str) + .context("DM target PV is not a local PV")?; + ensure!( + local_path == expected_mount_path, + "DM target PV {pv:?} uses local path {local_path:?}, expected {expected_mount_path:?}" + ); + ensure!( + pv_targets_node(&pv_json, node), + "DM target PV {pv:?} node affinity does not target {node:?}" + ); + + Ok(DmVolumeMapping { + node: node.to_string(), + pod: pod_name.to_string(), + pvc: pvc.to_string(), + pv: pv.to_string(), + mount_path: local_path.to_string(), + }) +} + +fn pv_targets_node(pv: &Value, node: &str) -> bool { + pv.pointer("/spec/nodeAffinity/required/nodeSelectorTerms") + .and_then(Value::as_array) + .into_iter() + .flatten() + .filter_map(|term| term.get("matchExpressions").and_then(Value::as_array)) + .flatten() + .any(|expression| { + expression.get("key").and_then(Value::as_str) == Some("kubernetes.io/hostname") + && expression.get("operator").and_then(Value::as_str) == Some("In") + && expression + .get("values") + .and_then(Value::as_array) + .is_some_and(|values| 
values.iter().any(|value| value.as_str() == Some(node))) + }) +} + +fn helper_pod_name(run_id: &str) -> String { + let suffix = run_id + .chars() + .filter(|ch| ch.is_ascii_alphanumeric()) + .take(12) + .collect::() + .to_ascii_lowercase(); + format!("rustfs-fault-dm-helper-{suffix}") +} + +fn dm_helper_manifest(config: &ClusterTestConfig, name: &str, node: &str, image: &str) -> String { + format!( + r#"apiVersion: v1 +kind: Pod +metadata: + name: {name} + namespace: {namespace} + labels: + {managed_by_label}: {managed_by_value} +spec: + nodeName: {node} + hostPID: true + restartPolicy: Never + containers: + - name: host-tools + image: {image} + imagePullPolicy: IfNotPresent + command: ["sh", "-c", "trap : TERM INT; sleep 3600 & wait"] + securityContext: + privileged: true + volumeMounts: + - name: host-root + mountPath: /host + mountPropagation: HostToContainer + volumes: + - name: host-root + hostPath: + path: / + type: Directory +"#, + namespace = config.test_namespace, + managed_by_label = MANAGED_BY_LABEL, + managed_by_value = MANAGED_BY_VALUE, + ) +} + +#[cfg(test)] +mod tests { + use super::{ + DmFlakeySpec, dm_helper_manifest, dm_resume_args, dm_suspend_args, helper_pod_name, + pv_targets_node, validate_dm_spec, + }; + use crate::framework::fault_config::FaultTestConfig; + + #[test] + fn dm_helper_is_pinned_to_one_node_and_host_root() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let manifest = dm_helper_manifest( + &config.cluster, + "rustfs-fault-dm-helper-run123", + "worker-a", + "busybox:test", + ); + + assert!(manifest.contains("nodeName: worker-a")); + assert!(manifest.contains("privileged: true")); + assert!(manifest.contains("mountPath: /host")); + assert!(manifest.contains("path: /")); + assert!(manifest.contains("rustfs-operator-fault-test")); + } + + #[test] + fn dm_resume_disables_udev_synchronization() { + assert_eq!( + dm_resume_args("rustfs-fault-dm"), + ["resume", "--noudevsync", "rustfs-fault-dm"] + ); + } + + 
#[test] + fn dm_recovery_suspend_does_not_flush_faulting_io() { + assert_eq!( + dm_suspend_args("rustfs-fault-dm", true), + ["suspend", "--noflush", "rustfs-fault-dm"] + ); + assert_eq!( + dm_suspend_args("rustfs-fault-dm", false), + ["suspend", "rustfs-fault-dm"] + ); + } + + #[test] + fn dm_spec_rejects_unbounded_or_unsafe_targets() { + let valid = DmFlakeySpec { + node: "worker-a", + mount_path: "/data/rustfs-fault/dm-volume", + helper_image: "busybox:test", + name: "rustfs-fault-dm", + fault_table: "0 1024 flakey /dev/loop0 0 1 15", + recovery_table: None, + run_id: "run-123", + }; + assert!(validate_dm_spec(&valid).is_ok()); + + let root = DmFlakeySpec { + mount_path: "/", + ..valid + }; + assert!(validate_dm_spec(&root).is_err()); + } + + #[test] + fn dm_pv_affinity_must_match_target_node() { + let pv = serde_json::json!({ + "spec": {"nodeAffinity": {"required": {"nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "kubernetes.io/hostname", + "operator": "In", + "values": ["worker-a"] + }] + }]}}} + }); + + assert!(pv_targets_node(&pv, "worker-a")); + assert!(!pv_targets_node(&pv, "worker-b")); + assert_eq!( + helper_pod_name("run-ABC-123"), + "rustfs-fault-dm-helper-runabc123" + ); + } +} diff --git a/e2e/src/framework/kubectl.rs b/e2e/src/framework/kubectl.rs index 9ab45c3..102014a 100644 --- a/e2e/src/framework/kubectl.rs +++ b/e2e/src/framework/kubectl.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use crate::framework::{command::CommandSpec, config::E2eConfig}; +use crate::framework::{command::CommandSpec, config::ClusterTestConfig}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Kubectl { @@ -21,7 +21,7 @@ pub struct Kubectl { } impl Kubectl { - pub fn new(config: &E2eConfig) -> Self { + pub fn new(config: &ClusterTestConfig) -> Self { Self { context: config.context.clone(), namespace: None, @@ -50,6 +50,10 @@ impl Kubectl { pub fn apply_yaml_command(&self, yaml: impl Into) -> CommandSpec { self.command(["apply", "-f", "-"]).stdin(yaml) } + + pub fn create_yaml_command(&self, yaml: impl Into) -> CommandSpec { + self.command(["create", "-f", "-"]).stdin(yaml) + } } #[cfg(test)] @@ -78,4 +82,15 @@ mod tests { "kubectl --context kind-rustfs-e2e apply -f -" ); } + + #[test] + fn kubectl_create_yaml_uses_stdin_without_exposing_payload() { + let kubectl = Kubectl::new(&E2eConfig::defaults()); + let command = kubectl.create_yaml_command("kind: Namespace"); + + assert_eq!( + command.display(), + "kubectl --context kind-rustfs-e2e create -f -" + ); + } } diff --git a/e2e/src/framework/live.rs b/e2e/src/framework/live.rs index 4c2e9bd..e909cc4 100644 --- a/e2e/src/framework/live.rs +++ b/e2e/src/framework/live.rs @@ -24,14 +24,6 @@ pub fn require_live_enabled(config: &E2eConfig) -> Result<()> { Ok(()) } -pub fn require_destructive_enabled(config: &E2eConfig) -> Result<()> { - ensure!( - config.destructive_enabled, - "destructive e2e faults are disabled; set RUSTFS_E2E_DESTRUCTIVE=1 explicitly" - ); - Ok(()) -} - pub fn current_context() -> Result { let output = CommandSpec::new("kubectl") .args(["config", "current-context"]) @@ -51,14 +43,13 @@ pub fn ensure_dedicated_context(config: &E2eConfig) -> Result { #[cfg(test)] mod tests { - use super::{require_destructive_enabled, require_live_enabled}; + use super::require_live_enabled; use crate::framework::config::E2eConfig; #[test] - fn live_and_destructive_guards_are_disabled_by_default() { + fn 
live_guard_is_disabled_by_default() { let config = E2eConfig::defaults(); assert!(require_live_enabled(&config).is_err()); - assert!(require_destructive_enabled(&config).is_err()); } } diff --git a/e2e/src/framework/mod.rs b/e2e/src/framework/mod.rs index d7d557d..de2f612 100644 --- a/e2e/src/framework/mod.rs +++ b/e2e/src/framework/mod.rs @@ -15,10 +15,16 @@ pub mod artifacts; pub mod assertions; pub mod cert_manager_tls; +pub mod chaos_mesh; +pub mod checker; pub mod command; pub mod config; pub mod console_client; pub mod deploy; +pub mod fault_config; +pub mod fault_scenarios; +pub mod history; +pub mod host_faults; pub mod images; pub mod kind; pub mod kube_client; @@ -26,9 +32,10 @@ pub mod kubectl; pub mod live; pub mod port_forward; pub mod resources; +pub mod s3_workload; pub mod storage; pub mod tenant_factory; pub mod tools; pub mod wait; -pub use config::E2eConfig; +pub use config::{ClusterTestConfig, E2eConfig}; diff --git a/e2e/src/framework/port_forward.rs b/e2e/src/framework/port_forward.rs index d751ec2..07aecdc 100644 --- a/e2e/src/framework/port_forward.rs +++ b/e2e/src/framework/port_forward.rs @@ -18,7 +18,7 @@ use std::path::{Path, PathBuf}; use std::process::Child; use uuid::Uuid; -use crate::framework::{command::CommandSpec, config::E2eConfig, kubectl::Kubectl}; +use crate::framework::{command::CommandSpec, config::ClusterTestConfig, kubectl::Kubectl}; #[derive(Debug, Clone, PartialEq, Eq)] pub struct PortForwardSpec { @@ -96,17 +96,17 @@ impl PortForwardSpec { format!("http://127.0.0.1:{}", self.local_port) } - pub fn start_console(config: &E2eConfig) -> Result { + pub fn start_console(config: &ClusterTestConfig) -> Result { let kubectl = Kubectl::new(config); Self::console(&config.operator_namespace).start_with_temp_log(&kubectl) } - pub fn start_operator_sts(config: &E2eConfig) -> Result { + pub fn start_operator_sts(config: &ClusterTestConfig) -> Result { let kubectl = Kubectl::new(config); 
Self::operator_sts(&config.operator_namespace).start_with_temp_log(&kubectl) } - pub fn start_tenant_io(config: &E2eConfig) -> Result { + pub fn start_tenant_io(config: &ClusterTestConfig) -> Result { let kubectl = Kubectl::new(config); Self::tenant_io(&config.test_namespace, &config.tenant_name).start_with_temp_log(&kubectl) } diff --git a/e2e/src/framework/resources.rs b/e2e/src/framework/resources.rs index c6b36bb..40b51a2 100644 --- a/e2e/src/framework/resources.rs +++ b/e2e/src/framework/resources.rs @@ -12,27 +12,35 @@ // See the License for the specific language governing permissions and // limitations under the License. -use anyhow::{Context, Result, bail}; +use anyhow::{Context, Result, bail, ensure}; +use serde_json::Value; use std::thread::sleep; use std::time::{Duration, Instant}; use crate::framework::{ command::{CommandOutput, CommandSpec}, - config::E2eConfig, + config::ClusterTestConfig, kubectl::Kubectl, tenant_factory::TenantTemplate, }; use operator::types::v1alpha1::k8s::PodManagementPolicy; -const E2E_ACCESS_KEY: &str = "e2eaccess"; -const E2E_SECRET_KEY: &str = "e2esecret"; +const TEST_ACCESS_KEY: &str = "testaccess"; +const TEST_SECRET_KEY: &str = "testsecret"; const RESOURCE_RESET_TIMEOUT: Duration = Duration::from_secs(120); const RESOURCE_RESET_POLL_INTERVAL: Duration = Duration::from_secs(2); +const MANAGED_BY_LABEL: &str = "app.kubernetes.io/managed-by"; +const FAULT_TEST_MANAGER: &str = "rustfs-operator-fault-test"; +const FAULT_TEST_TENANT_ANNOTATION: &str = "rustfs.com/fault-test-tenant"; -pub fn credential_secret_name(config: &E2eConfig) -> String { +pub fn credential_secret_name(config: &ClusterTestConfig) -> String { format!("{}-credentials", config.tenant_name) } +pub fn test_credentials() -> (&'static str, &'static str) { + (TEST_ACCESS_KEY, TEST_SECRET_KEY) +} + pub fn namespace_manifest(namespace: &str) -> String { format!( r#"apiVersion: v1 @@ -43,7 +51,26 @@ metadata: ) } -pub fn credential_secret_manifest(config: &E2eConfig) 
-> String { +pub fn fault_namespace_manifest(config: &ClusterTestConfig) -> String { + format!( + r#"apiVersion: v1 +kind: Namespace +metadata: + name: {namespace} + labels: + {managed_by_label}: {manager} + annotations: + {tenant_annotation}: {tenant_name} +"#, + namespace = config.test_namespace, + managed_by_label = MANAGED_BY_LABEL, + manager = FAULT_TEST_MANAGER, + tenant_annotation = FAULT_TEST_TENANT_ANNOTATION, + tenant_name = config.tenant_name, + ) +} + +pub fn credential_secret_manifest(config: &ClusterTestConfig) -> String { format!( r#"apiVersion: v1 kind: Secret @@ -57,12 +84,12 @@ stringData: "#, secret_name = credential_secret_name(config), namespace = config.test_namespace, - access_key = E2E_ACCESS_KEY, - secret_key = E2E_SECRET_KEY + access_key = TEST_ACCESS_KEY, + secret_key = TEST_SECRET_KEY ) } -pub fn smoke_tenant_template(config: &E2eConfig) -> TenantTemplate { +pub fn smoke_tenant_template(config: &ClusterTestConfig) -> TenantTemplate { let mut template = TenantTemplate::kind_local( &config.test_namespace, &config.tenant_name, @@ -81,13 +108,24 @@ pub fn smoke_tenant_template(config: &E2eConfig) -> TenantTemplate { template } -pub fn smoke_tenant_manifest(config: &E2eConfig) -> Result { +pub fn smoke_tenant_manifest(config: &ClusterTestConfig) -> Result { Ok(serde_yaml_ng::to_string( &smoke_tenant_template(config).build(), )?) } -pub fn apply_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { +pub fn fault_tenant_manifest(config: &ClusterTestConfig) -> Result { + let template = TenantTemplate::real_cluster( + &config.test_namespace, + &config.tenant_name, + &config.rustfs_image, + &config.storage_class, + credential_secret_name(config), + ); + Ok(serde_yaml_ng::to_string(&template.build())?) 
+} + +pub fn apply_smoke_tenant_resources(config: &ClusterTestConfig) -> Result<()> { let kubectl = Kubectl::new(config); kubectl .apply_yaml_command(namespace_manifest(&config.test_namespace)) @@ -101,12 +139,41 @@ pub fn apply_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { Ok(()) } -pub fn reset_and_apply_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { - reset_smoke_tenant_resources(config)?; +pub fn apply_fault_tenant_resources(config: &ClusterTestConfig) -> Result<()> { + let kubectl = Kubectl::new(config); + if !ensure_fault_namespace_owned_or_absent(config)? { + kubectl + .create_yaml_command(fault_namespace_manifest(config)) + .run_checked() + .with_context(|| { + format!( + "create dedicated fault-test namespace {:?}", + config.test_namespace + ) + })?; + } + kubectl + .apply_yaml_command(credential_secret_manifest(config)) + .run_checked()?; + kubectl + .apply_yaml_command(fault_tenant_manifest(config)?) + .run_checked()?; + Ok(()) +} + +pub fn reset_fault_tenant_resources(config: &ClusterTestConfig) -> Result<()> { + if !ensure_fault_namespace_owned_or_absent(config)? { + return Ok(()); + } + reset_tenant_resources(config) +} + +pub fn reset_and_apply_smoke_tenant_resources(config: &ClusterTestConfig) -> Result<()> { + reset_tenant_resources(config)?; apply_smoke_tenant_resources(config) } -pub fn reset_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { +pub fn reset_tenant_resources(config: &ClusterTestConfig) -> Result<()> { let kubectl = Kubectl::new(config); if !namespace_exists(&kubectl, &config.test_namespace)? 
{ return Ok(()); @@ -169,7 +236,7 @@ pub fn reset_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { Ok(()) } -pub fn cleanup_smoke_tenant_resources(config: &E2eConfig) -> Result<()> { +pub fn cleanup_tenant_resources(config: &ClusterTestConfig) -> Result<()> { let kubectl = Kubectl::new(config).namespaced(&config.test_namespace); let selector = format!("rustfs.tenant={}", config.tenant_name); @@ -219,6 +286,52 @@ fn namespace_exists(kubectl: &Kubectl, namespace: &str) -> Result { Ok(output.code == Some(0)) } +fn ensure_fault_namespace_owned_or_absent(config: &ClusterTestConfig) -> Result { + let output = Kubectl::new(config) + .command(["get", "namespace", &config.test_namespace, "-o", "json"]) + .run()?; + + match output.code { + Some(0) => { + validate_fault_namespace_ownership( + &output.stdout, + &config.test_namespace, + &config.tenant_name, + )?; + Ok(true) + } + _ if is_not_found(&output) => Ok(false), + _ => bail!( + "failed to inspect fault-test namespace {:?} before destructive operation\nexit: {:?}\nstdout:\n{}\nstderr:\n{}", + config.test_namespace, + output.code, + output.stdout, + output.stderr + ), + } +} + +fn validate_fault_namespace_ownership(raw: &str, namespace: &str, tenant_name: &str) -> Result<()> { + let value = serde_json::from_str::(raw) + .with_context(|| format!("parse namespace {namespace:?} json"))?; + let manager = value + .pointer("/metadata/labels/app.kubernetes.io~1managed-by") + .and_then(Value::as_str); + let owned_tenant = value + .pointer("/metadata/annotations/rustfs.com~1fault-test-tenant") + .and_then(Value::as_str); + + ensure!( + manager == Some(FAULT_TEST_MANAGER) && owned_tenant == Some(tenant_name), + "refusing destructive fault-test operation in namespace {namespace:?}: expected label \ + {MANAGED_BY_LABEL}={FAULT_TEST_MANAGER:?} and annotation \ + {FAULT_TEST_TENANT_ANNOTATION}={tenant_name:?}, got manager={manager:?}, \ + tenant={owned_tenant:?}; use a dedicated namespace or explicitly label and annotate it 
\ + only after verifying that it contains no non-test workloads" + ); + Ok(()) +} + fn run_delete(command: CommandSpec) -> Result<()> { command.run_checked()?; Ok(()) @@ -300,8 +413,12 @@ fn is_not_found(output: &CommandOutput) -> bool { #[cfg(test)] mod tests { - use super::{credential_secret_manifest, credential_secret_name, smoke_tenant_manifest}; + use super::{ + credential_secret_manifest, credential_secret_name, fault_namespace_manifest, + fault_tenant_manifest, smoke_tenant_manifest, validate_fault_namespace_ownership, + }; use crate::framework::config::E2eConfig; + use crate::framework::fault_config::FaultTestConfig; #[test] fn smoke_tenant_manifest_wires_secret_storage_and_image() { @@ -325,4 +442,55 @@ mod tests { assert!(manifest.contains("accesskey:")); assert!(manifest.contains("secretkey:")); } + + #[test] + fn fault_tenant_manifest_uses_real_cluster_defaults() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let manifest = fault_tenant_manifest(&config.cluster).expect("fault tenant manifest"); + + assert!(manifest.contains("namespace: rustfs-fault-test")); + assert!(manifest.contains("storageClassName: fast-csi")); + assert!(manifest.contains("storage: 100Gi")); + assert!(!manifest.contains("rustfs-storage")); + assert!(!manifest.contains("RUSTFS_UNSAFE_BYPASS_DISK_CHECK")); + } + + #[test] + fn fault_namespace_manifest_records_destructive_test_ownership() { + let config = FaultTestConfig::for_test("real-cluster", "fast-csi"); + let manifest = fault_namespace_manifest(&config.cluster); + + assert!(manifest.contains("name: rustfs-fault-test")); + assert!(manifest.contains("app.kubernetes.io/managed-by: rustfs-operator-fault-test")); + assert!(manifest.contains("rustfs.com/fault-test-tenant: fault-test-tenant")); + } + + #[test] + fn fault_namespace_ownership_requires_matching_manager_and_tenant() { + let owned = r#"{ + "metadata": { + "labels": { + "app.kubernetes.io/managed-by": "rustfs-operator-fault-test" + }, + 
"annotations": { + "rustfs.com/fault-test-tenant": "fault-test-tenant" + } + } + }"#; + assert!( + validate_fault_namespace_ownership(owned, "rustfs-fault-test", "fault-test-tenant") + .is_ok() + ); + + let unowned = r#"{"metadata":{"labels":{},"annotations":{}}}"#; + assert!( + validate_fault_namespace_ownership(unowned, "rustfs-fault-test", "fault-test-tenant") + .is_err() + ); + + assert!( + validate_fault_namespace_ownership(owned, "rustfs-fault-test", "another-tenant") + .is_err() + ); + } } diff --git a/e2e/src/framework/s3_workload.rs b/e2e/src/framework/s3_workload.rs new file mode 100644 index 0000000..3e8d0a8 --- /dev/null +++ b/e2e/src/framework/s3_workload.rs @@ -0,0 +1,611 @@ +// Copyright 2025 RustFS Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use anyhow::{Context, Result}; +use aws_config::BehaviorVersion; +use aws_credential_types::Credentials; +use aws_sdk_s3::{Client, config::Region, error::SdkError, primitives::ByteStream}; +use serde::Serialize; +use sha2::{Digest, Sha256}; +use std::time::Duration; +use tokio::time::timeout; + +use crate::framework::history::{OperationKind, OperationOutcome, Recorder}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ObjectSpec { + pub key: String, + pub size_bytes: usize, + pub sha256: String, + seed: u64, + index: usize, +} + +#[derive(Debug)] +pub struct PreparedObject { + pub spec: ObjectSpec, + body: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct WorkloadSizeClass { + pub size_bytes: usize, + pub object_count: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +pub struct WorkloadPlan { + pub seed: u64, + pub generator: &'static str, + pub object_count: usize, + pub concurrency: usize, + pub total_payload_bytes: u64, + pub size_distribution: Vec, + #[serde(skip)] + sizes: Vec, +} + +#[derive(Clone)] +pub struct S3WorkloadClient { + client: Client, + bucket: String, + request_timeout: Duration, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct GetObjectResult { + pub outcome: OperationOutcome, + pub body: Option>, +} + +impl ObjectSpec { + pub fn key_prefix(run_id: &str) -> String { + format!("fault-test/{run_id}/") + } + + pub fn prepare_seeded( + run_id: &str, + index: usize, + size_bytes: usize, + seed: u64, + ) -> PreparedObject { + let key = format!("{}object-{index:06}", Self::key_prefix(run_id)); + let body = seeded_bytes(seed, index, size_bytes); + let sha256 = sha256_hex(&body); + + PreparedObject { + spec: Self { + key, + size_bytes, + sha256, + seed, + index, + }, + body, + } + } + + pub fn prepare(&self) -> PreparedObject { + let body = seeded_bytes(self.seed, self.index, self.size_bytes); + debug_assert_eq!(sha256_hex(&body), self.sha256); + PreparedObject { + spec: self.clone(), + body, + } + } 
+} + +impl WorkloadPlan { + pub fn seeded(seed: u64, object_count: usize, concurrency: usize) -> Self { + const SIZE_CLASSES: &[(usize, usize)] = &[ + (4 * 1024, 85), + (16 * 1024, 10), + (8 * 1024 * 1024, 4), + (16 * 1024 * 1024, 1), + ]; + + let mut sizes = Vec::with_capacity(object_count); + let mut size_distribution = Vec::with_capacity(SIZE_CLASSES.len()); + let mut assigned = 0; + for (position, (size_bytes, weight)) in SIZE_CLASSES.iter().copied().enumerate() { + let count = if position + 1 == SIZE_CLASSES.len() { + object_count.saturating_sub(assigned) + } else { + object_count.saturating_mul(weight) / 100 + }; + sizes.extend(std::iter::repeat_n(size_bytes, count)); + size_distribution.push(WorkloadSizeClass { + size_bytes, + object_count: count, + }); + assigned += count; + } + + shuffle_sizes(&mut sizes, seed); + let total_payload_bytes = sizes.iter().map(|size| *size as u64).sum(); + Self { + seed, + generator: "splitmix64-v1", + object_count, + concurrency, + total_payload_bytes, + size_distribution, + sizes, + } + } + + pub fn size_at(&self, index: usize) -> usize { + self.sizes[index] + } +} + +impl S3WorkloadClient { + pub async fn new( + endpoint: impl Into, + bucket: impl Into, + access_key: impl Into, + secret_key: impl Into, + request_timeout: Duration, + ) -> Result { + let credentials = Credentials::new( + access_key.into(), + secret_key.into(), + None, + None, + "rustfs-fault-test-static-credentials", + ); + let shared_config = aws_config::defaults(BehaviorVersion::latest()) + .region(Region::new("us-east-1")) + .credentials_provider(credentials) + .endpoint_url(endpoint.into()) + .load() + .await; + let s3_config = aws_sdk_s3::config::Builder::from(&shared_config) + .force_path_style(true) + .build(); + + Ok(Self { + client: Client::from_conf(s3_config), + bucket: bucket.into(), + request_timeout, + }) + } + + pub async fn create_bucket(&self, recorder: &Recorder) -> Result { + let record = recorder.begin( + OperationKind::CreateBucket, + 
self.bucket.clone(), + None, + None, + None, + ); + let result = timeout( + self.request_timeout, + self.client.create_bucket().bucket(&self.bucket).send(), + ) + .await; + + match result { + Ok(Ok(_)) => { + recorder.finish(record, OperationOutcome::Ok, Some(200), None)?; + Ok(OperationOutcome::Ok) + } + Ok(Err(error)) => { + let outcome = classify_sdk_error(&error); + recorder.finish( + record, + outcome, + sdk_error_status(&error), + Some(format!("create bucket failed: {error}")), + )?; + Ok(outcome) + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + None, + Some("create bucket timed out".to_string()), + )?; + Ok(OperationOutcome::Timeout) + } + } + } + + pub async fn put_object( + &self, + object: &PreparedObject, + recorder: &Recorder, + ) -> Result { + let spec = &object.spec; + let record = recorder.begin( + OperationKind::Put, + self.bucket.clone(), + Some(spec.key.clone()), + Some(spec.sha256.clone()), + Some(spec.size_bytes), + ); + let result = timeout( + self.request_timeout, + self.client + .put_object() + .bucket(&self.bucket) + .key(&spec.key) + .body(ByteStream::from(object.body.clone())) + .send(), + ) + .await; + + match result { + Ok(Ok(_)) => { + recorder.finish(record, OperationOutcome::Ok, Some(200), None)?; + Ok(OperationOutcome::Ok) + } + Ok(Err(error)) => { + let outcome = classify_sdk_error(&error); + recorder.finish( + record, + outcome, + sdk_error_status(&error), + Some(format!("put object failed: {error}")), + )?; + Ok(outcome) + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + None, + Some("put object timed out".to_string()), + )?; + Ok(OperationOutcome::Timeout) + } + } + } + + pub async fn get_object(&self, key: &str, recorder: &Recorder) -> Result>> { + Ok(self.get_object_result(key, recorder).await?.body) + } + + pub async fn get_object_result( + &self, + key: &str, + recorder: &Recorder, + ) -> Result { + let record = recorder.begin( + OperationKind::Get, + 
self.bucket.clone(), + Some(key.to_string()), + None, + None, + ); + let response = timeout( + self.request_timeout, + self.client + .get_object() + .bucket(&self.bucket) + .key(key) + .send(), + ) + .await; + + let output = match response { + Ok(Ok(output)) => output, + Ok(Err(error)) => { + let outcome = classify_sdk_error(&error); + recorder.finish( + record, + outcome, + sdk_error_status(&error), + Some(format!("get object failed: {error}")), + )?; + return Ok(GetObjectResult { + outcome, + body: None, + }); + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + None, + Some("get object timed out".to_string()), + )?; + return Ok(GetObjectResult { + outcome: OperationOutcome::Timeout, + body: None, + }); + } + }; + + let body = timeout(self.request_timeout, output.body.collect()).await; + match body { + Ok(Ok(bytes)) => { + let body = bytes.into_bytes().to_vec(); + let mut record = record; + record.value_sha256 = Some(sha256_hex(&body)); + record.size_bytes = Some(body.len()); + recorder.finish(record, OperationOutcome::Ok, Some(200), None)?; + Ok(GetObjectResult { + outcome: OperationOutcome::Ok, + body: Some(body), + }) + } + Ok(Err(error)) => { + recorder.finish( + record, + OperationOutcome::Unknown, + Some(200), + Some(format!("get body read failed: {error}")), + )?; + Ok(GetObjectResult { + outcome: OperationOutcome::Unknown, + body: None, + }) + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + Some(200), + Some("get body read timed out".to_string()), + )?; + Ok(GetObjectResult { + outcome: OperationOutcome::Timeout, + body: None, + }) + } + } + } + + pub async fn head_object(&self, key: &str, recorder: &Recorder) -> Result { + let record = recorder.begin( + OperationKind::Head, + self.bucket.clone(), + Some(key.to_string()), + None, + None, + ); + let result = timeout( + self.request_timeout, + self.client + .head_object() + .bucket(&self.bucket) + .key(key) + .send(), + ) + .await; + + match result { + 
Ok(Ok(_)) => { + recorder.finish(record, OperationOutcome::Ok, Some(200), None)?; + Ok(OperationOutcome::Ok) + } + Ok(Err(error)) => { + let outcome = classify_sdk_error(&error); + recorder.finish( + record, + outcome, + sdk_error_status(&error), + Some(format!("head object failed: {error}")), + )?; + Ok(outcome) + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + None, + Some("head object timed out".to_string()), + )?; + Ok(OperationOutcome::Timeout) + } + } + } + + pub async fn list_prefix( + &self, + prefix: &str, + recorder: &Recorder, + ) -> Result>> { + let record = recorder.begin( + OperationKind::List, + self.bucket.clone(), + Some(prefix.to_string()), + None, + None, + ); + let mut keys = Vec::new(); + let mut continuation_token = None; + loop { + let mut request = self + .client + .list_objects_v2() + .bucket(&self.bucket) + .prefix(prefix); + if let Some(token) = continuation_token.as_deref() { + request = request.continuation_token(token); + } + let response = timeout(self.request_timeout, request.send()).await; + let output = match response { + Ok(Ok(output)) => output, + Ok(Err(error)) => { + let outcome = classify_sdk_error(&error); + recorder.finish( + record, + outcome, + sdk_error_status(&error), + Some(format!("list prefix failed: {error}")), + )?; + return Ok(None); + } + Err(_) => { + recorder.finish( + record, + OperationOutcome::Timeout, + None, + Some("list prefix timed out".to_string()), + )?; + return Ok(None); + } + }; + keys.extend( + output + .contents() + .iter() + .filter_map(|object| object.key().map(str::to_string)), + ); + if !output.is_truncated().unwrap_or(false) { + break; + } + continuation_token = output.next_continuation_token().map(str::to_string); + if continuation_token.is_none() { + recorder.finish( + record, + OperationOutcome::Unknown, + Some(200), + Some("truncated LIST response omitted continuation token".to_string()), + )?; + return Ok(None); + } + } + + let mut record = record; + 
record.size_bytes = Some(keys.len()); + recorder.finish(record, OperationOutcome::Ok, Some(200), None)?; + Ok(Some(keys)) + } +} + +pub fn sha256_hex(body: &[u8]) -> String { + let mut hasher = Sha256::new(); + hasher.update(body); + hex::encode(hasher.finalize()) +} + +pub async fn wait_for_s3_endpoint(endpoint: &str, timeout_duration: Duration) -> Result<()> { + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(2)) + .build() + .context("build S3 readiness HTTP client")?; + let start = std::time::Instant::now(); + + loop { + if client.get(endpoint).send().await.is_ok() { + return Ok(()); + } + if start.elapsed() >= timeout_duration { + anyhow::bail!("timed out waiting for S3 endpoint {endpoint}"); + } + tokio::time::sleep(Duration::from_secs(1)).await; + } +} + +fn seeded_bytes(seed: u64, index: usize, size_bytes: usize) -> Vec { + let mut generator = SplitMix64::new(seed ^ (index as u64).wrapping_mul(0xD6E8_FEB8_6659_FD93)); + let mut body = vec![0; size_bytes]; + for chunk in body.chunks_mut(8) { + let bytes = generator.next_u64().to_le_bytes(); + chunk.copy_from_slice(&bytes[..chunk.len()]); + } + body +} + +fn shuffle_sizes(sizes: &mut [usize], seed: u64) { + let mut generator = SplitMix64::new(seed ^ 0xA076_1D64_78BD_642F); + for index in (1..sizes.len()).rev() { + let swap_with = (generator.next_u64() % (index as u64 + 1)) as usize; + sizes.swap(index, swap_with); + } +} + +struct SplitMix64 { + state: u64, +} + +impl SplitMix64 { + fn new(seed: u64) -> Self { + Self { state: seed } + } + + fn next_u64(&mut self) -> u64 { + self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15); + let mut value = self.state; + value = (value ^ (value >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + value = (value ^ (value >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + value ^ (value >> 31) + } +} + +fn classify_sdk_error(error: &SdkError) -> OperationOutcome { + match error { + SdkError::TimeoutError(_) => OperationOutcome::Timeout, + 
SdkError::DispatchFailure(_) | SdkError::ResponseError(_) => OperationOutcome::Unknown, + SdkError::ConstructionFailure(_) | SdkError::ServiceError(_) => OperationOutcome::Failed, + _ => OperationOutcome::Unknown, + } +} + +fn sdk_error_status(error: &SdkError) -> Option { + match error { + SdkError::ServiceError(context) => Some(context.raw().status().as_u16()), + SdkError::ResponseError(context) => Some(context.raw().status().as_u16()), + _ => None, + } +} + +#[cfg(test)] +mod tests { + use super::{ObjectSpec, WorkloadPlan, sha256_hex}; + + #[test] + fn seeded_objects_have_stable_keys_sizes_and_hashes() { + let object = ObjectSpec::prepare_seeded("run-1", 7, 4096, 42); + let same = ObjectSpec::prepare_seeded("run-1", 7, 4096, 42); + + assert_eq!(ObjectSpec::key_prefix("run-1"), "fault-test/run-1/"); + assert_eq!(object.spec.key, "fault-test/run-1/object-000007"); + assert_eq!(object.spec.size_bytes, 4096); + assert_eq!(object.spec.sha256, same.spec.sha256); + assert_eq!(object.spec.sha256, sha256_hex(&same.body)); + assert_ne!( + object.spec.sha256, + ObjectSpec::prepare_seeded("run-1", 7, 4096, 43).spec.sha256 + ); + } + + #[test] + fn workload_plan_is_weighted_shuffled_and_reproducible() { + let plan = WorkloadPlan::seeded(42, 40000, 80); + let same = WorkloadPlan::seeded(42, 40000, 80); + let different = WorkloadPlan::seeded(43, 40000, 80); + + assert_eq!(plan, same); + assert_ne!(plan.sizes, different.sizes); + assert_eq!( + plan.size_distribution + .iter() + .map(|class| (class.size_bytes, class.object_count)) + .collect::>(), + vec![ + (4 * 1024, 34000), + (16 * 1024, 4000), + (8 * 1024 * 1024, 1600), + (16 * 1024 * 1024, 400), + ] + ); + assert_eq!(plan.total_payload_bytes, 20_337_459_200); + assert_eq!(plan.concurrency, 80); + } +} diff --git a/e2e/src/framework/tenant_factory.rs b/e2e/src/framework/tenant_factory.rs index ca21fc2..af806d3 100644 --- a/e2e/src/framework/tenant_factory.rs +++ b/e2e/src/framework/tenant_factory.rs @@ -13,9 +13,11 @@ // 
limitations under the License. use k8s_openapi::api::core::v1::{ - EnvVar, LocalObjectReference, PersistentVolumeClaimSpec, VolumeResourceRequirements, + Affinity, EnvVar, LocalObjectReference, PersistentVolumeClaimSpec, PodAffinityTerm, + PodAntiAffinity, VolumeResourceRequirements, }; use k8s_openapi::apimachinery::pkg::api::resource::Quantity; +use k8s_openapi::apimachinery::pkg::apis::meta::v1::LabelSelector; use operator::types::v1alpha1::k8s::ImagePullPolicy; use operator::types::v1alpha1::k8s::PodManagementPolicy; use operator::types::v1alpha1::persistence::PersistenceConfig; @@ -32,8 +34,11 @@ pub struct TenantTemplate { pub credential_secret_name: String, pub servers: i32, pub volumes_per_server: i32, + pub storage_request: String, pub pod_management_policy: Option, pub unsafe_bypass_disk_check: bool, + pub node_selector: Option>, + pub affinity: Option, } impl TenantTemplate { @@ -52,8 +57,39 @@ impl TenantTemplate { credential_secret_name: credential_secret_name.into(), servers: 4, volumes_per_server: 2, + storage_request: "10Gi".to_string(), pod_management_policy: Some(PodManagementPolicy::Parallel), unsafe_bypass_disk_check: true, + node_selector: Some( + [("rustfs-storage".to_string(), "true".to_string())] + .into_iter() + .collect(), + ), + affinity: None, + } + } + + pub fn real_cluster( + namespace: impl Into, + name: impl Into, + image: impl Into, + storage_class: impl Into, + credential_secret_name: impl Into, + ) -> Self { + let name = name.into(); + Self { + namespace: namespace.into(), + name: name.clone(), + image: image.into(), + storage_class: storage_class.into(), + credential_secret_name: credential_secret_name.into(), + servers: 4, + volumes_per_server: 1, + storage_request: "100Gi".to_string(), + pod_management_policy: Some(PodManagementPolicy::Parallel), + unsafe_bypass_disk_check: false, + node_selector: None, + affinity: Some(fault_tenant_pod_anti_affinity(&name)), } } @@ -67,9 +103,12 @@ impl TenantTemplate { access_modes: 
Some(vec!["ReadWriteOnce".to_string()]), resources: Some(VolumeResourceRequirements { requests: Some( - [("storage".to_string(), Quantity("10Gi".to_string()))] - .into_iter() - .collect(), + [( + "storage".to_string(), + Quantity(self.storage_request.clone()), + )] + .into_iter() + .collect(), ), ..Default::default() }), @@ -79,11 +118,8 @@ impl TenantTemplate { ..PersistenceConfig::default() }, scheduling: SchedulingConfig { - node_selector: Some( - [("rustfs-storage".to_string(), "true".to_string())] - .into_iter() - .collect::>(), - ), + node_selector: self.node_selector.clone(), + affinity: self.affinity.clone(), ..SchedulingConfig::default() }, }; @@ -120,6 +156,27 @@ impl TenantTemplate { } } +fn fault_tenant_pod_anti_affinity(tenant_name: &str) -> Affinity { + Affinity { + pod_anti_affinity: Some(PodAntiAffinity { + required_during_scheduling_ignored_during_execution: Some(vec![PodAffinityTerm { + label_selector: Some(LabelSelector { + match_labels: Some( + [("rustfs.tenant".to_string(), tenant_name.to_string())] + .into_iter() + .collect(), + ), + ..LabelSelector::default() + }), + topology_key: "kubernetes.io/hostname".to_string(), + ..PodAffinityTerm::default() + }]), + ..PodAntiAffinity::default() + }), + ..Affinity::default() + } +} + #[cfg(test)] mod tests { use super::TenantTemplate; @@ -162,5 +219,62 @@ mod tests { .any(|env| env.name == "RUSTFS_UNSAFE_BYPASS_DISK_CHECK" && env.value.as_deref() == Some("true")) ); + assert_eq!( + tenant.spec.pools[0] + .scheduling + .node_selector + .as_ref() + .and_then(|selector| selector.get("rustfs-storage")) + .map(String::as_str), + Some("true") + ); + } + + #[test] + fn real_cluster_tenant_uses_fault_storage_spread_and_disk_checks() { + let tenant = TenantTemplate::real_cluster( + "rustfs-fault-test", + "fault-test-tenant", + "rustfs/rustfs:latest", + "fast-csi", + "fault-test-tenant-credentials", + ) + .build(); + + assert_eq!(tenant.spec.pools[0].persistence.volumes_per_server, 1); + assert_eq!( + 
tenant.spec.pools[0] + .scheduling + .affinity + .as_ref() + .and_then(|affinity| affinity.pod_anti_affinity.as_ref()) + .and_then(|anti_affinity| { + anti_affinity + .required_during_scheduling_ignored_during_execution + .as_ref() + }) + .and_then(|terms| terms.first()) + .map(|term| term.topology_key.as_str()), + Some("kubernetes.io/hostname") + ); + assert_eq!( + tenant.spec.pools[0] + .persistence + .volume_claim_template + .as_ref() + .and_then(|claim| claim.resources.as_ref()) + .and_then(|resources| resources.requests.as_ref()) + .and_then(|requests| requests.get("storage")) + .map(|quantity| quantity.0.as_str()), + Some("100Gi") + ); + assert!(tenant.spec.pools[0].scheduling.node_selector.is_none()); + assert!( + tenant + .spec + .env + .iter() + .all(|env| env.name != "RUSTFS_UNSAFE_BYPASS_DISK_CHECK") + ); } } diff --git a/e2e/tests/faults.rs b/e2e/tests/faults.rs index b531cfc..54cf4de 100644 --- a/e2e/tests/faults.rs +++ b/e2e/tests/faults.rs @@ -12,25 +12,1361 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use anyhow::Result; -use rustfs_operator_e2e::framework::{config::E2eConfig, live}; +use anyhow::{Context, Result, bail, ensure}; +use futures::{StreamExt, TryStreamExt, stream}; +use kube::Api; +use operator::types::v1alpha1::tenant::Tenant; +use rustfs_operator_e2e::framework::{ + artifacts::ArtifactCollector, + chaos_mesh::{self, ChaosGuard, IoChaosSpec, NetworkChaosSpec, PodChaosSpec}, + checker, + command::CommandSpec, + config::ClusterTestConfig, + fault_config::FaultTestConfig, + fault_scenarios::{ + self, DISK_FULL_SCENARIO, FaultBackend, FaultIsolation, FaultScenario, + IO_READ_MISTAKE_SCENARIO, + }, + history::OperationOutcome, + history::Recorder, + host_faults::{self, DmFlakeyGuard, DmFlakeySpec, DmStatusSnapshot}, + kube_client, + kubectl::Kubectl, + port_forward::{PortForwardGuard, PortForwardSpec}, + resources, + s3_workload::{ObjectSpec, S3WorkloadClient, WorkloadPlan, wait_for_s3_endpoint}, + wait, +}; +use serde::Serialize; +use std::collections::BTreeSet; +use std::thread::sleep; +use std::time::{Duration, Instant}; +use tokio::time::sleep as async_sleep; +use uuid::Uuid; -#[test] -fn faults_are_not_destructive_without_explicit_opt_in() { - let config = E2eConfig::defaults(); +const RUSTFS_DATA_VOLUME: &str = "/data/rustfs0"; +const FAULT_TENANT_POD_COUNT: usize = 4; +const RUSTFS_POD_STABLE_WINDOW: Duration = Duration::from_secs(60); - assert!(!config.destructive_enabled); - assert!(live::require_destructive_enabled(&config).is_err()); +#[tokio::test] +#[ignore = "destructive RustFS workload fault scenario; select with RUSTFS_FAULT_TEST_SCENARIO"] +async fn fault_selected_scenario() -> Result<()> { + let config = FaultTestConfig::from_env()?; + let scenario = FaultScenario::from_config(&config)?; + let spec = fault_scenarios::scenario_spec(&scenario.name)?; + + config.require_destructive_enabled()?; + config.validate_cluster(spec.backend == FaultBackend::DeviceMapper)?; + eprintln!( + "running destructive RustFS fault scenario {} against real 
Kubernetes context: {}", + scenario.name, config.cluster.context + ); + + let collector = ArtifactCollector::new(&config.cluster.artifacts_dir); + let result = run_fault_case(&config, &collector, &scenario).await; + + if let Err(error) = &result { + match collector.collect_kubernetes_snapshot(scenario.case_name, &config.cluster) { + Ok(report) => { + eprintln!( + "collected fault-test artifacts under {}", + report.dir.display() + ); + eprintln!("{}", report.diagnosis); + } + Err(artifact_error) => { + eprintln!("failed to collect fault-test artifacts after {error}: {artifact_error}"); + } + } + } + + result +} + +async fn run_fault_case( + config: &FaultTestConfig, + collector: &ArtifactCollector, + scenario: &FaultScenario, +) -> Result<()> { + let spec = fault_scenarios::scenario_spec(&scenario.name)?; + require_fault_backend(config, spec.backend)?; + cleanup_fault_backend(config, spec.backend)?; + + prepare_fault_fixture(&config.cluster, spec.isolation)?; + wait_for_ready_tenant(&config.cluster).await?; + wait_for_stable_rustfs_pods(&config.cluster, RUSTFS_POD_STABLE_WINDOW).await?; + + let run_id = format!("run-{}", Uuid::new_v4()); + let workload_seed = config.workload_seed.unwrap_or_else(generated_seed); + let workload_plan = WorkloadPlan::seeded( + workload_seed, + scenario.object_count, + config.workload_concurrency, + ); + let bucket = bucket_name(&run_id); + let history_path = collector.case_dir(scenario.case_name).join("history.jsonl"); + let history = Recorder::create(history_path, &scenario.name, &run_id)?; + collector.write_text( + scenario.case_name, + "workload-plan.json", + &serde_json::to_string_pretty(&workload_plan)?, + )?; + eprintln!( + "fault workload seed={} objects={} concurrency={} payload_bytes={}", + workload_plan.seed, + workload_plan.object_count, + workload_plan.concurrency, + workload_plan.total_payload_bytes + ); + + let cluster = &config.cluster; + let (endpoint, mut port_forward) = s3_access(config)?; + ensure_s3_access(&mut 
port_forward, cluster, &endpoint).await?; + + let (access_key, secret_key) = resources::test_credentials(); + let s3 = S3WorkloadClient::new( + &endpoint, + &bucket, + access_key, + secret_key, + config.request_timeout, + ) + .await?; + let bucket_outcome = s3.create_bucket(&history).await?; + ensure!( + bucket_outcome == OperationOutcome::Ok, + "fault workload bucket creation did not succeed: {bucket_outcome:?}" + ); + + let prefilled = prefill_objects( + &s3, + &history, + &run_id, + &workload_plan, + scenario.prefill_count(), + ) + .await?; + let pods_before = rustfs_pod_identities(cluster)?; + let mut fault = AppliedFault::apply(config, collector, scenario, spec.backend, &run_id)?; + + if let Err(error) = fault.wait_active(cluster.timeout) { + collect_fault_artifacts(collector, scenario.case_name, &fault, "wait-active-failed")?; + return Err(error); + } + let active_snapshot = fault.snapshot("active")?; + + if let Err(error) = ensure_s3_access(&mut port_forward, cluster, &endpoint).await { + collect_fault_artifacts(collector, scenario.case_name, &fault, "port-forward-failed")?; + return Err(error); + } + + if spec.backend == FaultBackend::MinioWarpWithChaos { + let warp_bucket = warp_bucket_name(&run_id); + if let Err(error) = host_faults::run_warp_mixed( + config.warp_duration, + collector, + scenario.case_name, + &endpoint, + &warp_bucket, + access_key, + secret_key, + ) { + collect_fault_artifacts(collector, scenario.case_name, &fault, "warp-failed")?; + return Err(error); + } + + if let Err(error) = ensure_s3_access(&mut port_forward, cluster, &endpoint).await { + collect_fault_artifacts( + collector, + scenario.case_name, + &fault, + "post-warp-port-forward-failed", + )?; + return Err(error); + } + } + + let mut workload = match run_mixed_workload( + &s3, + &history, + &run_id, + &workload_plan, + &prefilled, + scenario.prefill_count(), + scenario.mixed_workload_count(), + ) + .await + { + Ok(workload) => workload, + Err(error) => { + 
collect_fault_artifacts(collector, scenario.case_name, &fault, "workload-failed")?; + return Err(error); + } + }; + collector.write_text( + scenario.case_name, + "workload-summary.json", + &serde_json::to_string_pretty(&workload.summary)?, + )?; + if let Err(error) = workload + .summary + .require_fault_evidence(config.require_client_disruption) + { + collect_fault_artifacts( + collector, + scenario.case_name, + &fault, + "workload-no-fault-evidence", + )?; + return Err(error); + } + if let Err(error) = fault.ensure_active("after fault workload") { + collect_fault_artifacts( + collector, + scenario.case_name, + &fault, + "workload-outlived-fault", + )?; + return Err(error); + } + let workload_snapshot = fault.snapshot("after-workload")?; + + if let Err(error) = fault.delete(cluster.timeout) { + collect_fault_artifacts(collector, scenario.case_name, &fault, "delete-failed")?; + return Err(error); + } + + wait_for_ready_tenant(cluster).await?; + wait_for_stable_rustfs_pods(cluster, RUSTFS_POD_STABLE_WINDOW).await?; + let pods_after = rustfs_pod_identities(cluster)?; + ensure_s3_access(&mut port_forward, cluster, &endpoint).await?; + workload.summary.recommitted_after_recovery = recommit_unconfirmed_objects( + &s3, + &history, + &workload.unconfirmed_puts, + workload_plan.concurrency, + ) + .await?; + collector.write_text( + scenario.case_name, + "workload-summary.json", + &serde_json::to_string_pretty(&workload.summary)?, + )?; + let report = checker::check_s3_history(&s3, &history, true, workload_plan.concurrency).await?; + collector.write_text( + scenario.case_name, + "checker-report.json", + &serde_json::to_string_pretty(&report)?, + )?; + let evidence = FaultEvidence { + scenario: scenario.name.clone(), + backend: format!("{:?}", spec.backend), + target: spec.target.to_string(), + injected: true, + active_during_workload: true, + recovered: report.tenant_recovered, + client_disruptions: workload.summary.disrupted(), + workload_plan, + pods_before, + pods_after, + 
active_snapshot, + workload_snapshot, + dm_recovery_snapshot: fault.recovery_dm_snapshot(), + }; + collector.write_text( + scenario.case_name, + "fault-evidence.json", + &serde_json::to_string_pretty(&evidence)?, + )?; + ensure!( + report.committed_puts == scenario.object_count, + "fault scenario {} expected {} committed objects after recovery reconciliation, got {}", + scenario.name, + scenario.object_count, + report.committed_puts + ); + report.require_success()?; + + Ok(()) +} + +fn require_fault_backend(config: &FaultTestConfig, backend: FaultBackend) -> Result<()> { + let cluster = &config.cluster; + match backend { + FaultBackend::ChaosMeshIoChaos => chaos_mesh::require_iochaos_crd(cluster), + FaultBackend::MinioWarpWithChaos => { + chaos_mesh::require_iochaos_crd(cluster)?; + require_tool("warp", ["--help"]) + } + FaultBackend::ChaosMeshPodChaos => chaos_mesh::require_podchaos_crd(cluster), + FaultBackend::ChaosMeshNetworkChaos => chaos_mesh::require_networkchaos_crd(cluster), + FaultBackend::DeviceMapper => require_dm_flakey_preflight(config), + } +} + +fn require_tool(program: &'static str, args: I) -> Result<()> +where + I: IntoIterator, + S: Into, +{ + CommandSpec::new(program) + .args(args) + .run_checked() + .with_context(|| format!("{program} is required for the selected fault scenario"))?; + Ok(()) +} + +fn require_dm_flakey_preflight(config: &FaultTestConfig) -> Result<()> { + config + .dm_name + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_NAME is required for dm-flakey")?; + config + .dm_node + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_NODE is required for dm-flakey")?; + config + .dm_mount_path + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_MOUNT_PATH is required for dm-flakey")?; + config + .dm_fault_table + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_FAULT_TABLE is required for dm-flakey")?; + Ok(()) +} + +fn cleanup_fault_backend(config: &FaultTestConfig, backend: FaultBackend) -> Result<()> { + match backend { + 
FaultBackend::ChaosMeshIoChaos | FaultBackend::MinioWarpWithChaos => { + chaos_mesh::cleanup_managed_iochaos(&config.cluster, &config.chaos_namespace) + } + FaultBackend::ChaosMeshPodChaos => { + chaos_mesh::cleanup_managed_podchaos(&config.cluster, &config.chaos_namespace) + } + FaultBackend::ChaosMeshNetworkChaos => { + chaos_mesh::cleanup_managed_networkchaos(&config.cluster, &config.chaos_namespace) + } + FaultBackend::DeviceMapper => Ok(()), + } +} + +fn prepare_fault_fixture(config: &ClusterTestConfig, isolation: FaultIsolation) -> Result<()> { + match isolation { + FaultIsolation::ReusableTenant => resources::apply_fault_tenant_resources(config)?, + FaultIsolation::FreshTenant | FaultIsolation::DedicatedLinuxBlockDevice => { + resources::reset_fault_tenant_resources(config)?; + resources::apply_fault_tenant_resources(config)?; + } + } + Ok(()) +} + +enum AppliedFault { + Chaos { + guard: Box, + active_required: bool, + }, + PodKill { + guard: Box, + before_pods: Vec, + config: Box, + }, + DmFlakey(Box), } -#[test] -#[ignore = "reserved for destructive fault scenarios; run through `make e2e-live-faults`"] -fn fault_live_suite_requires_explicit_destructive_opt_in() -> Result<()> { - let config = E2eConfig::from_env(); +impl AppliedFault { + fn apply( + config: &FaultTestConfig, + collector: &ArtifactCollector, + scenario: &FaultScenario, + backend: FaultBackend, + run_id: &str, + ) -> Result { + let cluster = &config.cluster; + match backend { + FaultBackend::ChaosMeshIoChaos if scenario.name == DISK_FULL_SCENARIO => { + let chaos = IoChaosSpec::enospc_on_rustfs_volume( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + RUSTFS_DATA_VOLUME, + scenario.percent, + scenario.duration, + )?; + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + Ok(Self::Chaos { + guard: Box::new(chaos_mesh::apply_iochaos(cluster, &chaos)?), + active_required: true, + }) + } + FaultBackend::ChaosMeshIoChaos if 
scenario.name == IO_READ_MISTAKE_SCENARIO => { + let chaos = IoChaosSpec::read_mistake_on_rustfs_volume( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + RUSTFS_DATA_VOLUME, + scenario.percent, + scenario.duration, + )?; + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + Ok(Self::Chaos { + guard: Box::new(chaos_mesh::apply_iochaos(cluster, &chaos)?), + active_required: true, + }) + } + FaultBackend::ChaosMeshIoChaos => { + let chaos = IoChaosSpec::eio_on_rustfs_volume( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + RUSTFS_DATA_VOLUME, + scenario.percent, + scenario.duration, + )?; + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + Ok(Self::Chaos { + guard: Box::new(chaos_mesh::apply_iochaos(cluster, &chaos)?), + active_required: true, + }) + } + FaultBackend::ChaosMeshPodChaos => { + let before_pods = rustfs_pod_identities(cluster)?; + let chaos = PodChaosSpec::kill_one_rustfs_pod( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + ); + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + Ok(Self::PodKill { + guard: Box::new(chaos_mesh::apply_podchaos(cluster, &chaos)?), + before_pods, + config: Box::new(cluster.clone()), + }) + } + FaultBackend::ChaosMeshNetworkChaos => { + let chaos = NetworkChaosSpec::partition_one_rustfs_pod( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + scenario.duration, + )?; + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + Ok(Self::Chaos { + guard: Box::new(chaos_mesh::apply_networkchaos(cluster, &chaos)?), + active_required: true, + }) + } + FaultBackend::DeviceMapper => { + let name = config + .dm_name + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_NAME is required for dm-flakey")?; + let fault_table = config + .dm_fault_table + .as_deref() + 
.context("RUSTFS_FAULT_TEST_DM_FAULT_TABLE is required for dm-flakey")?; + let node = config + .dm_node + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_NODE is required for dm-flakey")?; + let mount_path = config + .dm_mount_path + .as_deref() + .context("RUSTFS_FAULT_TEST_DM_MOUNT_PATH is required for dm-flakey")?; + Ok(Self::DmFlakey(Box::new(host_faults::apply_dm_flakey( + cluster, + &DmFlakeySpec { + node, + mount_path, + helper_image: &config.dm_helper_image, + name, + fault_table, + recovery_table: config.dm_recovery_table.as_deref(), + run_id, + }, + collector, + scenario.case_name, + )?))) + } + FaultBackend::MinioWarpWithChaos => { + let chaos = IoChaosSpec::eio_on_rustfs_volume( + cluster, + &config.chaos_namespace, + run_id, + &scenario.name, + RUSTFS_DATA_VOLUME, + scenario.percent, + scenario.duration, + )?; + collector.write_text( + scenario.case_name, + "chaos-manifest.yaml", + &chaos.manifest(), + )?; + let guard = chaos_mesh::apply_iochaos(cluster, &chaos)?; + Ok(Self::Chaos { + guard: Box::new(guard), + active_required: true, + }) + } + } + } + + fn wait_active(&self, timeout: Duration) -> Result<()> { + match self { + Self::Chaos { + guard, + active_required, + } if *active_required => guard.wait_active(timeout), + Self::PodKill { + before_pods, + config, + .. + } => wait_for_rustfs_pod_deletion(config, before_pods, timeout), + Self::Chaos { .. } | Self::DmFlakey(_) => Ok(()), + } + } + + fn ensure_active(&self, stage: &str) -> Result<()> { + match self { + Self::Chaos { + guard, + active_required, + } if *active_required => guard.ensure_active(stage), + Self::PodKill { .. } | Self::Chaos { .. } => Ok(()), + Self::DmFlakey(guard) => { + guard.ensure_active("after fault workload")?; + Ok(()) + } + } + } + + fn delete(&mut self, timeout: Duration) -> Result<()> { + match self { + Self::Chaos { guard, .. 
} => guard.delete(), + Self::PodKill { + guard, + before_pods, + config, + } => { + guard.delete()?; + wait_for_rustfs_pod_replacement(config, before_pods, timeout) + } + Self::DmFlakey(guard) => guard.restore(), + } + } + + fn chaos_guard(&self) -> Option<&ChaosGuard> { + match self { + Self::Chaos { guard, .. } | Self::PodKill { guard, .. } => Some(guard.as_ref()), + Self::DmFlakey(_) => None, + } + } - live::require_live_enabled(&config)?; - live::ensure_dedicated_context(&config)?; - live::require_destructive_enabled(&config)?; + fn snapshot(&self, stage: &str) -> Result { + match self { + Self::Chaos { guard, .. } | Self::PodKill { guard, .. } => Ok(FaultStatusSnapshot { + stage: stage.to_string(), + resource_kind: Some(guard.kind().to_string()), + resource_name: Some(guard.name().to_string()), + chaos_status: Some(serde_json::from_str(&guard.json()?)?), + dm_status: None, + }), + Self::DmFlakey(guard) => Ok(FaultStatusSnapshot { + stage: stage.to_string(), + resource_kind: Some("device-mapper".to_string()), + resource_name: None, + chaos_status: None, + dm_status: Some(guard.snapshot(stage)?), + }), + } + } + + fn recovery_dm_snapshot(&self) -> Option { + match self { + Self::DmFlakey(guard) => guard.recovery_snapshot().cloned(), + Self::Chaos { .. } | Self::PodKill { .. 
} => None, + } + } +} + +#[derive(Debug, Clone, Serialize)] +struct FaultStatusSnapshot { + stage: String, + resource_kind: Option, + resource_name: Option, + chaos_status: Option, + dm_status: Option, +} + +#[derive(Debug, Clone, Serialize)] +struct FaultEvidence { + scenario: String, + backend: String, + target: String, + injected: bool, + active_during_workload: bool, + recovered: bool, + client_disruptions: usize, + workload_plan: WorkloadPlan, + pods_before: Vec, + pods_after: Vec, + active_snapshot: FaultStatusSnapshot, + workload_snapshot: FaultStatusSnapshot, + dm_recovery_snapshot: Option, +} + +fn collect_fault_artifacts( + collector: &ArtifactCollector, + case_name: &str, + fault: &AppliedFault, + suffix: &str, +) -> Result<()> { + let status = fault + .snapshot(suffix) + .and_then(|snapshot| serde_json::to_string_pretty(&snapshot).map_err(Into::into)) + .unwrap_or_else(|error| format!("failed to collect fault status: {error}")); + collector.write_text(case_name, &format!("fault-status-{suffix}.json"), &status)?; + + if let Some(guard) = fault.chaos_guard() { + let describe = guard + .describe() + .unwrap_or_else(|error| format!("failed to describe chaos before cleanup: {error}")); + collector.write_text( + case_name, + &format!("chaos-describe-{suffix}.txt"), + &describe, + )?; + + let yaml = guard + .yaml() + .unwrap_or_else(|error| format!("failed to get chaos yaml before cleanup: {error}")); + collector.write_text(case_name, &format!("chaos-{suffix}.yaml"), &yaml)?; + } Ok(()) } + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +struct PodIdentity { + name: String, + uid: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct PodRuntimeState { + name: String, + uid: String, + phase: String, + containers_ready: bool, + restart_count: u64, + terminating: bool, +} + +fn rustfs_pod_identities(config: &ClusterTestConfig) -> Result> { + let selector = format!("rustfs.tenant={}", config.tenant_name); + let output = 
rustfs_operator_e2e::framework::kubectl::Kubectl::new(config) + .namespaced(&config.test_namespace) + .command(["get", "pod", "-l", &selector, "-o", "json"]) + .run_checked()?; + let value = serde_json::from_str::(&output.stdout) + .context("parse RustFS pod list json")?; + let items = value + .pointer("/items") + .and_then(serde_json::Value::as_array) + .context("RustFS pod list did not contain an items array")?; + let pods = items + .iter() + .filter_map(|item| { + let metadata = item.get("metadata")?; + Some(PodIdentity { + name: metadata.get("name")?.as_str()?.to_string(), + uid: metadata.get("uid")?.as_str()?.to_string(), + }) + }) + .collect::>(); + ensure!( + !pods.is_empty(), + "no RustFS pods found for selector {selector} in namespace {}", + config.test_namespace + ); + Ok(pods) +} + +fn rustfs_pod_runtime_states(config: &ClusterTestConfig) -> Result> { + let selector = format!("rustfs.tenant={}", config.tenant_name); + let output = Kubectl::new(config) + .namespaced(&config.test_namespace) + .command(["get", "pod", "-l", &selector, "-o", "json"]) + .run_checked()?; + let value = serde_json::from_str::(&output.stdout) + .context("parse RustFS pod list json")?; + let items = value + .pointer("/items") + .and_then(serde_json::Value::as_array) + .context("RustFS pod list did not contain an items array")?; + let mut pods = items + .iter() + .map(|item| { + let metadata = item + .get("metadata") + .context("RustFS pod did not contain metadata")?; + let name = metadata + .get("name") + .and_then(serde_json::Value::as_str) + .context("RustFS pod metadata did not contain a name")?; + let uid = metadata + .get("uid") + .and_then(serde_json::Value::as_str) + .context("RustFS pod metadata did not contain a uid")?; + let phase = item + .pointer("/status/phase") + .and_then(serde_json::Value::as_str) + .unwrap_or("Unknown"); + let container_statuses = item + .pointer("/status/containerStatuses") + .and_then(serde_json::Value::as_array); + let containers_ready = 
container_statuses.is_some_and(|statuses| { + !statuses.is_empty() + && statuses.iter().all(|status| { + status + .get("ready") + .and_then(serde_json::Value::as_bool) + .unwrap_or(false) + }) + }); + let restart_count = container_statuses + .into_iter() + .flatten() + .filter_map(|status| status.get("restartCount")) + .filter_map(serde_json::Value::as_u64) + .sum(); + + Ok(PodRuntimeState { + name: name.to_string(), + uid: uid.to_string(), + phase: phase.to_string(), + containers_ready, + restart_count, + terminating: metadata.get("deletionTimestamp").is_some(), + }) + }) + .collect::>>()?; + pods.sort_by(|left, right| left.name.cmp(&right.name)); + Ok(pods) +} + +fn stable_pod_fingerprint(pods: &[PodRuntimeState]) -> Option> { + if pods.len() != FAULT_TENANT_POD_COUNT + || pods + .iter() + .any(|pod| pod.phase != "Running" || !pod.containers_ready || pod.terminating) + { + return None; + } + + Some( + pods.iter() + .map(|pod| (pod.uid.clone(), pod.restart_count)) + .collect(), + ) +} + +async fn wait_for_stable_rustfs_pods( + config: &ClusterTestConfig, + stable_window: Duration, +) -> Result<()> { + let deadline = Instant::now() + config.timeout; + let mut stable_since = None; + let mut stable_fingerprint = None; + let mut last_snapshot = Vec::new(); + let mut last_error = "not checked yet".to_string(); + + eprintln!( + "waiting for {FAULT_TENANT_POD_COUNT} RustFS pods to remain ready without restarts for {stable_window:?}" + ); + loop { + if Instant::now() >= deadline { + bail!( + "timed out waiting for stable RustFS pods after {:?}\nlast: {last_snapshot:?}\nlast error: {last_error}", + config.timeout + ); + } + + match rustfs_pod_runtime_states(config) { + Ok(current) => { + if let Some(fingerprint) = stable_pod_fingerprint(¤t) { + if stable_fingerprint.as_ref() != Some(&fingerprint) { + stable_since = Some(Instant::now()); + stable_fingerprint = Some(fingerprint); + } + if stable_since.is_some_and(|started| started.elapsed() >= stable_window) { + 
eprintln!("RustFS pods remained stable for {stable_window:?}"); + return Ok(()); + } + } else { + stable_since = None; + stable_fingerprint = None; + } + last_snapshot = current; + last_error = "none".to_string(); + } + Err(error) => { + stable_since = None; + stable_fingerprint = None; + last_error = error.to_string(); + } + } + + async_sleep(Duration::from_secs(1)).await; + } +} + +fn wait_for_rustfs_pod_replacement( + config: &ClusterTestConfig, + before: &[PodIdentity], + timeout: Duration, +) -> Result<()> { + let deadline = Instant::now() + timeout; + let mut last_snapshot = Vec::new(); + let mut last_error = "not checked yet".to_string(); + + loop { + if Instant::now() >= deadline { + bail!( + "timed out waiting for PodChaos to replace a RustFS pod after {timeout:?}\nbefore: {before:?}\nlast: {last_snapshot:?}\nlast error: {last_error}", + ); + } + + match rustfs_pod_identities(config) { + Ok(current) => { + if pod_replacement_observed(before, ¤t) { + return Ok(()); + } + last_snapshot = current; + last_error = "none".to_string(); + } + Err(error) => { + last_error = error.to_string(); + } + } + + sleep(Duration::from_secs(1)); + } +} + +fn wait_for_rustfs_pod_deletion( + config: &ClusterTestConfig, + before: &[PodIdentity], + timeout: Duration, +) -> Result<()> { + let deadline = Instant::now() + timeout; + let mut last_snapshot = Vec::new(); + let mut last_error = "not checked yet".to_string(); + + loop { + if Instant::now() >= deadline { + bail!( + "timed out waiting for PodChaos to delete a RustFS pod after {timeout:?}\nbefore: {before:?}\nlast: {last_snapshot:?}\nlast error: {last_error}", + ); + } + + match rustfs_pod_identities(config) { + Ok(current) => { + if pod_deletion_observed(before, ¤t) { + return Ok(()); + } + last_snapshot = current; + last_error = "none".to_string(); + } + Err(error) => { + last_error = error.to_string(); + } + } + + sleep(Duration::from_millis(250)); + } +} + +fn pod_deletion_observed(before: &[PodIdentity], current: 
&[PodIdentity]) -> bool { + let current_uids = current + .iter() + .map(|pod| pod.uid.as_str()) + .collect::>(); + !before.is_empty() + && before + .iter() + .any(|pod| !current_uids.contains(pod.uid.as_str())) +} + +fn pod_replacement_observed(before: &[PodIdentity], current: &[PodIdentity]) -> bool { + if before.is_empty() || current.is_empty() { + return false; + } + + let before_uids = before + .iter() + .map(|pod| pod.uid.as_str()) + .collect::>(); + let current_uids = current + .iter() + .map(|pod| pod.uid.as_str()) + .collect::>(); + let old_uid_removed = before_uids.iter().any(|uid| !current_uids.contains(uid)); + let new_uid_added = current_uids.iter().any(|uid| !before_uids.contains(uid)); + + old_uid_removed && new_uid_added +} + +async fn wait_for_ready_tenant(config: &ClusterTestConfig) -> Result { + let client = kube_client::default_client().await?; + let tenants: Api = kube_client::tenant_api(client, &config.test_namespace); + wait::wait_for_tenant_ready(tenants, &config.tenant_name, config.timeout).await +} + +fn s3_access(config: &FaultTestConfig) -> Result<(String, Option)> { + let cluster = &config.cluster; + if config.use_cluster_ip { + let service = format!("{}-io", cluster.tenant_name); + let output = Kubectl::new(cluster) + .namespaced(&cluster.test_namespace) + .command([ + "get".to_string(), + "service".to_string(), + service.clone(), + "-o".to_string(), + "jsonpath={.spec.clusterIP}".to_string(), + ]) + .run_checked() + .with_context(|| format!("read ClusterIP for fault-test service {service:?}"))?; + let cluster_ip = output.stdout.trim(); + ensure!( + !cluster_ip.is_empty() && cluster_ip != "None", + "fault-test service {service:?} does not have a ClusterIP" + ); + let host = if cluster_ip.contains(':') { + format!("[{cluster_ip}]") + } else { + cluster_ip.to_string() + }; + return Ok((format!("http://{host}:9000"), None)); + } + + let spec = PortForwardSpec::tenant_io(&cluster.test_namespace, &cluster.tenant_name); + let endpoint = 
spec.local_base_url(); + Ok((endpoint, Some(PortForwardSpec::start_tenant_io(cluster)?))) +} + +async fn ensure_s3_access( + port_forward: &mut Option, + config: &ClusterTestConfig, + endpoint: &str, +) -> Result<()> { + if let Some(guard) = port_forward { + if guard.ensure_running().is_err() { + *guard = PortForwardSpec::start_tenant_io(config)?; + } + return wait_for_tenant_s3(guard, endpoint, config.timeout).await; + } + + wait_for_s3_endpoint(endpoint, config.timeout).await +} + +async fn wait_for_tenant_s3( + port_forward: &mut PortForwardGuard, + endpoint: &str, + timeout: Duration, +) -> Result<()> { + port_forward.ensure_running()?; + wait_for_s3_endpoint(endpoint, timeout) + .await + .with_context(|| { + format!( + "S3 port-forward was not ready; command: {}; log {}:\n{}", + port_forward.command_display(), + port_forward.log_path().display(), + port_forward.log_contents() + ) + }) +} + +async fn prefill_objects( + s3: &S3WorkloadClient, + history: &Recorder, + run_id: &str, + plan: &WorkloadPlan, + count: usize, +) -> Result> { + let tasks = (0..count).map(|index| { + let s3 = s3.clone(); + let history = history.clone(); + let run_id = run_id.to_string(); + let size_bytes = plan.size_at(index); + let seed = plan.seed; + async move { + let object = ObjectSpec::prepare_seeded(&run_id, index, size_bytes, seed); + let spec = object.spec.clone(); + let put_outcome = s3.put_object(&object, &history).await?; + ensure!( + put_outcome == OperationOutcome::Ok, + "prefill PUT failed before fault injection for key {}: {put_outcome:?}", + spec.key + ); + let head_outcome = s3.head_object(&spec.key, &history).await?; + ensure!( + head_outcome == OperationOutcome::Ok, + "prefill HEAD failed before fault injection for key {}: {head_outcome:?}", + spec.key + ); + Ok::<_, anyhow::Error>((index, spec)) + } + }); + let mut objects = stream::iter(tasks) + .buffer_unordered(plan.concurrency) + .try_collect::>() + .await?; + objects.sort_by_key(|(index, _)| *index); + + 
Ok(objects.into_iter().map(|(_, object)| object).collect()) +} + +async fn run_mixed_workload( + s3: &S3WorkloadClient, + history: &Recorder, + run_id: &str, + plan: &WorkloadPlan, + prefilled: &[ObjectSpec], + start_index: usize, + count: usize, +) -> Result { + let tasks = (0..count).map(|offset| { + let s3 = s3.clone(); + let history = history.clone(); + let run_id = run_id.to_string(); + let index = start_index + offset; + let size_bytes = plan.size_at(index); + let seed = plan.seed; + let existing = prefilled[offset % prefilled.len()].clone(); + async move { + let object = ObjectSpec::prepare_seeded(&run_id, index, size_bytes, seed); + let spec = object.spec.clone(); + let put_outcome = s3.put_object(&object, &history).await?; + let get_outcome = s3.get_object_result(&existing.key, &history).await?.outcome; + Ok::<_, anyhow::Error>(MixedTaskResult { + index, + object: spec, + put_outcome, + get_outcome, + }) + } + }); + let results = stream::iter(tasks) + .buffer_unordered(plan.concurrency) + .collect::>() + .await; + let mut completed = Vec::with_capacity(count); + for result in results { + completed.push(result?); + } + completed.sort_by_key(|result| result.index); + + let mut summary = WorkloadSummary::new(plan); + let mut unconfirmed_puts = Vec::new(); + for result in completed { + summary.puts.record(result.put_outcome); + summary.gets.record(result.get_outcome); + if result.put_outcome != OperationOutcome::Ok { + unconfirmed_puts.push(result.object); + } + } + + summary.require_exercised()?; + Ok(MixedWorkloadResult { + summary, + unconfirmed_puts, + }) +} + +async fn recommit_unconfirmed_objects( + s3: &S3WorkloadClient, + history: &Recorder, + objects: &[ObjectSpec], + concurrency: usize, +) -> Result { + let tasks = objects.iter().cloned().map(|object| { + let s3 = s3.clone(); + let history = history.clone(); + async move { + let prepared = object.prepare(); + let outcome = s3.put_object(&prepared, &history).await?; + Ok::<_, 
anyhow::Error>((object.key, outcome)) + } + }); + let results = stream::iter(tasks) + .buffer_unordered(concurrency) + .collect::>() + .await; + for result in results { + let (key, outcome) = result?; + ensure!( + outcome == OperationOutcome::Ok, + "PUT for previously unconfirmed object {} did not commit after recovery: {outcome:?}", + key + ); + } + Ok(objects.len()) +} + +#[derive(Debug)] +struct MixedTaskResult { + index: usize, + object: ObjectSpec, + put_outcome: OperationOutcome, + get_outcome: OperationOutcome, +} + +#[derive(Debug)] +struct MixedWorkloadResult { + summary: WorkloadSummary, + unconfirmed_puts: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize)] +struct WorkloadSummary { + seed: u64, + object_count: usize, + concurrency: usize, + total_payload_bytes: u64, + puts: OutcomeCounts, + gets: OutcomeCounts, + recommitted_after_recovery: usize, +} + +impl WorkloadSummary { + fn new(plan: &WorkloadPlan) -> Self { + Self { + seed: plan.seed, + object_count: plan.object_count, + concurrency: plan.concurrency, + total_payload_bytes: plan.total_payload_bytes, + puts: OutcomeCounts::default(), + gets: OutcomeCounts::default(), + recommitted_after_recovery: 0, + } + } + + fn require_exercised(&self) -> Result<()> { + ensure!( + self.puts.total() > 0 && self.gets.total() > 0, + "fault workload did not exercise both PUT and GET paths: {self:?}" + ); + Ok(()) + } + + fn require_fault_evidence(&self, require_client_disruption: bool) -> Result<()> { + if require_client_disruption { + ensure!( + self.disrupted() > 0, + "fault was applied but the S3 workload observed no client-visible disrupted operation; increase RUSTFS_FAULT_TEST_WORKLOAD_OBJECTS or RUSTFS_FAULT_TEST_PERCENT, or set RUSTFS_FAULT_TEST_REQUIRE_CLIENT_DISRUPTION=0 if this is expected" + ); + } else if self.disrupted() == 0 { + eprintln!( + "fault was applied, but the S3 workload observed no client-visible disrupted operation" + ); + } + Ok(()) + } + + fn disrupted(&self) -> usize { + 
self.puts.disrupted() + self.gets.disrupted() + } +} + +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize)] +struct OutcomeCounts { + ok: usize, + failed: usize, + timeout: usize, + unknown: usize, +} + +impl OutcomeCounts { + fn record(&mut self, outcome: OperationOutcome) { + match outcome { + OperationOutcome::Ok => self.ok += 1, + OperationOutcome::Failed => self.failed += 1, + OperationOutcome::Timeout => self.timeout += 1, + OperationOutcome::Unknown => self.unknown += 1, + } + } + + fn total(&self) -> usize { + self.ok + self.failed + self.timeout + self.unknown + } + + fn disrupted(&self) -> usize { + self.failed + self.timeout + self.unknown + } +} + +fn bucket_name(run_id: &str) -> String { + let suffix = run_id + .chars() + .filter(|ch| ch.is_ascii_alphanumeric()) + .take(16) + .collect::() + .to_ascii_lowercase(); + format!("rustfs-fault-{suffix}") +} + +fn generated_seed() -> u64 { + let run = Uuid::new_v4(); + let mut bytes = [0; 8]; + bytes.copy_from_slice(&run.as_bytes()[..8]); + u64::from_le_bytes(bytes) +} + +fn warp_bucket_name(run_id: &str) -> String { + format!("{}-warp", bucket_name(run_id)) +} + +#[cfg(test)] +mod tests { + use super::{ + OutcomeCounts, PodIdentity, PodRuntimeState, WorkloadSummary, bucket_name, + pod_deletion_observed, pod_replacement_observed, stable_pod_fingerprint, warp_bucket_name, + }; + use rustfs_operator_e2e::framework::history::OperationOutcome; + use rustfs_operator_e2e::framework::s3_workload::WorkloadPlan; + + #[test] + fn fault_bucket_name_is_s3_compatible_and_run_scoped() { + assert_eq!( + bucket_name("run-12345678-abcd-efgh"), + "rustfs-fault-run12345678abcde" + ); + assert_eq!( + warp_bucket_name("run-12345678-abcd-efgh"), + "rustfs-fault-run12345678abcde-warp" + ); + } + + #[test] + fn workload_summary_counts_disrupted_operations() { + let mut summary = WorkloadSummary::new(&WorkloadPlan::seeded(42, 40000, 80)); + summary.puts.record(OperationOutcome::Ok); + 
summary.gets.record(OperationOutcome::Timeout); + + assert_eq!(summary.puts.total(), 1); + assert_eq!(summary.gets.total(), 1); + assert_eq!(summary.disrupted(), 1); + assert!(summary.require_exercised().is_ok()); + assert!(summary.require_fault_evidence(true).is_ok()); + } + + #[test] + fn workload_summary_can_require_fault_evidence() { + let summary = WorkloadSummary { + seed: 42, + object_count: 40000, + concurrency: 80, + total_payload_bytes: 20_337_459_200, + puts: OutcomeCounts { + ok: 1, + ..OutcomeCounts::default() + }, + gets: OutcomeCounts { + ok: 1, + ..OutcomeCounts::default() + }, + recommitted_after_recovery: 0, + }; + + assert!(summary.require_fault_evidence(false).is_ok()); + assert!(summary.require_fault_evidence(true).is_err()); + } + + #[test] + fn pod_replacement_requires_old_uid_removed_and_new_uid_added() { + let before = vec![ + PodIdentity { + name: "rustfs-0".to_string(), + uid: "uid-a".to_string(), + }, + PodIdentity { + name: "rustfs-1".to_string(), + uid: "uid-b".to_string(), + }, + ]; + + assert!(!pod_replacement_observed(&before, &before)); + assert!(!pod_replacement_observed(&before, &before[..1])); + assert!(!pod_deletion_observed(&before, &before)); + assert!(pod_deletion_observed(&before, &before[..1])); + assert!(pod_replacement_observed( + &before, + &[ + PodIdentity { + name: "rustfs-0".to_string(), + uid: "uid-c".to_string(), + }, + before[1].clone(), + ], + )); + } + + #[test] + fn stable_pod_fingerprint_requires_four_ready_unchanged_pods() { + let pods = (0..4) + .map(|index| PodRuntimeState { + name: format!("rustfs-{index}"), + uid: format!("uid-{index}"), + phase: "Running".to_string(), + containers_ready: true, + restart_count: index, + terminating: false, + }) + .collect::>(); + + assert_eq!( + stable_pod_fingerprint(&pods), + Some(vec![ + ("uid-0".to_string(), 0), + ("uid-1".to_string(), 1), + ("uid-2".to_string(), 2), + ("uid-3".to_string(), 3), + ]) + ); + assert!(stable_pod_fingerprint(&pods[..3]).is_none()); + + let 
mut unready = pods; + unready[0].containers_ready = false; + assert!(stable_pod_fingerprint(&unready).is_none()); + } +}