diff --git a/fluss-common/src/main/java/org/apache/fluss/fs/FileStatus.java b/fluss-common/src/main/java/org/apache/fluss/fs/FileStatus.java
index ad5708e3e9..74b51571ee 100644
--- a/fluss-common/src/main/java/org/apache/fluss/fs/FileStatus.java
+++ b/fluss-common/src/main/java/org/apache/fluss/fs/FileStatus.java
@@ -46,4 +46,18 @@ public interface FileStatus {
* @return the corresponding Path to the FileStatus
*/
FsPath getPath();
+
+ /**
+ * Returns the modification time of the file in milliseconds since the epoch.
+ *
+ * <p>The default implementation returns {@link Long#MAX_VALUE}, which is interpreted by
+ * time-based filters (e.g. orphan-files cleanup) as "always fresh" - effectively a fail-closed
+ * default that prevents deletion when modification time is unavailable. File system
+ * implementations that can expose modification time SHOULD override this.
+ *
+ * @return the modification time in epoch millis, or {@link Long#MAX_VALUE} when unavailable
+ */
+ default long getModificationTime() {
+ return Long.MAX_VALUE;
+ }
}
diff --git a/fluss-common/src/main/java/org/apache/fluss/fs/local/LocalFileStatus.java b/fluss-common/src/main/java/org/apache/fluss/fs/local/LocalFileStatus.java
index 09184a9756..b8b04aa63b 100644
--- a/fluss-common/src/main/java/org/apache/fluss/fs/local/LocalFileStatus.java
+++ b/fluss-common/src/main/java/org/apache/fluss/fs/local/LocalFileStatus.java
@@ -67,6 +67,14 @@ public FsPath getPath() {
return this.path;
}
+    @Override
+    public long getModificationTime() {
+        // File#lastModified() reports 0L when the file is missing or an I/O error occurs; map that
+        // to the "unavailable" sentinel so time-based cleanup never treats it as infinitely old.
+        long lastModifiedMillis = this.file.lastModified();
+        return lastModifiedMillis == 0L ? Long.MAX_VALUE : lastModifiedMillis;
+    }
+
public File getFile() {
return this.file;
}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifest.java b/fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifest.java
similarity index 96%
rename from fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifest.java
rename to fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifest.java
index b255b8718d..bc856e361d 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifest.java
+++ b/fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifest.java
@@ -15,12 +15,10 @@
* limitations under the License.
*/
-package org.apache.fluss.server.log.remote;
+package org.apache.fluss.remote;
-import org.apache.fluss.annotation.VisibleForTesting;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
-import org.apache.fluss.remote.RemoteLogSegment;
import java.util.ArrayList;
import java.util.Collections;
@@ -33,7 +31,7 @@
/**
* A remote log manifest is an immutable list of current {@link RemoteLogSegment} which can
- * represent a snapshot of {@link RemoteLogTablet}.
+ * represent a snapshot of a remote log tablet.
*/
public class RemoteLogManifest {
private final PhysicalTablePath physicalTablePath;
@@ -122,7 +120,6 @@ public TableBucket getTableBucket() {
return tableBucket;
}
- @VisibleForTesting
public List getRemoteLogSegmentList() {
return remoteLogSegmentList;
}
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerde.java b/fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifestJsonSerde.java
similarity index 98%
rename from fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerde.java
rename to fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifestJsonSerde.java
index 27c5488490..c90a85ea02 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerde.java
+++ b/fluss-common/src/main/java/org/apache/fluss/remote/RemoteLogManifestJsonSerde.java
@@ -15,11 +15,10 @@
* limitations under the License.
*/
-package org.apache.fluss.server.log.remote;
+package org.apache.fluss.remote;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
-import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.shaded.jackson2.com.fasterxml.jackson.core.JsonGenerator;
import org.apache.fluss.shaded.jackson2.com.fasterxml.jackson.databind.JsonNode;
import org.apache.fluss.utils.json.JsonDeserializer;
diff --git a/fluss-common/src/main/java/org/apache/fluss/utils/FlussPaths.java b/fluss-common/src/main/java/org/apache/fluss/utils/FlussPaths.java
index 9a0659f180..1c75663ba3 100644
--- a/fluss-common/src/main/java/org/apache/fluss/utils/FlussPaths.java
+++ b/fluss-common/src/main/java/org/apache/fluss/utils/FlussPaths.java
@@ -74,7 +74,7 @@ public class FlussPaths {
public static final String REMOTE_LOG_DIR_NAME = "log";
/** The directory name for storing metadata files (e.g., manifest) for a log tablet. */
- private static final String REMOTE_LOG_METADATA_DIR_NAME = "metadata";
+ public static final String REMOTE_LOG_METADATA_DIR_NAME = "metadata";
/** Suffix of a manifest file. */
private static final String REMOTE_LOG_MANIFEST_FILE_SUFFIX = ".manifest";
diff --git a/fluss-common/src/test/java/org/apache/fluss/fs/FileStatusTest.java b/fluss-common/src/test/java/org/apache/fluss/fs/FileStatusTest.java
new file mode 100644
index 0000000000..f491a56b60
--- /dev/null
+++ b/fluss-common/src/test/java/org/apache/fluss/fs/FileStatusTest.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.fs;
+
+import org.junit.jupiter.api.Test;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for default methods of {@link FileStatus}. */
+class FileStatusTest {
+
+ /**
+ * An implementation that does not override {@link FileStatus#getModificationTime()} must
+ * inherit the fail-safe default of {@link Long#MAX_VALUE}, so time-based filters treat the file
+ * as "always fresh" and never delete it when modification time is unavailable.
+ */
+ @Test
+ void defaultModificationTimeIsMaxValueFailSafe() {
+ FileStatus status =
+ new FileStatus() {
+ @Override
+ public long getLen() {
+ return 0L;
+ }
+
+ @Override
+ public boolean isDir() {
+ return false;
+ }
+
+ @Override
+ public FsPath getPath() {
+ return new FsPath("/tmp/x");
+ }
+ };
+
+ assertThat(status.getModificationTime()).isEqualTo(Long.MAX_VALUE);
+ }
+}
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerdeTest.java b/fluss-common/src/test/java/org/apache/fluss/remote/RemoteLogManifestJsonSerdeTest.java
similarity index 97%
rename from fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerdeTest.java
rename to fluss-common/src/test/java/org/apache/fluss/remote/RemoteLogManifestJsonSerdeTest.java
index da4024ffc4..e095132158 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogManifestJsonSerdeTest.java
+++ b/fluss-common/src/test/java/org/apache/fluss/remote/RemoteLogManifestJsonSerdeTest.java
@@ -15,18 +15,17 @@
* limitations under the License.
*/
-package org.apache.fluss.server.log.remote;
+package org.apache.fluss.remote;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
import org.apache.fluss.metadata.TablePath;
-import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.utils.json.JsonSerdeTestBase;
import java.util.Arrays;
import java.util.UUID;
-/** Tests of {@link org.apache.fluss.server.log.remote.RemoteLogManifestJsonSerde}. */
+/** Tests of {@link RemoteLogManifestJsonSerde}. */
class RemoteLogManifestJsonSerdeTest extends JsonSerdeTestBase {
private static final PhysicalTablePath TABLE_PATH1 =
PhysicalTablePath.of(TablePath.of("db", "mytable"));
diff --git a/fluss-filesystems/fluss-fs-hadoop/src/main/java/org/apache/fluss/fs/hdfs/HadoopFileStatus.java b/fluss-filesystems/fluss-fs-hadoop/src/main/java/org/apache/fluss/fs/hdfs/HadoopFileStatus.java
index f54033a693..47c9febcfe 100644
--- a/fluss-filesystems/fluss-fs-hadoop/src/main/java/org/apache/fluss/fs/hdfs/HadoopFileStatus.java
+++ b/fluss-filesystems/fluss-fs-hadoop/src/main/java/org/apache/fluss/fs/hdfs/HadoopFileStatus.java
@@ -52,6 +52,11 @@ public boolean isDir() {
return fileStatus.isDirectory();
}
+ @Override
+ public long getModificationTime() {
+ return fileStatus.getModificationTime();
+ }
+
// ------------------------------------------------------------------------
/**
diff --git a/fluss-flink/fluss-flink-1.18/pom.xml b/fluss-flink/fluss-flink-1.18/pom.xml
index 1636f25569..9f67b6ce9b 100644
--- a/fluss-flink/fluss-flink-1.18/pom.xml
+++ b/fluss-flink/fluss-flink-1.18/pom.xml
@@ -219,6 +219,14 @@
org.apache.fluss:fluss-client
+
+
+ org.apache.fluss:fluss-flink-common
+
+ org/apache/fluss/flink/action/**
+
+
+
@@ -226,4 +234,4 @@
-
\ No newline at end of file
+
diff --git a/fluss-flink/fluss-flink-1.19/pom.xml b/fluss-flink/fluss-flink-1.19/pom.xml
index a9df2c830a..d16e6e46a8 100644
--- a/fluss-flink/fluss-flink-1.19/pom.xml
+++ b/fluss-flink/fluss-flink-1.19/pom.xml
@@ -213,6 +213,11 @@
org.apache.fluss:fluss-client
+
+
+ org.apache.fluss.flink.action.FlussActionEntrypoint
+
+
@@ -220,4 +225,4 @@
-
\ No newline at end of file
+
diff --git a/fluss-flink/fluss-flink-1.19/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory b/fluss-flink/fluss-flink-1.19/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
new file mode 100644
index 0000000000..c30c9dd5ab
--- /dev/null
+++ b/fluss-flink/fluss-flink-1.19/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.fluss.flink.action.orphan.OrphanFilesCleanActionFactory
diff --git a/fluss-flink/fluss-flink-1.19/src/test/java/org/apache/fluss/flink/action/orphan/Flink119OrphanFilesCleanITCase.java b/fluss-flink/fluss-flink-1.19/src/test/java/org/apache/fluss/flink/action/orphan/Flink119OrphanFilesCleanITCase.java
new file mode 100644
index 0000000000..d775605170
--- /dev/null
+++ b/fluss-flink/fluss-flink-1.19/src/test/java/org/apache/fluss/flink/action/orphan/Flink119OrphanFilesCleanITCase.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+/** The IT case for orphan files cleanup in Flink 1.19. */
+class Flink119OrphanFilesCleanITCase extends OrphanFilesCleanITCase {}
diff --git a/fluss-flink/fluss-flink-1.20/pom.xml b/fluss-flink/fluss-flink-1.20/pom.xml
index 25d867b398..ab0915f6e8 100644
--- a/fluss-flink/fluss-flink-1.20/pom.xml
+++ b/fluss-flink/fluss-flink-1.20/pom.xml
@@ -234,6 +234,11 @@
org.apache.fluss:fluss-client
+
+
+ org.apache.fluss.flink.action.FlussActionEntrypoint
+
+
@@ -241,4 +246,4 @@
-
\ No newline at end of file
+
diff --git a/fluss-flink/fluss-flink-1.20/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory b/fluss-flink/fluss-flink-1.20/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
new file mode 100644
index 0000000000..c30c9dd5ab
--- /dev/null
+++ b/fluss-flink/fluss-flink-1.20/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.fluss.flink.action.orphan.OrphanFilesCleanActionFactory
diff --git a/fluss-flink/fluss-flink-1.20/src/test/java/org/apache/fluss/flink/action/orphan/Flink120OrphanFilesCleanITCase.java b/fluss-flink/fluss-flink-1.20/src/test/java/org/apache/fluss/flink/action/orphan/Flink120OrphanFilesCleanITCase.java
new file mode 100644
index 0000000000..0dc35613f9
--- /dev/null
+++ b/fluss-flink/fluss-flink-1.20/src/test/java/org/apache/fluss/flink/action/orphan/Flink120OrphanFilesCleanITCase.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+/** The IT case for orphan files cleanup in Flink 1.20. */
+class Flink120OrphanFilesCleanITCase extends OrphanFilesCleanITCase {}
diff --git a/fluss-flink/fluss-flink-2.2/pom.xml b/fluss-flink/fluss-flink-2.2/pom.xml
index f2ea4cb597..3337797d4c 100644
--- a/fluss-flink/fluss-flink-2.2/pom.xml
+++ b/fluss-flink/fluss-flink-2.2/pom.xml
@@ -258,6 +258,11 @@
org.apache.fluss:fluss-client
+
+
+ org.apache.fluss.flink.action.FlussActionEntrypoint
+
+
diff --git a/fluss-flink/fluss-flink-2.2/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java b/fluss-flink/fluss-flink-2.2/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
index 076dcb86c8..999cd40a8e 100644
--- a/fluss-flink/fluss-flink-2.2/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
+++ b/fluss-flink/fluss-flink-2.2/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
@@ -19,6 +19,9 @@
import org.apache.flink.util.MultipleParameterTool;
+import javax.annotation.Nullable;
+
+import java.util.Collection;
import java.util.Map;
/**
@@ -43,4 +46,23 @@ public static MultipleParameterToolAdapter fromArgs(String[] args) {
public Map toMap() {
return this.multipleParameterTool.toMap();
}
+
+ /** Returns whether the given key is present in the parsed arguments. */
+ public boolean has(String key) {
+ return this.multipleParameterTool.has(key);
+ }
+
+ /** Returns the value for the given key, or {@code null} if the key is not found. */
+ @Nullable
+ public String get(String key) {
+ return this.multipleParameterTool.get(key);
+ }
+
+    /**
+     * Returns all values associated with the given key, or {@code null} if the key is not found.
+     */
+    @Nullable
+    public Collection<String> getMultiParameter(String key) {
+        return this.multipleParameterTool.getMultiParameter(key);
+    }
}
diff --git a/fluss-flink/fluss-flink-2.2/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory b/fluss-flink/fluss-flink-2.2/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
new file mode 100644
index 0000000000..c30c9dd5ab
--- /dev/null
+++ b/fluss-flink/fluss-flink-2.2/src/main/resources/META-INF/services/org.apache.fluss.flink.action.ActionFactory
@@ -0,0 +1,19 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+org.apache.fluss.flink.action.orphan.OrphanFilesCleanActionFactory
diff --git a/fluss-flink/fluss-flink-2.2/src/test/java/org/apache/fluss/flink/action/orphan/Flink22OrphanFilesCleanITCase.java b/fluss-flink/fluss-flink-2.2/src/test/java/org/apache/fluss/flink/action/orphan/Flink22OrphanFilesCleanITCase.java
new file mode 100644
index 0000000000..79f15997cc
--- /dev/null
+++ b/fluss-flink/fluss-flink-2.2/src/test/java/org/apache/fluss/flink/action/orphan/Flink22OrphanFilesCleanITCase.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+/** The IT case for orphan files cleanup in Flink 2.2. */
+class Flink22OrphanFilesCleanITCase extends OrphanFilesCleanITCase {}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/Action.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/Action.java
new file mode 100644
index 0000000000..98af1da48a
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/Action.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action;
+
+import org.apache.fluss.annotation.Internal;
+
+/** Pluggable Flink action invoked from CLI via {@link FlussActionEntrypoint}. */
+@Internal
+public interface Action {
+
+ /** Optional setup hook called once before {@link #run()}. */
+ default void build() throws Exception {}
+
+ /** Execute the action. */
+ void run() throws Exception;
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionFactory.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionFactory.java
new file mode 100644
index 0000000000..d68c07ca8f
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionFactory.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+
+import java.util.Optional;
+
+/** SPI for {@link Action} factories, registered via JDK {@link java.util.ServiceLoader}. */
+@Internal
+public interface ActionFactory {
+
+    /**
+     * Identifier matched against the first CLI argument after lowercasing and replacing {@code -}
+     * with {@code _}.
+     */
+    String identifier();
+
+    /** Construct the action from parsed CLI parameters. Empty when {@code --help} is requested. */
+    Optional<Action> create(MultipleParameterToolAdapter params);
+
+    /** Help text printed when {@code --help} is passed. */
+    default String help() {
+        return "";
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionLoader.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionLoader.java
new file mode 100644
index 0000000000..91599e7510
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/ActionLoader.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+
+import java.util.Arrays;
+import java.util.Optional;
+import java.util.ServiceLoader;
+
+/**
+ * Discovers {@link ActionFactory} implementations via {@link ServiceLoader} and dispatches CLI
+ * arguments to the appropriate {@link Action}.
+ */
+@Internal
+public final class ActionLoader {
+
+    private ActionLoader() {}
+
+    /**
+     * Resolve and create an action from CLI arguments.
+     *
+     * <p>Returns {@link Optional#empty()} when no arguments are provided or when {@code --help} is
+     * requested. Throws {@link IllegalArgumentException} when the requested identifier does not
+     * resolve to a known factory.
+     */
+    public static Optional<Action> createAction(String[] args) {
+        if (args.length < 1 || isHelp(args[0])) {
+            printDefaultHelp();
+            return Optional.empty();
+        }
+        // Locale.ROOT keeps the normalization stable in every default locale (e.g. Turkish,
+        // where "I" would otherwise lower-case to a dotless "ı" and never match an identifier).
+        String name = args[0].toLowerCase(java.util.Locale.ROOT).replace('-', '_');
+        ActionFactory factory =
+                findFactory(name)
+                        .orElseThrow(
+                                () ->
+                                        new IllegalArgumentException(
+                                                "Unknown action: "
+                                                        + args[0]
+                                                        + ". Run with --help for available actions."));
+        String[] remaining = Arrays.copyOfRange(args, 1, args.length);
+        if (hasHelp(remaining)) {
+            System.out.println(factory.help());
+            return Optional.empty();
+        }
+        MultipleParameterToolAdapter params = MultipleParameterToolAdapter.fromArgs(remaining);
+        return factory.create(params);
+    }
+
+    private static boolean isHelp(String arg) {
+        return "--help".equals(arg) || "-h".equals(arg);
+    }
+
+    private static boolean hasHelp(String[] args) {
+        for (String arg : args) {
+            if (isHelp(arg)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Returns the first registered factory whose identifier equals {@code identifier}. */
+    private static Optional<ActionFactory> findFactory(String identifier) {
+        for (ActionFactory f : ServiceLoader.load(ActionFactory.class)) {
+            if (f.identifier().equals(identifier)) {
+                return Optional.of(f);
+            }
+        }
+        return Optional.empty();
+    }
+
+    /** Prints the generic usage line followed by the identifier of every discovered factory. */
+    private static void printDefaultHelp() {
+        System.out.println("Usage: <action> [options]");
+        System.out.println("Available actions:");
+        for (ActionFactory f : ServiceLoader.load(ActionFactory.class)) {
+            System.out.println(" " + f.identifier());
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/FlussActionEntrypoint.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/FlussActionEntrypoint.java
new file mode 100644
index 0000000000..dda7d4cf93
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/FlussActionEntrypoint.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action;
+
+import java.util.Optional;
+
+/** Main entrypoint for Fluss Flink action jars. Delegates to {@link ActionLoader}. */
+public class FlussActionEntrypoint {
+
+    public static void main(String[] args) throws Exception {
+        Optional<Action> action;
+        try {
+            action = ActionLoader.createAction(args);
+        } catch (IllegalArgumentException e) {
+            System.err.println(e.getMessage());
+            System.exit(1);
+            // Not reached after exit; required so 'action' is definitely assigned below.
+            return;
+        }
+        if (action.isPresent()) {
+            action.get().build();
+            action.get().run();
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanCleanUtils.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanCleanUtils.java
new file mode 100644
index 0000000000..24381ab752
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanCleanUtils.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.config.cluster.ConfigEntry;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.metadata.PartitionInfo;
+import org.apache.fluss.metadata.PhysicalTablePath;
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePath;
+
+import javax.annotation.Nullable;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+
+import static org.apache.fluss.utils.Preconditions.checkNotNull;
+
+/** Shared utility methods for the orphan files cleanup action. */
+@Internal
+public final class OrphanCleanUtils {
+
+    private OrphanCleanUtils() {}
+
+    /**
+     * Constructs a {@link PhysicalTablePath} from a table path and an optional partition.
+     *
+     * @param tablePath the logical table path
+     * @param partitionInfo the partition, or {@code null} to get the non-partitioned form
+     * @return the physical path; it carries the partition name when {@code partitionInfo} is set
+     */
+    public static PhysicalTablePath physicalPath(
+            TablePath tablePath, @Nullable PartitionInfo partitionInfo) {
+        if (partitionInfo == null) {
+            return PhysicalTablePath.of(tablePath);
+        }
+        return PhysicalTablePath.of(tablePath, partitionInfo.getPartitionName());
+    }
+
+    /**
+     * Enumerates all {@link TableBucket} instances for a table (or a single partition of that
+     * table).
+     *
+     * @param tableInfo supplies the table id and the bucket count
+     * @param partitionInfo the partition whose id is attached to every bucket, or {@code null} for
+     *     buckets without a partition id
+     * @return one bucket per bucket id in {@code [0, numBuckets)}, in ascending order
+     */
+    public static List<TableBucket> enumerateBuckets(
+            TableInfo tableInfo, @Nullable PartitionInfo partitionInfo) {
+        int n = tableInfo.getNumBuckets();
+        List<TableBucket> buckets = new ArrayList<>(n);
+        long tableId = tableInfo.getTableId();
+        for (int b = 0; b < n; b++) {
+            if (partitionInfo == null) {
+                buckets.add(new TableBucket(tableId, b));
+            } else {
+                buckets.add(new TableBucket(tableId, partitionInfo.getPartitionId(), b));
+            }
+        }
+        return buckets;
+    }
+
+    /**
+     * Resolves the effective remote data directory for a table/partition target using the
+     * three-level fallback: partition-level → table-level → cluster-level. At least one level is
+     * always set because the coordinator assigns a {@code remoteDataDir} to every table at creation
+     * time via {@code RemoteDirSelector.nextDataDir()}.
+     *
+     * @param tableInfo the table whose directory is used when the partition has none
+     * @param partitionInfo the partition, or {@code null} to skip the partition level
+     * @param clusterRemoteDataDir the cluster-level fallback, may be {@code null}
+     * @return the first non-null directory in fallback order
+     * @throws NullPointerException if all three levels are {@code null}
+     */
+    public static String resolveRemoteDataDir(
+            TableInfo tableInfo,
+            @Nullable PartitionInfo partitionInfo,
+            @Nullable String clusterRemoteDataDir) {
+        if (partitionInfo != null && partitionInfo.getRemoteDataDir() != null) {
+            return partitionInfo.getRemoteDataDir();
+        }
+        if (tableInfo.getRemoteDataDir() != null) {
+            return tableInfo.getRemoteDataDir();
+        }
+        return checkNotNull(
+                clusterRemoteDataDir,
+                "No remote data directory resolvable: partition, table, "
+                        + "and cluster levels are all null. This should not happen because the "
+                        + "coordinator requires remote.data.dir or remote.data.dirs at startup.");
+    }
+
+    /**
+     * Resolves the cluster-level {@code remote.data.dir} by querying the coordinator's runtime
+     * configuration. Returns {@code null} when the cluster uses {@code remote.data.dirs}
+     * (multi-directory mode) without the legacy single {@code remote.data.dir}.
+     *
+     * @param admin client used to fetch the cluster configuration
+     * @return the configured value, or {@code null} when the key is absent
+     * @throws Exception if fetching the cluster configuration fails
+     */
+    @Nullable
+    public static String resolveClusterRemoteDataDir(Admin admin) throws Exception {
+        return resolveClusterRemoteDataDir(fetchClusterConfigMap(admin));
+    }
+
+    /**
+     * Extracts the single-root {@code remote.data.dir} from a pre-fetched config map.
+     *
+     * @param configMap cluster configuration as key/value pairs
+     * @return the raw (un-normalized) value, or {@code null} when the key is absent
+     */
+    @Nullable
+    public static String resolveClusterRemoteDataDir(Map<String, String> configMap) {
+        return configMap.get(ConfigOptions.REMOTE_DATA_DIR.key());
+    }
+
+    /**
+     * Resolves all cluster-level remote data directories by querying the coordinator's runtime
+     * configuration. Reads both the single-root {@code remote.data.dir} and the multi-root {@code
+     * remote.data.dirs}, deduplicates by normalized form, and returns the union as the canonical
+     * root list.
+     *
+     * <p>This is the authoritative source for determining what storage roots the cleanup action is
+     * allowed to touch.
+     *
+     * @param admin client used to fetch the cluster configuration
+     * @return list of normalized roots (no trailing slash); never {@code null}, may be empty if the
+     *     cluster has neither config set (which should not happen because the coordinator requires
+     *     at least one remote data dir at startup).
+     * @throws Exception if fetching the cluster configuration fails
+     */
+    public static List<String> resolveClusterRemoteDataDirs(Admin admin) throws Exception {
+        return resolveClusterRemoteDataDirs(fetchClusterConfigMap(admin));
+    }
+
+    /**
+     * Extracts all remote data roots from a pre-fetched config map.
+     *
+     * @param configMap cluster configuration as key/value pairs
+     * @return normalized, de-duplicated roots; the single-root entry (if any) comes first, followed
+     *     by the multi-root entries in configured order
+     */
+    public static List<String> resolveClusterRemoteDataDirs(Map<String, String> configMap) {
+        Configuration conf = Configuration.fromMap(configMap);
+        // LinkedHashSet: de-duplicates while keeping first-seen order.
+        LinkedHashSet<String> roots = new LinkedHashSet<>();
+        String singleDir = conf.get(ConfigOptions.REMOTE_DATA_DIR);
+        if (singleDir != null && !singleDir.isEmpty()) {
+            roots.add(normalizeRoot(singleDir));
+        }
+        List<String> multiDirs = conf.get(ConfigOptions.REMOTE_DATA_DIRS);
+        if (multiDirs != null) {
+            for (String dir : multiDirs) {
+                if (dir != null && !dir.isEmpty()) {
+                    roots.add(normalizeRoot(dir));
+                }
+            }
+        }
+        return new ArrayList<>(roots);
+    }
+
+    /**
+     * Fetches the coordinator's runtime configuration as a key-value map. Use this once and pass
+     * the result to the map-based overloads of {@link #resolveClusterRemoteDataDir(Map)} and {@link
+     * #resolveClusterRemoteDataDirs(Map)} to avoid duplicate RPCs.
+     *
+     * @param admin client used to fetch the cluster configuration
+     * @return a fresh mutable map; entries whose value is {@code null} are omitted
+     * @throws Exception if the describe-cluster-configs call fails
+     */
+    public static Map<String, String> fetchClusterConfigMap(Admin admin) throws Exception {
+        Collection<ConfigEntry> entries = admin.describeClusterConfigs().get();
+        Map<String, String> map = new HashMap<>();
+        for (ConfigEntry entry : entries) {
+            if (entry.value() != null) {
+                map.put(entry.key(), entry.value());
+            }
+        }
+        return map;
+    }
+
+    /**
+     * Constructs a remote sub-directory path, normalizing trailing slashes on the root.
+     *
+     * @param remoteDataDir the root directory, with or without one trailing slash
+     * @param subDir the sub-directory appended after a single {@code /}
+     * @return the path {@code <normalized root>/<subDir>}
+     */
+    public static FsPath remoteSubDir(String remoteDataDir, String subDir) {
+        return new FsPath(normalizeRoot(remoteDataDir) + "/" + subDir);
+    }
+
+    /**
+     * Strips a trailing slash from a remote data directory string.
+     *
+     * <p>At most one trailing {@code /} is removed; any further trailing slashes are kept.
+     *
+     * @param remoteDataDir the directory string to normalize
+     * @return {@code remoteDataDir} without its last character when that character is {@code /},
+     *     otherwise the input unchanged
+     */
+    public static String normalizeRoot(String remoteDataDir) {
+        return remoteDataDir.endsWith("/")
+                ? remoteDataDir.substring(0, remoteDataDir.length() - 1)
+                : remoteDataDir;
+    }
+
+    /**
+     * Formats a bucket-scope key for audit/logging purposes.
+     *
+     * @param tableId the table id
+     * @param partitionId the partition id; a {@code null} value is rendered as the literal text
+     *     {@code null}
+     * @param bucketId the bucket id
+     * @return {@code tableId:partitionId:bucketId}
+     */
+    public static String bucketScopeKey(long tableId, @Nullable Long partitionId, int bucketId) {
+        return tableId + ":" + partitionId + ":" + bucketId;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanAction.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanAction.java
new file mode 100644
index 0000000000..1f12090783
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanAction.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.Action;
+import org.apache.fluss.flink.action.orphan.config.OrphanCleanConfig;
+import org.apache.fluss.flink.action.orphan.job.CleanStats;
+import org.apache.fluss.flink.action.orphan.job.OrphanFilesCleanJob;
+
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Orphan files cleanup action. Delegates to a distributed Flink Batch job ({@link
+ * OrphanFilesCleanJob}) that is described as a 3-stage DAG:
+ *
+ * <ol>
+ *   <li>ScopeEnumerator (p=1): coordinator RPCs to enumerate scope and emit per-bucket work items.
+ *   <li>ScanAndClean (p=N): parallel FS scan + rate-limited delete.
+ *   <li>StatsAggregate (p=1): merge per-task stats into final summary.
+ * </ol>
+ */
+@Internal
+public class OrphanFilesCleanAction implements Action {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OrphanFilesCleanAction.class);
+
+    private final OrphanCleanConfig config;
+
+    /**
+     * Creates the action.
+     *
+     * @param config the parsed cleanup configuration used for the job and the final log line
+     */
+    public OrphanFilesCleanAction(OrphanCleanConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Executes the cleanup job on the current Flink execution environment and logs one summary
+     * line with the returned counters.
+     *
+     * @throws Exception if the job execution fails
+     */
+    @Override
+    public void run() throws Exception {
+        CleanStats result =
+                OrphanFilesCleanJob.execute(
+                        StreamExecutionEnvironment.getExecutionEnvironment(),
+                        config,
+                        // No configured parallelism is passed through as null.
+                        config.parallelism().orElse(null));
+        LOG.info(
+                "remove_orphan_files done: scope={} scanned={} deletedTotal={}"
+                        + " emptyDirsRemoved={} failures={} bytesReclaimed={} dryRun={}",
+                scopeDescription(),
+                result.scanned(),
+                result.deleted(),
+                result.emptyDirsRemoved(),
+                result.deleteFailures(),
+                result.bytesReclaimed(),
+                config.dryRun());
+    }
+
+    /**
+     * Builds the human-readable scope label for the summary log line: {@code all-databases} or the
+     * database name ({@code unknown} when none is set), followed by {@code .<table>} when a table
+     * is configured.
+     */
+    private String scopeDescription() {
+        final String base;
+        if (config.allDatabases()) {
+            base = "all-databases";
+        } else {
+            base = config.database().orElse("unknown");
+        }
+        return config.table().map(table -> base + "." + table).orElse(base);
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanActionFactory.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanActionFactory.java
new file mode 100644
index 0000000000..ef6dc7bdc6
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanActionFactory.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.Action;
+import org.apache.fluss.flink.action.ActionFactory;
+import org.apache.fluss.flink.action.orphan.config.OrphanCleanConfig;
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+
+import java.util.Optional;
+
+/** Factory for the shell-mode orphan files cleanup action. */
+@Internal
+public class OrphanFilesCleanActionFactory implements ActionFactory {
+
+    /** Returns the action name used to select this factory: {@code remove_orphan_files}. */
+    @Override
+    public String identifier() {
+        return "remove_orphan_files";
+    }
+
+    /**
+     * Creates the cleanup action from the parsed command-line parameters.
+     *
+     * @param params the command-line parameters, converted via {@link
+     *     OrphanCleanConfig#fromParams}
+     * @return an always-present {@link OrphanFilesCleanAction}
+     */
+    @Override
+    public Optional<Action> create(MultipleParameterToolAdapter params) {
+        return Optional.of(
+                new OrphanFilesCleanAction(OrphanCleanConfig.fromParams(params)));
+    }
+
+    /**
+     * Returns the usage text for this action, including option placeholders, notes on the {@code
+     * --older-than} cutoff semantics, and an example invocation.
+     */
+    @Override
+    public String help() {
+        return "Usage: remove_orphan_files --bootstrap-server <host:port>\n"
+                + " (--database <database> [--table <table>] | --all-databases)\n"
+                + " [--older-than '<iso-8601-timestamp>']\n"
+                + " [--remote-fs-op-rate-limit-per-second 100]\n"
+                + " [--dry-run]\n"
+                + " [--allow-delete-manifest]\n"
+                + " [--allow-clean-orphan-tables]\n"
+                + " [--allow-clean-orphan-partitions]\n"
+                + " [--conf <key>=<value>]...\n"
+                + "\n"
+                + "Notes:\n"
+                + " --older-than is an absolute wall-clock cutoff in ISO-8601 with explicit\n"
+                + " offset (e.g. '2024-01-01T00:00:00+08:00' or '2024-01-01T00:00:00Z').\n"
+                + " Files with mtime strictly less than the cutoff are deletion-eligible.\n"
+                + " Default: now - 3d, computed once at startup. The cutoff is frozen for the\n"
+                + " run, so a long scan cannot accidentally pull in files written after the\n"
+                + " action started. The cutoff must be at least 1d before now (closer cutoffs\n"
+                + " would race with mid-write files).\n"
+                + " Orphan directory detection (table/partition) relies solely on ID guards\n"
+                + " (maxKnownTableId / maxKnownPartitionId), not mtime.\n"
+                + " --table also disables the orphan-table scan (no sibling orphan-table scan in\n"
+                + " the db).\n"
+                + " --conf passes filesystem configuration for remote storage authentication.\n"
+                + " Keys use the same format as server.yaml (e.g. fs.oss.accessKeyId,\n"
+                + " fs.oss.accessKeySecret, fs.oss.endpoint, fs.oss.region). Repeatable.\n"
+                + "\n"
+                + "Examples:\n"
+                + " remove_orphan_files --bootstrap-server host:9123 --all-databases\n"
+                + " --conf fs.oss.accessKeyId=XXXX --conf fs.oss.accessKeySecret=YYYY\n"
+                + " --conf fs.oss.endpoint=oss-cn-hangzhou-internal.aliyuncs.com\n"
+                + " --conf fs.oss.region=cn-hangzhou";
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifier.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifier.java
new file mode 100644
index 0000000000..8f0994213f
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifier.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.exception.FlussRuntimeException;
+import org.apache.fluss.exception.PartitionNotExistException;
+import org.apache.fluss.exception.TableNotExistException;
+
+import java.io.IOException;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeoutException;
+
+/**
+ * Classifies RPC exceptions raised during scope enumeration and per-target active-set fetch into a
+ * small, audit-stable vocabulary. The category name is what surfaces as the {@code reason=} field
+ * of {@code skip_log_target} / {@code skip_kv_target} audit events, so operators triage by exact
+ * string and the enum must not be widened lightly.
+ *
+ * <ul>
+ *   <li>{@link Category#NOT_FOUND} — legitimate "object does not exist"; the enumerator treats it
+ *       as the target having disappeared concurrently and silently skips it without alarm.
+ *   <li>{@link Category#TRANSIENT} — an {@link IOException} or {@link TimeoutException} (intended
+ *       to cover IO / timeout / ZK connection loss); the target is skipped this round and
+ *       naturally retried in the next cleanup round.
+ *   <li>{@link Category#SERVER_ERROR} — server-side failure; same skip, but audited at higher
+ *       severity so an operator can investigate.
+ *   <li>{@link Category#UNKNOWN} — anything not matched above; conservatively skipped + audited.
+ * </ul>
+ */
+@Internal
+public final class RpcErrorClassifier {
+
+    private RpcErrorClassifier() {}
+
+    /** Categories of RPC errors. */
+    public enum Category {
+        NOT_FOUND,
+        TRANSIENT,
+        SERVER_ERROR,
+        UNKNOWN
+    }
+
+    /**
+     * Maps a thrown exception to its {@link Category}. Leading {@link CompletionException} /
+     * {@link ExecutionException} wrappers are peeled off first; the checks then run in the order
+     * not-found, transient, server error, and fall back to {@link Category#UNKNOWN}.
+     *
+     * @param t the exception to classify
+     * @return the first matching category
+     */
+    public static Category classify(Throwable t) {
+        final Throwable root = unwrap(t);
+
+        final boolean missingObject =
+                root instanceof TableNotExistException
+                        || root instanceof PartitionNotExistException;
+        if (missingObject) {
+            return Category.NOT_FOUND;
+        }
+
+        final boolean ioOrTimeout =
+                root instanceof IOException || root instanceof TimeoutException;
+        if (ioOrTimeout) {
+            return Category.TRANSIENT;
+        }
+
+        return root instanceof FlussRuntimeException ? Category.SERVER_ERROR : Category.UNKNOWN;
+    }
+
+    /**
+     * Follows the cause chain while the current throwable is an async wrapper that has a cause.
+     * A wrapper without a cause is returned as-is.
+     */
+    private static Throwable unwrap(Throwable t) {
+        Throwable current = t;
+        while (isAsyncWrapper(current) && current.getCause() != null) {
+            current = current.getCause();
+        }
+        return current;
+    }
+
+    /** Tells whether {@code t} is one of the future-related wrapper exceptions. */
+    private static boolean isAsyncWrapper(Throwable t) {
+        return t instanceof CompletionException || t instanceof ExecutionException;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/audit/AuditLogger.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/audit/AuditLogger.java
new file mode 100644
index 0000000000..26adf5f00e
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/audit/AuditLogger.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.audit;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.rule.RuleId;
+import org.apache.fluss.fs.FsPath;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.time.Instant;
+import java.time.ZoneId;
+import java.time.format.DateTimeFormatter;
+
+/**
+ * Structured audit log writer for the orphan files cleanup action.
+ *
+ * <p>The dedicated logger name {@code fluss.orphan.audit} can be routed to a separate sink (e.g.
+ * SLS) by deployment-specific log4j configuration.
+ *
+ * <p>Every event is a single line of {@code key=value} pairs that starts with {@code action=...}
+ * and ends with {@code ts=<Instant.now()>}.
+ */
+@Internal
+public final class AuditLogger {
+
+    private static final Logger AUDIT = LoggerFactory.getLogger("fluss.orphan.audit");
+
+    /**
+     * Formats cutoff epoch-ms as ISO-8601 with an explicit offset, rendered in the JVM's default
+     * zone (e.g. {@code 2024-01-01T00:00:00+08:00}). This is the same grammar the {@code
+     * --older-than} option accepts, and unlike a {@code yyyy-MM-dd HH:mm:ss} rendering it is
+     * unambiguous across zones and contains no space that would split the {@code key=value} pair.
+     */
+    private static final DateTimeFormatter CUTOFF_FORMATTER =
+            DateTimeFormatter.ISO_OFFSET_DATE_TIME.withZone(ZoneId.systemDefault());
+
+    /**
+     * One-shot startup event recording the frozen file cutoff that drives this run's deletion
+     * decisions. Emitted before any other audit line so log readers can recover the exact threshold
+     * without having to re-parse the original CLI arguments.
+     *
+     * @param olderThanMillis the cutoff in epoch milliseconds; logged both formatted and raw
+     */
+    public void logCutoff(long olderThanMillis) {
+        AUDIT.info(
+                "action=cutoff older_than_iso={} older_than_ms={} ts={}",
+                CUTOFF_FORMATTER.format(Instant.ofEpochMilli(olderThanMillis)),
+                olderThanMillis,
+                Instant.now());
+    }
+
+    /**
+     * Records a file delete attempt.
+     *
+     * @param path the file the delete was attempted on
+     * @param ruleId the rule that selected the file
+     * @param ok the outcome flag, logged as given
+     */
+    public void logDeleted(FsPath path, RuleId ruleId, boolean ok) {
+        AUDIT.info("action=deleted rule={} path={} ok={} ts={}", ruleId, path, ok, Instant.now());
+    }
+
+    /** Records a file that the given rule selected, logged as {@code would_delete}. */
+    public void logWouldDelete(FsPath path, RuleId ruleId) {
+        AUDIT.info("action=would_delete rule={} path={} ts={}", ruleId, path, Instant.now());
+    }
+
+    /** Records a directory deletion. */
+    public void logDirDeleted(FsPath dir) {
+        AUDIT.info("action=dir_deleted path={} ts={}", dir, Instant.now());
+    }
+
+    /** Records a directory logged as {@code would_delete_dir}. */
+    public void logWouldDeleteDir(FsPath dir) {
+        AUDIT.info("action=would_delete_dir path={} ts={}", dir, Instant.now());
+    }
+
+    /** Records, at WARN level, a path that was skipped as unknown under the given rule. */
+    public void logSkipUnknown(FsPath path, RuleId ruleId) {
+        AUDIT.warn("action=skip_unknown rule={} path={} ts={}", ruleId, path, Instant.now());
+    }
+
+    /**
+     * Records, at ERROR level, that processing of a bucket was aborted.
+     *
+     * @param bucketStr textual identification of the bucket
+     * @param reason why the bucket was aborted
+     */
+    public void logBucketAborted(String bucketStr, String reason) {
+        AUDIT.error(
+                "action=bucket_aborted bucket={} reason={} ts={}",
+                bucketStr,
+                reason,
+                Instant.now());
+    }
+
+    /** Skip an entire database during scope enumeration due to listTables failure. */
+    public void logSkipDb(String dbName, String reason) {
+        AUDIT.warn("action=skip_db reason={} db={} ts={}", reason, dbName, Instant.now());
+    }
+
+    /** Skip a single table during scope enumeration due to getTableInfo or RPC failure. */
+    public void logSkipTable(String dbName, String tableName, String reason) {
+        AUDIT.warn(
+                "action=skip_table reason={} db={} table={} ts={}",
+                reason,
+                dbName,
+                tableName,
+                Instant.now());
+    }
+
+    /**
+     * Skip listPartitionInfos for a table due to RPC failure (both active-partition cleanup and
+     * orphan-partition scan are suppressed for this table).
+     */
+    public void logSkipPartitionList(String dbName, String tableName, String reason) {
+        AUDIT.warn(
+                "action=skip_partition_list reason={} db={} table={} ts={}",
+                reason,
+                dbName,
+                tableName,
+                Instant.now());
+    }
+
+    /**
+     * Skip KV cleanup for one (tableId, partitionId) target — emitted when {@code ListKvSnapshots}
+     * fails after retries. {@code partitionId} is null for non-partitioned tables.
+     */
+    public void logSkipKvTarget(long tableId, Long partitionId, String reason) {
+        AUDIT.warn(
+                "action=skip_kv_target reason={} table_id={} partition_id={} ts={}",
+                reason,
+                tableId,
+                partitionId,
+                Instant.now());
+    }
+
+    /**
+     * Skip KV cleanup for a single bucket whose {@code ListKvSnapshots} response carried no
+     * active-snapshot entries. Empty per-bucket active set is treated as "cannot prove what is
+     * active" and the bucket is skipped to avoid mis-deletion.
+     */
+    public void logSkipKvBucket(long tableId, Long partitionId, int bucketId, String reason) {
+        AUDIT.warn(
+                "action=skip_kv_bucket reason={} table_id={} partition_id={} bucket_id={} ts={}",
+                reason,
+                tableId,
+                partitionId,
+                bucketId,
+                Instant.now());
+    }
+
+    /**
+     * Skip log cleanup for one (tableId, partitionId) target — emitted when {@code
+     * ListRemoteLogManifests} fails after retries. {@code partitionId} is null for non-partitioned
+     * tables.
+     */
+    public void logSkipLogTarget(long tableId, Long partitionId, String reason) {
+        AUDIT.warn(
+                "action=skip_log_target reason={} table_id={} partition_id={} ts={}",
+                reason,
+                tableId,
+                partitionId,
+                Instant.now());
+    }
+
+    /**
+     * Skip log cleanup for a single bucket whose remote manifest was not returned by the {@code
+     * ListRemoteLogManifests} RPC (the bucket has not yet committed any remote manifest).
+     */
+    public void logSkipLogBucket(long tableId, Long partitionId, int bucketId, String reason) {
+        AUDIT.warn(
+                "action=skip_log_bucket reason={} table_id={} partition_id={} bucket_id={} ts={}",
+                reason,
+                tableId,
+                partitionId,
+                bucketId,
+                Instant.now());
+    }
+
+    /** Default-conservative skip of an orphan-table dir (opt-in flag not set). */
+    public void logSkipOrphanTable(FsPath dir, String reason) {
+        AUDIT.info("action=skip_orphan_table reason={} path={} ts={}", reason, dir, Instant.now());
+    }
+
+    /**
+     * Skip the orphan-table scan for a database whose table-info set is incomplete (e.g. {@code
+     * --table} single-table mode, or {@code listTables}/{@code getTableInfo} failures left holes in
+     * the active table id set). Distinct from {@link #logSkipDb}, which means the whole database
+     * scope is dropped.
+     */
+    public void logSkipOrphanTableScan(String dbName, String reason) {
+        AUDIT.warn(
+                "action=skip_orphan_table_scan reason={} db={} ts={}",
+                reason,
+                dbName,
+                Instant.now());
+    }
+
+    /** Default-conservative skip of an orphan-partition dir (opt-in flag not set). */
+    public void logSkipOrphanPartition(FsPath dir, String reason) {
+        AUDIT.info(
+                "action=skip_orphan_partition reason={} path={} ts={}", reason, dir, Instant.now());
+    }
+
+    /** Skip a bucket target because its metadata-resolved root is outside cluster config. */
+    public void logSkipBucketOutOfScope(long tableId, Long partitionId, String resolvedRoot) {
+        AUDIT.info(
+                "action=skip_bucket_target reason=out-of-scope-root table_id={} partition_id={}"
+                        + " resolved_root={} ts={}",
+                tableId,
+                partitionId,
+                resolvedRoot,
+                Instant.now());
+    }
+
+    /**
+     * Final summary event emitted once at the end of a run, carrying the headline counters that
+     * operators query most often ("how many files were removed and how much space was reclaimed").
+     * Routed through the dedicated audit logger so the result is queryable from the same sink as
+     * the per-file {@code action=deleted} / {@code action=skip_*} lines.
+     *
+     * <p>The {@code deleted_total} field is the sum {@code deletedFiles + emptyDirsRemoved}; the
+     * two addends are also logged individually.
+     */
+    public void logSummary(
+            long scanned,
+            long deletedFiles,
+            long emptyDirsRemoved,
+            long deleteFailures,
+            long bytesReclaimed,
+            boolean dryRun) {
+        AUDIT.info(
+                "action=summary scanned={} deleted_total={} deleted_files={} empty_dirs_removed={}"
+                        + " delete_failures={} bytes_reclaimed={} dry_run={} ts={}",
+                scanned,
+                deletedFiles + emptyDirsRemoved,
+                deletedFiles,
+                emptyDirsRemoved,
+                deleteFailures,
+                bytesReclaimed,
+                dryRun,
+                Instant.now());
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcher.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcher.java
new file mode 100644
index 0000000000..223c6b97c4
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcher.java
@@ -0,0 +1,359 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.annotation.VisibleForTesting;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.client.metadata.ActiveKvSnapshots;
+import org.apache.fluss.client.metadata.RemoteLogManifestInfo;
+import org.apache.fluss.flink.action.orphan.RpcErrorClassifier;
+import org.apache.fluss.flink.action.orphan.rule.BucketActiveRefs;
+import org.apache.fluss.fs.FSDataInputStream;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.remote.RemoteLogManifest;
+import org.apache.fluss.remote.RemoteLogSegment;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluss.utils.FlussPaths;
+import org.apache.fluss.utils.IOUtils;
+import org.apache.fluss.utils.RetryUtils;
+
+import javax.annotation.Nullable;
+
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+
+import static org.apache.fluss.utils.Preconditions.checkArgument;
+
+/**
+ * Builds the active reference set for a single {@code (tableId, partitionId|null)} target, sourced
+ * from coordinator metadata via RPC (not from filesystem listing).
+ *
+ * <p>Log path: discovers each bucket's current remote log manifest path via {@code
+ * LIST_REMOTE_LOG_MANIFESTS}, then second-reads the manifest file from object storage. The
+ * per-target RPC is retried with exponential backoff via {@link RetryUtils}; per-bucket
+ * second-reads make a single attempt — a {@link FileNotFoundException} (manifest upserted between
+ * RPC and read) or any other IO failure immediately marks the bucket as {@link
+ * LogActiveRefsFetchResult.ManifestReadStatus#READ_FAILED} and recovery is left to the next cleanup
+ * round, avoiding {@code N × retries × IO} blow-up on cluster-wide turbulence.
+ *
+ * <p>KV path: {@code LIST_KV_SNAPSHOTS} returns snapshot ids directly (no second-read), so the
+ * per-target RPC retry alone is sufficient, symmetric with the log path.
+ */
+@Internal
+public final class ActiveRefsFetcher {
+
+ /**
+ * Retry backoff base used by {@link RetryUtils} for per-target RPCs. With the default 3 retries
+ * and exponential backoff (200 → 400 → cap) this caps total retry delay at ~600ms — negligible
+ * vs the smoothing it gives over server jitter.
+ */
+ private static final long DEFAULT_BACKOFF_MILLIS = 200L;
+
+ /**
+ * Value handed to {@link RetryUtils} right after the base backoff for every per-target RPC.
+ * NOTE(review): presumably the ceiling of the exponentially growing delay, in milliseconds —
+ * confirm against {@code RetryUtils.executeWithRetry}.
+ */
+ private static final long MAX_BACKOFF_MILLIS = 2000L;
+
+    /**
+     * Production reader: opens {@code path} on the file system it belongs to and copies the
+     * stream into an in-memory buffer, returning the buffered bytes.
+     */
+    private static final MetadataReader DEFAULT_METADATA_READER =
+            path -> {
+                ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+                try (FSDataInputStream in = path.getFileSystem().open(path)) {
+                    IOUtils.copyBytes(in, buffer);
+                }
+                return buffer.toByteArray();
+            };
+
+ /** Facade over the two list RPCs ({@code listRemoteLogManifests}, {@code listKvSnapshots}). */
+ private final AdminFacade admin;
+ /** Reads the raw bytes of a manifest file; called once per manifest path. */
+ private final MetadataReader metadataReader;
+ /** Forwarded to {@link RetryUtils} for each per-target RPC; validated to be at least 1. */
+ private final int maxRetries;
+ /** Base backoff in milliseconds forwarded to {@link RetryUtils}; validated to be non-negative. */
+ private final long backoffMillis;
+ /** Acquired once before every manifest read. */
+ private final RateLimiter remoteFsOpRateLimiter;
+
+ /**
+ * Creates a fetcher backed by a real {@link Admin} client. The client is wrapped into an {@link
+ * AdminFacade}; the default file-system manifest reader and the default backoff base are used.
+ *
+ * @param admin client whose list RPCs are invoked
+ * @param maxRetries forwarded to the retry helper; must be at least 1
+ * @param remoteFsOpRateLimiter limiter acquired before each manifest read
+ */
+ public ActiveRefsFetcher(Admin admin, int maxRetries, RateLimiter remoteFsOpRateLimiter) {
+ this(
+ wrap(admin),
+ DEFAULT_METADATA_READER,
+ maxRetries,
+ DEFAULT_BACKOFF_MILLIS,
+ remoteFsOpRateLimiter);
+ }
+
+ /** Test constructor: defaults backoff to 0 so unit tests don't pay retry sleep. */
+ @VisibleForTesting
+ ActiveRefsFetcher(AdminFacade admin, MetadataReader metadataReader, int maxRetries) {
+ this(admin, metadataReader, maxRetries, 0L);
+ }
+
+ /**
+ * Test constructor with an explicit backoff. Supplies a rate limiter created with {@code
+ * RateLimiter.create(1000.0)}.
+ */
+ @VisibleForTesting
+ ActiveRefsFetcher(
+ AdminFacade admin, MetadataReader metadataReader, int maxRetries, long backoffMillis) {
+ this(admin, metadataReader, maxRetries, backoffMillis, RateLimiter.create(1000.0));
+ }
+
+ /**
+ * Canonical constructor that every other constructor delegates to. Validates the retry settings
+ * and stores all collaborators as given.
+ *
+ * @param admin facade over the list RPCs
+ * @param metadataReader reader used to load manifest bytes
+ * @param maxRetries must be at least 1
+ * @param backoffMillis base backoff in milliseconds; must not be negative
+ * @param remoteFsOpRateLimiter limiter acquired before each manifest read
+ */
+ @VisibleForTesting
+ ActiveRefsFetcher(
+ AdminFacade admin,
+ MetadataReader metadataReader,
+ int maxRetries,
+ long backoffMillis,
+ RateLimiter remoteFsOpRateLimiter) {
+ // Reject invalid retry settings before any field is assigned.
+ checkArgument(maxRetries >= 1, "maxRetries must be >= 1, got %s", maxRetries);
+ checkArgument(backoffMillis >= 0L, "backoffMillis must be >= 0, got %s", backoffMillis);
+ this.admin = admin;
+ this.metadataReader = metadataReader;
+ this.maxRetries = maxRetries;
+ this.backoffMillis = backoffMillis;
+ this.remoteFsOpRateLimiter = remoteFsOpRateLimiter;
+ }
+
+    /**
+     * Adapts a real {@link Admin} client to the narrow {@link AdminFacade} this class depends on.
+     * Each facade method forwards to the identically named {@link Admin} method.
+     *
+     * @param admin the client to delegate to
+     * @return a facade that forwards both list RPCs to {@code admin}
+     */
+    private static AdminFacade wrap(Admin admin) {
+        return new AdminFacade() {
+            @Override
+            public CompletableFuture<List<RemoteLogManifestInfo>> listRemoteLogManifests(
+                    long tableId, @Nullable Long partitionId) {
+                return admin.listRemoteLogManifests(tableId, partitionId);
+            }
+
+            @Override
+            public CompletableFuture<ActiveKvSnapshots> listKvSnapshots(
+                    long tableId, @Nullable Long partitionId) {
+                return admin.listKvSnapshots(tableId, partitionId);
+            }
+        };
+    }
+
+    /**
+     * Fetches per-bucket log active refs for a single {@code (tableId, partitionId|null)} target.
+     * Each bucket whose remote manifest is returned by the RPC is second-read in a single attempt;
+     * a {@link FileNotFoundException} or any other IO failure marks the bucket as {@link
+     * LogActiveRefsFetchResult.ManifestReadStatus#READ_FAILED} without affecting siblings.
+     * Per-target RPC failure (after retries) is reported via {@link
+     * LogActiveRefsFetchResult#listOk()}.
+     *
+     * @param tableId id of the table whose remote log manifests are listed
+     * @param partitionId id of the partition, or {@code null}; passed through to the RPC
+     * @return a list-failed result when the RPC fails, otherwise the per-bucket outcome
+     */
+    public LogActiveRefsFetchResult fetchLogActiveRefsByBucket(
+            long tableId, @Nullable Long partitionId) {
+        List<RemoteLogManifestInfo> manifests;
+        try {
+            manifests =
+                    RetryUtils.executeWithRetry(
+                            () -> admin.listRemoteLogManifests(tableId, partitionId).get(),
+                            "listRemoteLogManifests",
+                            maxRetries,
+                            backoffMillis,
+                            MAX_BACKOFF_MILLIS,
+                            e ->
+                                    RpcErrorClassifier.classify(e)
+                                            != RpcErrorClassifier.Category.NOT_FOUND);
+        } catch (IOException e) {
+            // Prefer the underlying RPC error; fall back to the wrapper itself when it carries
+            // no cause so the failure reason never loses its detail.
+            Throwable detail = e.getCause() != null ? e.getCause() : e;
+            return LogActiveRefsFetchResult.listFailed(
+                    formatRpcFailureReason(tableId, partitionId, detail));
+        }
+
+        // Group the returned manifest entries by bucket id.
+        Map<Integer, List<RemoteLogManifestInfo>> entriesByBucket = new HashMap<>();
+        for (RemoteLogManifestInfo entry : manifests) {
+            int bucketId = entry.getTableBucket().getBucket();
+            entriesByBucket.computeIfAbsent(bucketId, id -> new ArrayList<>()).add(entry);
+        }
+
+        // Resolve each bucket independently: a failure is recorded for that bucket only.
+        Map<Integer, BucketActiveRefs> resolved = new HashMap<>();
+        Map<Integer, String> readFailures = new HashMap<>();
+        for (Map.Entry<Integer, List<RemoteLogManifestInfo>> bucketEntries :
+                entriesByBucket.entrySet()) {
+            int bucketId = bucketEntries.getKey();
+            try {
+                resolved.put(bucketId, buildBucketActiveRefs(bucketEntries.getValue()));
+            } catch (FileNotFoundException e) {
+                readFailures.put(
+                        bucketId,
+                        formatBucketReadFailureReason(
+                                "Manifest not found (likely upserted concurrently)",
+                                tableId,
+                                partitionId,
+                                bucketId,
+                                e));
+            } catch (ManifestParseException e) {
+                // Manifest payload is unreadable or violates the shared manifest serde schema.
+                // Distinct reason so operators triage separately from transient FS hiccups.
+                readFailures.put(
+                        bucketId,
+                        formatBucketReadFailureReason(
+                                "Manifest parse failure (corrupt or unexpected schema)",
+                                tableId,
+                                partitionId,
+                                bucketId,
+                                e));
+            } catch (IOException e) {
+                readFailures.put(
+                        bucketId,
+                        formatBucketReadFailureReason(
+                                "IO error reading manifest", tableId, partitionId, bucketId, e));
+            }
+        }
+        return LogActiveRefsFetchResult.ofPerBucket(resolved, readFailures);
+    }
+
+ /**
+ * Fetches the per-bucket active snapshot directories ({@code snap-{id}} names) for one {@code
+ * (tableId, partitionId|null)} target. The set per bucket is the union of RETAINED and
+ * STILL_IN_USE entries returned by {@link Admin#listKvSnapshots(long, Long)}. Per-target RPC
+ * failure (after retries) is reported via {@link KvActiveRefsFetchResult#listOk()}, symmetric
+ * with the log path.
+ */
+ public KvActiveRefsFetchResult fetchKvActiveSnapDirs(long tableId, @Nullable Long partitionId) {
+ ActiveKvSnapshots activeKvSnapshots;
+ try {
+ activeKvSnapshots =
+ RetryUtils.executeWithRetry(
+ () -> admin.listKvSnapshots(tableId, partitionId).get(),
+ "listKvSnapshots",
+ maxRetries,
+ backoffMillis,
+ MAX_BACKOFF_MILLIS,
+ e ->
+ RpcErrorClassifier.classify(e)
+ != RpcErrorClassifier.Category.NOT_FOUND);
+ } catch (IOException e) {
+ return KvActiveRefsFetchResult.listFailed(
+ formatRpcFailureReason(tableId, partitionId, e.getCause()));
+ }
+ Map> dirsByBucket = new HashMap<>();
+ for (Map.Entry> entry :
+ activeKvSnapshots.getSnapshotIdsByBucket().entrySet()) {
+ int bucketId = entry.getKey();
+ Set dirNames = new HashSet<>();
+ for (Long snapshotId : entry.getValue()) {
+ dirNames.add(FlussPaths.REMOTE_KV_SNAPSHOT_DIR_PREFIX + snapshotId);
+ }
+ dirsByBucket.put(bucketId, dirNames);
+ }
+ return KvActiveRefsFetchResult.ok(dirsByBucket);
+ }
+
+    /**
+     * Builds the reason string reported when a per-target list RPC fails.
+     *
+     * @param tableId id of the target table
+     * @param partitionId id of the target partition; {@code null} is rendered as {@code null}
+     * @param cause the failure to describe, or {@code null} when none is available
+     * @return {@code "RPC failure for tableId=.. partitionId=.."}, followed by {@code ": "} and
+     *     the cause's message (or its class name when it has no message) if a cause is given
+     */
+    private static String formatRpcFailureReason(
+            long tableId, @Nullable Long partitionId, @Nullable Throwable cause) {
+        String reason =
+                String.format("RPC failure for tableId=%s partitionId=%s", tableId, partitionId);
+        if (cause == null) {
+            return reason;
+        }
+        // Exceptions such as TimeoutException commonly carry no message; name the type instead of
+        // dropping the cause from the reason altogether.
+        String detail = cause.getMessage() != null ? cause.getMessage() : cause.getClass().getName();
+        return reason + ": " + detail;
+    }
+
+    /**
+     * Builds the reason string recorded for a bucket whose manifest could not be read or parsed.
+     *
+     * @param prefix short description of the failure category
+     * @param tableId id of the target table
+     * @param partitionId id of the target partition; {@code null} is rendered as {@code null}
+     * @param bucketId id of the failing bucket
+     * @param cause the failure to describe; tolerated when {@code null}
+     * @return {@code "<prefix> for tableId=.. partitionId=.. bucketId=.."}, followed by {@code
+     *     ": "} and the cause's message (or its class name when it has no message) if a cause is
+     *     given
+     */
+    private static String formatBucketReadFailureReason(
+            String prefix,
+            long tableId,
+            @Nullable Long partitionId,
+            int bucketId,
+            Throwable cause) {
+        String reason =
+                String.format(
+                        "%s for tableId=%s partitionId=%s bucketId=%s",
+                        prefix, tableId, partitionId, bucketId);
+        if (cause == null) {
+            return reason;
+        }
+        // A message-less exception still identifies itself through its type; keep that detail
+        // instead of silently dropping the cause.
+        String detail = cause.getMessage() != null ? cause.getMessage() : cause.getClass().getName();
+        return reason + ": " + detail;
+    }
+
+    /**
+     * Reads every manifest listed for one bucket and collects the bucket's active references.
+     * The rate limiter is acquired once before each manifest read. The first failing read aborts
+     * the whole bucket by propagating the exception.
+     *
+     * @param entries manifest entries returned by the RPC for a single bucket
+     * @return the segment-relative paths parsed from all manifests, an empty set as second
+     *     component, and the manifest paths themselves
+     * @throws IOException if a manifest cannot be read or parsed
+     */
+    private BucketActiveRefs buildBucketActiveRefs(List<RemoteLogManifestInfo> entries)
+            throws IOException {
+        Set<String> manifestPaths = new HashSet<>();
+        Set<String> segmentRelpaths = new HashSet<>();
+        for (RemoteLogManifestInfo entry : entries) {
+            String path = entry.getRemoteLogManifestPath();
+            manifestPaths.add(path);
+            remoteFsOpRateLimiter.acquire();
+            byte[] manifestBytes = metadataReader.read(new FsPath(path));
+            segmentRelpaths.addAll(parseLogSegmentRelativePaths(manifestBytes));
+        }
+        return new BucketActiveRefs(segmentRelpaths, Collections.emptySet(), manifestPaths);
+    }
+
+    /**
+     * Parses a manifest payload and derives the relative file names of every segment it lists.
+     * Each segment contributes four names under its segment-id directory: the log, index and
+     * time-index files named after the segment's start offset, and the writer-snapshot file named
+     * after its end offset.
+     *
+     * @param manifestBytes raw manifest content
+     * @return relative paths of the form {@code <segmentId>/<offsetPrefix><suffix>}
+     * @throws ManifestParseException if the payload cannot be deserialized
+     */
+    private Set<String> parseLogSegmentRelativePaths(byte[] manifestBytes)
+            throws ManifestParseException {
+        RemoteLogManifest manifest;
+        try {
+            manifest = RemoteLogManifest.fromJsonBytes(manifestBytes);
+        } catch (RuntimeException e) {
+            // Surface deserialization problems as a checked, dedicated failure type.
+            throw new ManifestParseException("Failed to parse remote log manifest", e);
+        }
+
+        Set<String> relativePaths = new HashSet<>();
+        for (RemoteLogSegment segment : manifest.getRemoteLogSegmentList()) {
+            String segmentId = segment.remoteLogSegmentId().toString();
+            long startOffset = segment.remoteLogStartOffset();
+            long endOffset = segment.remoteLogEndOffset();
+            String baseOffset = FlussPaths.filenamePrefixFromOffset(startOffset);
+            String writerOffset = FlussPaths.filenamePrefixFromOffset(endOffset);
+
+            relativePaths.add(segmentId + "/" + baseOffset + FlussPaths.LOG_FILE_SUFFIX);
+            relativePaths.add(segmentId + "/" + baseOffset + FlussPaths.INDEX_FILE_SUFFIX);
+            relativePaths.add(segmentId + "/" + baseOffset + FlussPaths.TIME_INDEX_FILE_SUFFIX);
+            relativePaths.add(
+                    segmentId + "/" + writerOffset + FlussPaths.WRITER_SNAPSHOT_FILE_SUFFIX);
+        }
+        return relativePaths;
+    }
+
+    /**
+     * Thrown when a remote-log manifest payload is structurally invalid (missing required field,
+     * wrong shape). Distinct from {@link IOException} so the bucket-read failure handler can route
+     * it to the {@code "Manifest parse failure"} reason instead of the generic {@code "IO error"}
+     * bucket — same skip-this-round outcome, different operator triage.
+     */
+    static final class ManifestParseException extends IOException {
+
+        // IOException is Serializable; pin the stream version explicitly.
+        private static final long serialVersionUID = 1L;
+
+        /**
+         * @param message description of the parse failure
+         * @param cause the underlying deserialization error
+         */
+        ManifestParseException(String message, Throwable cause) {
+            super(message, cause);
+        }
+    }
+
+    /**
+     * Thin abstraction over the {@link Admin} read-only RPCs the builder depends on ({@code
+     * listRemoteLogManifests} for the log active manifest, {@code listKvSnapshots} for the KV
+     * active snapshot dirs). Exposed for test injection.
+     */
+    @VisibleForTesting
+    interface AdminFacade {
+        /**
+         * Lists the remote log manifest entries of one {@code (tableId, partitionId|null)} target.
+         *
+         * @param tableId id of the table
+         * @param partitionId id of the partition, or {@code null}
+         * @return future completing with the manifest entries
+         */
+        CompletableFuture<List<RemoteLogManifestInfo>> listRemoteLogManifests(
+                long tableId, @Nullable Long partitionId);
+
+        /**
+         * Lists the active KV snapshots of one {@code (tableId, partitionId|null)} target.
+         *
+         * @param tableId id of the table
+         * @param partitionId id of the partition, or {@code null}
+         * @return future completing with the active snapshot ids grouped by bucket
+         */
+        CompletableFuture<ActiveKvSnapshots> listKvSnapshots(
+                long tableId, @Nullable Long partitionId);
+    }
+
+ /**
+ * Abstraction for reading manifest files from object storage. Must throw {@link
+ * FileNotFoundException} (and not a wrapped variant) when the path is absent, so the caller can
+ * distinguish "manifest pointer upserted concurrently" from genuine IO failures and surface
+ * each with a distinct failure reason.
+ */
+ @VisibleForTesting
+ interface MetadataReader {
+ /**
+ * Reads the complete content of the file at {@code path}.
+ *
+ * @param path location of the manifest file
+ * @return the file content as a byte array
+ * @throws FileNotFoundException if the path is absent (must not be wrapped)
+ * @throws IOException on any other read failure
+ */
+ byte[] read(FsPath path) throws IOException;
+ }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/KvActiveRefsFetchResult.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/KvActiveRefsFetchResult.java
new file mode 100644
index 0000000000..7b1c6c7873
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/KvActiveRefsFetchResult.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import org.apache.fluss.annotation.Internal;
+
+import javax.annotation.Nullable;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Result of KV active-snapshot-dir fetch for one {@code (tableId, partitionId|null)} target.
+ *
+ * Mirrors the per-target {@code listOk + listFailureReason} axis of {@link
+ * LogActiveRefsFetchResult}. KV has no per-bucket failure dimension because the {@code
+ * LIST_KV_SNAPSHOTS} RPC returns snapshot ids directly (no second-read of an external file), so the
+ * per-bucket payload is just {@code Map>} of {@code snap-{id}} directory
+ * names. Buckets absent from the map are treated by the consumer as "empty active set → skip".
+ */
+@Internal
+public final class KvActiveRefsFetchResult {
+
+ private final RpcListStatus list;
+ private final Map> activeSnapDirsByBucket;
+
+ private KvActiveRefsFetchResult(
+ RpcListStatus list, Map> activeSnapDirsByBucket) {
+ this.list = list;
+ Map> copy = new HashMap<>();
+ for (Map.Entry> e : activeSnapDirsByBucket.entrySet()) {
+ copy.put(e.getKey(), Collections.unmodifiableSet(new HashSet<>(e.getValue())));
+ }
+ this.activeSnapDirsByBucket = Collections.unmodifiableMap(copy);
+ }
+
+ /** Result for a target whose {@code LIST_KV_SNAPSHOTS} RPC failed and exhausted retries. */
+ public static KvActiveRefsFetchResult listFailed(String reason) {
+ return new KvActiveRefsFetchResult(
+ RpcListStatus.listFailed(reason), Collections.emptyMap());
+ }
+
+ /** Result for a target whose {@code LIST_KV_SNAPSHOTS} RPC succeeded. */
+ static KvActiveRefsFetchResult ok(Map> activeSnapDirsByBucket) {
+ return new KvActiveRefsFetchResult(RpcListStatus.ok(), activeSnapDirsByBucket);
+ }
+
+ /** Whether the per-target {@code LIST_KV_SNAPSHOTS} RPC succeeded. */
+ public boolean listOk() {
+ return list.isOk();
+ }
+
+ /** Reason the per-target RPC failed; {@code null} when {@link #listOk()} is true. */
+ @Nullable
+ public String listFailureReason() {
+ return list.reason();
+ }
+
+ /**
+ * Per-bucket active snapshot directory names ({@code snap-{id}}). Empty map when {@link
+ * #listOk()} is false.
+ *
+ * Bucket absent from the map means "the RPC returned no active-snapshot entries for this
+ * bucket", which the consumer must treat as "cannot prove what is active here → skip KV
+ * cleanup for this bucket and emit {@code skip_kv_bucket reason=empty_active_set}". Empty does
+ * not mean "no active snapshots exist": the server enumerates buckets from ZK and that path can
+ * transiently underreport (partial reads, znode creation lag, stale historical bucket counts),
+ * so treating empty as no-op-skip is the only response compatible with the action's "may leak,
+ * must not mis-delete" hard constraint.
+ */
+ public Map> activeSnapDirsByBucket() {
+ return activeSnapDirsByBucket;
+ }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/LogActiveRefsFetchResult.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/LogActiveRefsFetchResult.java
new file mode 100644
index 0000000000..44c1227694
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/LogActiveRefsFetchResult.java
@@ -0,0 +1,143 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.rule.BucketActiveRefs;
+
+import javax.annotation.Nullable;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Result of log active-refs fetch for one {@code (tableId, partitionId|null)} target.
+ *
+ * The result is split along two orthogonal axes so each axis can be queried independently:
+ *
+ *
+ * - Per-target: {@link #listOk()} reports whether the {@code LIST_REMOTE_LOG_MANIFESTS}
+ * RPC succeeded. When it fails the per-bucket axis is meaningless and the caller should emit
+ * a single per-target skip and bypass the per-bucket loop entirely.
+ *
- Per-bucket: {@link #statusFor(int)} reports one of {@link
+ * ManifestReadStatus#RESOLVED}, {@link ManifestReadStatus#READ_FAILED}, or {@link
+ * ManifestReadStatus#NOT_LISTED} for every bucket enumerated from table metadata. Only
+ * meaningful when {@link #listOk()} is true.
+ *
+ */
+@Internal
+public final class LogActiveRefsFetchResult {
+
+ /** Per-bucket outcome (only meaningful when {@link #listOk()} is true). */
+ public enum ManifestReadStatus {
+ /** The RPC returned an entry for this bucket and its manifest was read successfully. */
+ RESOLVED,
+ /**
+ * Per-bucket manifest second-read failed (FileNotFound from manifest upsert race, or other
+ * IO failure). The failing bucket is skipped for this round; recovery is by the next
+ * cleanup round.
+ */
+ READ_FAILED,
+ /**
+ * Table metadata enumerates the bucket, but the {@code LIST_REMOTE_LOG_MANIFESTS} response
+ * did not include an entry for it — typically because the bucket has not yet committed any
+ * remote manifest (e.g. log tiering has not produced one), or an occasional server-side
+ * underreport (e.g. partial ZK read). Cleanup has nothing to clean for this bucket.
+ */
+ NOT_LISTED
+ }
+
+ private final RpcListStatus list;
+ private final Map resolved;
+ private final Map readFailures;
+
+ private LogActiveRefsFetchResult(
+ RpcListStatus list,
+ Map resolved,
+ Map readFailures) {
+ this.list = list;
+ this.resolved = Collections.unmodifiableMap(new HashMap<>(resolved));
+ this.readFailures = Collections.unmodifiableMap(new HashMap<>(readFailures));
+ }
+
+ /**
+ * Result for a target whose {@code LIST_REMOTE_LOG_MANIFESTS} RPC failed and exhausted retries.
+ */
+ public static LogActiveRefsFetchResult listFailed(String reason) {
+ return new LogActiveRefsFetchResult(
+ RpcListStatus.listFailed(reason), Collections.emptyMap(), Collections.emptyMap());
+ }
+
+ /**
+ * Result for a target whose {@code LIST_REMOTE_LOG_MANIFESTS} RPC succeeded. {@code resolved}
+ * carries the per-bucket active refs for RESOLVED buckets; {@code readFailures} carries the
+ * per-bucket failure reasons for READ_FAILED buckets. Any bucket not present in either map is
+ * reported as {@link ManifestReadStatus#NOT_LISTED}.
+ */
+ static LogActiveRefsFetchResult ofPerBucket(
+ Map resolved, Map readFailures) {
+ return new LogActiveRefsFetchResult(RpcListStatus.ok(), resolved, readFailures);
+ }
+
+ /** Whether the per-target {@code LIST_REMOTE_LOG_MANIFESTS} RPC succeeded. */
+ public boolean listOk() {
+ return list.isOk();
+ }
+
+ /** Reason the per-target RPC failed; {@code null} when {@link #listOk()} is true. */
+ @Nullable
+ public String listFailureReason() {
+ return list.reason();
+ }
+
+ /**
+ * Per-bucket manifest read status for a bucket enumerated from table metadata. Callers must
+ * first check {@link #listOk()} and skip the per-bucket loop entirely when it is false.
+ */
+ public ManifestReadStatus statusFor(int bucketId) {
+ if (!list.isOk()) {
+ throw new IllegalStateException("Per-bucket status is not available when listOk=false");
+ }
+ if (resolved.containsKey(bucketId)) {
+ return ManifestReadStatus.RESOLVED;
+ }
+ if (readFailures.containsKey(bucketId)) {
+ return ManifestReadStatus.READ_FAILED;
+ }
+ return ManifestReadStatus.NOT_LISTED;
+ }
+
+ /** Active refs for a RESOLVED bucket. */
+ public BucketActiveRefs activeRefsOf(int bucketId) {
+ BucketActiveRefs activeRefs = resolved.get(bucketId);
+ if (activeRefs == null) {
+ throw new IllegalStateException("Bucket " + bucketId + " is not RESOLVED");
+ }
+ return activeRefs;
+ }
+
+ /** Failure reason for a READ_FAILED bucket. */
+ public String readFailureReason(int bucketId) {
+ String reason = readFailures.get(bucketId);
+ if (reason == null) {
+ throw new IllegalStateException("Bucket " + bucketId + " is not READ_FAILED");
+ }
+ return reason;
+ }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/MaxKnownIdsTracker.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/MaxKnownIdsTracker.java
new file mode 100644
index 0000000000..c77d03323b
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/MaxKnownIdsTracker.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import org.apache.fluss.annotation.Internal;
+
+/**
+ * Accumulates {@code maxKnownTableId} and {@code maxKnownPartitionId} during a single cleanup run.
+ *
+ * <p>Values are updated from the successful scope-enumeration metadata lookups that already
+ * materialize concrete ids for cleanup orchestration: {@code getTableInfo()} for tables and {@code
+ * listPartitionInfos()} for partitions. The tracker is therefore pure RPC-derived and never sourced
+ * from FS dir-name parsing.
+ *
+ * <p>The tracked maximums serve as ID guards for orphan directory detection: only
+ * directories whose parsed ID is {@code <=} the observed maximum can be classified as orphan
+ * candidates. Directories with higher IDs are conservatively skipped as potentially freshly
+ * allocated. Because RPC failures cause the tracker to observe fewer IDs, the maximums are always a
+ * lower bound of the true cluster-wide maximum — making the guard strictly more conservative (safe
+ * direction) under partial failures.
+ */
+@Internal
+public final class MaxKnownIdsTracker {
+
+    // Both maximums start at -1, i.e. below every id observed so far.
+    private long maxKnownTableId = -1L;
+    private long maxKnownPartitionId = -1L;
+
+    /** Raises the tracked table-id maximum to {@code tableId} when that value is larger. */
+    public void observeTableId(long tableId) {
+        maxKnownTableId = Math.max(maxKnownTableId, tableId);
+    }
+
+    /** Raises the tracked partition-id maximum to {@code partitionId} when that value is larger. */
+    public void observePartitionId(long partitionId) {
+        maxKnownPartitionId = Math.max(maxKnownPartitionId, partitionId);
+    }
+
+    /** Returns the largest table id observed so far, or {@code -1} when none was observed. */
+    public long maxKnownTableId() {
+        return maxKnownTableId;
+    }
+
+    /** Returns the largest partition id observed so far, or {@code -1} when none was observed. */
+    public long maxKnownPartitionId() {
+        return maxKnownPartitionId;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/RpcListStatus.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/RpcListStatus.java
new file mode 100644
index 0000000000..4113dd500c
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/build/RpcListStatus.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import javax.annotation.Nullable;
+
+/**
+ * Per-target status of a list RPC (target = one {@code (tableId, partitionId|null)} pair), shared
+ * by {@link LogActiveRefsFetchResult} and {@link KvActiveRefsFetchResult}.
+ *
+ * <p>Captures the {@code listOk + listFailureReason} pair so both result types can delegate the
+ * per-target axis to a single value and surface identical {@code listOk()} / {@code
+ * listFailureReason()} APIs to consumers.
+ */
+final class RpcListStatus {
+
+    /** Shared instance returned for every successful RPC; it carries no reason. */
+    private static final RpcListStatus SUCCESS = new RpcListStatus(true, null);
+
+    private final boolean succeeded;
+    @Nullable private final String failureReason;
+
+    private RpcListStatus(boolean succeeded, @Nullable String failureReason) {
+        this.succeeded = succeeded;
+        this.failureReason = failureReason;
+    }
+
+    /** Returns the shared success status. */
+    static RpcListStatus ok() {
+        return SUCCESS;
+    }
+
+    /** Creates a failure status carrying the given reason. */
+    static RpcListStatus listFailed(String reason) {
+        return new RpcListStatus(false, reason);
+    }
+
+    /** Whether the RPC succeeded. */
+    boolean isOk() {
+        return succeeded;
+    }
+
+    /** The reason given to {@link #listFailed(String)}; {@code null} for the success status. */
+    @Nullable
+    String reason() {
+        return failureReason;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfig.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfig.java
new file mode 100644
index 0000000000..839ca7ccc1
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfig.java
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.config;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+import org.apache.fluss.utils.StringUtils;
+
+import javax.annotation.Nullable;
+
+import java.io.Serializable;
+import java.time.Duration;
+import java.time.Instant;
+import java.time.OffsetDateTime;
+import java.time.format.DateTimeParseException;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Optional;
+
+/**
+ * Parsed command-line options for the orphan files cleanup action.
+ *
+ * <p>Instances are immutable and are created only through {@link #fromParams}, which validates the
+ * options and throws {@link IllegalArgumentException} on invalid input.
+ */
+@Internal
+public final class OrphanCleanConfig implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Minimum gap between any user-supplied cutoff and {@code now}. A cutoff closer to {@code now}
+     * would risk classifying files that are mid-write (committed file written, snapshot/manifest
+     * not yet visible to {@code ListRemoteLogManifests} / {@code ListKvSnapshots}) as orphan and
+     * deleting them.
+     */
+    private static final Duration HARD_LOWER_BOUND = Duration.ofDays(1);
+
+    /** Default file-level cutoff: files written before {@code now - 3d} are deletion-eligible. */
+    private static final Duration DEFAULT_OLDER_THAN = Duration.ofDays(3);
+
+    private static final long DEFAULT_REMOTE_FS_OP_RATE_LIMIT_PER_SECOND = 100L;
+
+    private final String bootstrapServer;
+    private final boolean allDatabases;
+    private final @Nullable String database;
+    private final @Nullable String table;
+    private final long olderThanMillis;
+    private final boolean dryRun;
+    private final long remoteFsOpRateLimitPerSecond;
+    private final @Nullable Integer parallelism;
+    private final boolean allowDeleteManifest;
+    private final boolean allowCleanOrphanTables;
+    private final boolean allowCleanOrphanPartitions;
+    private final Map<String, String> extraConfigs;
+
+    private OrphanCleanConfig(
+            String bootstrapServer,
+            boolean allDatabases,
+            @Nullable String database,
+            @Nullable String table,
+            long olderThanMillis,
+            boolean dryRun,
+            long remoteFsOpRateLimitPerSecond,
+            @Nullable Integer parallelism,
+            boolean allowDeleteManifest,
+            boolean allowCleanOrphanTables,
+            boolean allowCleanOrphanPartitions,
+            Map<String, String> extraConfigs) {
+        this.bootstrapServer = bootstrapServer;
+        this.allDatabases = allDatabases;
+        this.database = database;
+        this.table = table;
+        this.olderThanMillis = olderThanMillis;
+        this.dryRun = dryRun;
+        this.remoteFsOpRateLimitPerSecond = remoteFsOpRateLimitPerSecond;
+        this.parallelism = parallelism;
+        this.allowDeleteManifest = allowDeleteManifest;
+        this.allowCleanOrphanTables = allowCleanOrphanTables;
+        this.allowCleanOrphanPartitions = allowCleanOrphanPartitions;
+        // Private copy wrapped read-only so later changes to the caller's map cannot leak in.
+        this.extraConfigs = Collections.unmodifiableMap(new HashMap<>(extraConfigs));
+    }
+
+    /**
+     * Parses a cleanup config from CLI parameters.
+     *
+     * <p>{@code --bootstrap-server} is required, and exactly one of {@code --database} or {@code
+     * --all-databases} must be given; {@code --table} is rejected together with {@code
+     * --all-databases}. The {@code --older-than} cutoff is resolved against the wall clock at the
+     * time of this call.
+     *
+     * @param params the parsed command-line parameters
+     * @return the validated configuration
+     * @throws IllegalArgumentException if a required option is missing, options conflict, or a
+     *     value cannot be parsed or is out of range
+     */
+    public static OrphanCleanConfig fromParams(MultipleParameterToolAdapter params) {
+        String bootstrapServer = params.get("bootstrap-server");
+        if (StringUtils.isNullOrWhitespaceOnly(bootstrapServer)) {
+            throw new IllegalArgumentException("--bootstrap-server is required");
+        }
+
+        boolean allDatabases = params.has("all-databases");
+        String database = params.get("database");
+        if (allDatabases && !StringUtils.isNullOrWhitespaceOnly(database)) {
+            throw new IllegalArgumentException(
+                    "--database and --all-databases are mutually exclusive");
+        }
+        if (!allDatabases && StringUtils.isNullOrWhitespaceOnly(database)) {
+            throw new IllegalArgumentException(
+                    "Either --database or --all-databases must be provided");
+        }
+        // Read once so the validated value and the stored value are the same object.
+        String table = params.get("table");
+        if (allDatabases && !StringUtils.isNullOrWhitespaceOnly(table)) {
+            throw new IllegalArgumentException(
+                    "--table requires --database and cannot be used with --all-databases");
+        }
+
+        long now = System.currentTimeMillis();
+        long olderThanMillis =
+                parseCutoff("--older-than", params.get("older-than"), now, DEFAULT_OLDER_THAN);
+        long remoteFsOpRateLimitPerSecond =
+                parsePositiveRateLimit(
+                        "--remote-fs-op-rate-limit-per-second",
+                        params.get("remote-fs-op-rate-limit-per-second"),
+                        DEFAULT_REMOTE_FS_OP_RATE_LIMIT_PER_SECOND);
+        Integer parallelism = parseParallelism(params.get("parallelism"));
+        boolean allowDeleteManifest = params.has("allow-delete-manifest");
+        boolean allowCleanOrphanTables = params.has("allow-clean-orphan-tables");
+        boolean allowCleanOrphanPartitions = params.has("allow-clean-orphan-partitions");
+
+        return new OrphanCleanConfig(
+                bootstrapServer,
+                allDatabases,
+                database,
+                table,
+                olderThanMillis,
+                params.has("dry-run"),
+                remoteFsOpRateLimitPerSecond,
+                parallelism,
+                allowDeleteManifest,
+                allowCleanOrphanTables,
+                allowCleanOrphanPartitions,
+                parseExtraConfigs(params.getMultiParameter("conf")));
+    }
+
+    /**
+     * Parses a CLI cutoff value into an absolute epoch-ms timestamp. Empty input falls back to
+     * {@code now - defaultGap}. Explicit input must be ISO-8601 with an explicit offset (e.g.
+     * {@code 2024-01-01T00:00:00+08:00} or {@code 2024-01-01T00:00:00Z}) and must be at least
+     * {@link #HARD_LOWER_BOUND} earlier than {@code now} — closer-to-now cutoffs would race with
+     * active writes (see {@code HARD_LOWER_BOUND} javadoc).
+     *
+     * @param flag the option name, used only in error messages
+     * @param value the raw option value; {@code null} or blank selects the default
+     * @param now the reference time in epoch millis
+     * @param defaultGap how far before {@code now} the default cutoff lies
+     * @return the cutoff in epoch millis
+     * @throws IllegalArgumentException if the value is not parseable or is too close to {@code
+     *     now}
+     */
+    private static long parseCutoff(
+            String flag, @Nullable String value, long now, Duration defaultGap) {
+        if (StringUtils.isNullOrWhitespaceOnly(value)) {
+            return now - defaultGap.toMillis();
+        }
+        OffsetDateTime parsed;
+        try {
+            parsed = OffsetDateTime.parse(value);
+        } catch (DateTimeParseException e) {
+            throw new IllegalArgumentException(
+                    flag
+                            + " must be an ISO-8601 timestamp with an explicit offset (e.g."
+                            + " '2024-01-01T00:00:00+08:00' or '2024-01-01T00:00:00Z'); got: "
+                            + value,
+                    e);
+        }
+        long parsedMillis = parsed.toInstant().toEpochMilli();
+        // Latest cutoff that still keeps the mandatory safety gap from "now".
+        long maxAllowed = now - HARD_LOWER_BOUND.toMillis();
+        if (parsedMillis > maxAllowed) {
+            throw new IllegalArgumentException(
+                    flag
+                            + " must be at least 1d before now (got "
+                            + Instant.ofEpochMilli(parsedMillis)
+                            + ", now is "
+                            + Instant.ofEpochMilli(now)
+                            + "); a closer cutoff would race with mid-write files");
+        }
+        return parsedMillis;
+    }
+
+    /**
+     * Parses a strictly positive {@code long} rate limit.
+     *
+     * @param flag the option name, used only in error messages
+     * @param value the raw option value; {@code null} or blank selects {@code defaultValue}
+     * @param defaultValue the value returned when the option is absent
+     * @return the parsed rate, or {@code defaultValue} when the option is absent
+     * @throws IllegalArgumentException if the value is not a number or is not positive
+     */
+    private static long parsePositiveRateLimit(
+            String flag, @Nullable String value, long defaultValue) {
+        if (StringUtils.isNullOrWhitespaceOnly(value)) {
+            return defaultValue;
+        }
+        long rate;
+        try {
+            rate = Long.parseLong(value);
+        } catch (NumberFormatException e) {
+            // Re-throw with the flag name: the bare NumberFormatException only echoes the input
+            // text and does not tell the operator which option was wrong.
+            throw new IllegalArgumentException(
+                    flag + " must be a positive integer, got: " + value, e);
+        }
+        if (rate <= 0) {
+            throw new IllegalArgumentException(flag + " must be positive");
+        }
+        return rate;
+    }
+
+    /**
+     * Parses the optional {@code --parallelism} value.
+     *
+     * @param value the raw option value; {@code null} or blank means "not set"
+     * @return the parsed parallelism, or {@code null} when the option is absent
+     * @throws IllegalArgumentException if the value is not an {@code int} or is not positive
+     */
+    @Nullable
+    private static Integer parseParallelism(@Nullable String value) {
+        if (StringUtils.isNullOrWhitespaceOnly(value)) {
+            return null;
+        }
+        int p;
+        try {
+            p = Integer.parseInt(value);
+        } catch (NumberFormatException e) {
+            // Same rationale as the rate limit: name the option in the error.
+            throw new IllegalArgumentException(
+                    "--parallelism must be a positive integer, got: " + value, e);
+        }
+        if (p <= 0) {
+            throw new IllegalArgumentException("--parallelism must be positive");
+        }
+        return p;
+    }
+
+    /**
+     * Parses repeated {@code --conf key=value} entries into a map. Each entry is split at its
+     * first {@code '='}: the key must be non-empty, while the value may be empty or contain further
+     * {@code '='} characters. When a key is repeated, the last entry wins.
+     *
+     * @param values the raw entries; {@code null} or empty yields an empty map
+     * @return the parsed key/value pairs
+     * @throws IllegalArgumentException if an entry has no {@code '='} or an empty key
+     */
+    private static Map<String, String> parseExtraConfigs(@Nullable Collection<String> values) {
+        if (values == null || values.isEmpty()) {
+            return Collections.emptyMap();
+        }
+        Map<String, String> configs = new HashMap<>();
+        for (String kv : values) {
+            int eqIdx = kv.indexOf('=');
+            // eqIdx == -1: no separator; eqIdx == 0: empty key. Both are rejected.
+            if (eqIdx <= 0) {
+                throw new IllegalArgumentException(
+                        "--conf must be in key=value format, got: " + kv);
+            }
+            configs.put(kv.substring(0, eqIdx), kv.substring(eqIdx + 1));
+        }
+        return configs;
+    }
+
+    /** Returns the bootstrap server list used to connect to Fluss. */
+    public String bootstrapServer() {
+        return bootstrapServer;
+    }
+
+    /** Returns whether the cleanup targets all databases. */
+    public boolean allDatabases() {
+        return allDatabases;
+    }
+
+    /** Returns the single targeted database when the action is not scoped to all databases. */
+    public Optional<String> database() {
+        return Optional.ofNullable(database);
+    }
+
+    /** Returns the optional targeted table name. */
+    public Optional<String> table() {
+        return Optional.ofNullable(table);
+    }
+
+    /**
+     * Returns the file-level cutoff as an absolute epoch-millis timestamp, frozen at action
+     * startup. A candidate file is deletion-eligible iff its mtime is strictly less than this
+     * value. The cutoff does not slide during the run — long scans cannot accidentally pull in
+     * files written after startup.
+     */
+    public long olderThanMillis() {
+        return olderThanMillis;
+    }
+
+    /** Returns whether the action runs in dry-run mode. */
+    public boolean dryRun() {
+        return dryRun;
+    }
+
+    /**
+     * Returns the best-effort job-level target rate for remote filesystem operations per second.
+     *
+     * <p>The budget is shared by remote filesystem metadata reads, manifest reads, and deletes.
+     * Scan subtasks split this value by operator parallelism because Flink does not provide a
+     * cross-JVM limiter for this action.
+     */
+    public long remoteFsOpRateLimitPerSecond() {
+        return remoteFsOpRateLimitPerSecond;
+    }
+
+    /** Returns the optional parallelism for the ScanAndClean stage. */
+    public Optional<Integer> parallelism() {
+        return Optional.ofNullable(parallelism);
+    }
+
+    /**
+     * Opt-in to delete {@code .manifest} files. Default {@code false}: mis-deleting an active
+     * manifest leaves the coordinator's manifest pointer dangling and breaks the bucket's metadata
+     * chain — the failure mode is catastrophic and asymmetric vs the trivial space cost of keeping
+     * orphan manifests (KB-sized files), so deletion is gated behind an explicit operator flag.
+     */
+    public boolean allowDeleteManifest() {
+        return allowDeleteManifest;
+    }
+
+    /**
+     * Opt-in to recursively clean files inside an orphan-table directory. Default {@code false}:
+     * the action only audits the detected orphan dir and leaves its contents untouched, because an
+     * id-based misclassification of a freshly-created table as orphan would otherwise be
+     * unrecoverable. Operators flip this on once they have reviewed the audit log.
+     */
+    public boolean allowCleanOrphanTables() {
+        return allowCleanOrphanTables;
+    }
+
+    /**
+     * Opt-in to recursively clean files inside an orphan-partition directory. Same default-audit
+     * rationale as {@link #allowCleanOrphanTables()}.
+     */
+    public boolean allowCleanOrphanPartitions() {
+        return allowCleanOrphanPartitions;
+    }
+
+    /**
+     * Returns extra configuration entries passed via {@code --conf key=value}. These are propagated
+     * to {@link org.apache.fluss.fs.FileSystem#initialize} for remote filesystem authentication
+     * (e.g. {@code fs.oss.accessKeyId}, {@code fs.oss.accessKeySecret}). The returned map is
+     * unmodifiable.
+     */
+    public Map<String, String> extraConfigs() {
+        return extraConfigs;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleter.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleter.java
new file mode 100644
index 0000000000..9b52fa43d8
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleter.java
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.fs;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.rule.Decision;
+import org.apache.fluss.flink.action.orphan.rule.RuleId;
+import org.apache.fluss.fs.FileStatus;
+import org.apache.fluss.fs.FileSystem;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+
+import static org.apache.fluss.utils.Preconditions.checkArgument;
+
+/**
+ * Sole entry point for filesystem deletion within the orphan cleanup package.
+ *
+ * <p>Only two operations are exposed:
+ *
+ * <ul>
+ *   <li>{@link #deleteFile} - delete a single file (never recursive).
+ *   <li>{@link #deleteEmptyDir} - delete a directory only if it is currently empty.
+ * </ul>
+ *
+ * <p>By design there is no recursive-delete API; any caller that needs deletion under {@code
+ * fluss-flink-common/.../action/orphan/} should go through this class. The single-entry-point
+ * invariant is currently enforced only by convention — there is no Checkstyle rule guarding it.
+ *
+ * <p>Every real filesystem call made here (list or delete) first acquires one permit from the
+ * supplied rate limiter. In dry-run mode no delete is issued; the would-be deletion is only
+ * recorded through the {@link AuditLogger}.
+ */
+@Internal
+public final class SafeDeleter {
+
+    private static final Logger LOG = LoggerFactory.getLogger(SafeDeleter.class);
+
+    private final FileSystem fs;
+    private final boolean dryRun;
+    private final AuditLogger audit;
+    private final RateLimiter remoteFsOpRateLimiter;
+
+    /**
+     * @param fs the file system on which deletions are performed
+     * @param dryRun when {@code true}, deletions are only recorded, never executed
+     * @param audit the sink for deletion audit records
+     * @param remoteFsOpRateLimiter the limiter consulted before each real filesystem call
+     */
+    public SafeDeleter(
+            FileSystem fs, boolean dryRun, AuditLogger audit, RateLimiter remoteFsOpRateLimiter) {
+        this.fs = fs;
+        this.dryRun = dryRun;
+        this.audit = audit;
+        this.remoteFsOpRateLimiter = remoteFsOpRateLimiter;
+    }
+
+    /**
+     * Delete a single file.
+     *
+     * <p>The call is rejected by a {@code checkArgument} precondition unless {@code decision} is
+     * {@link Decision#DELETE}. An {@link IOException} from the file system is logged, recorded in
+     * the audit log as a failed deletion, and reported as {@code false} rather than rethrown.
+     *
+     * @param file the file to delete
+     * @param decision the rule decision for this file; must be {@link Decision#DELETE}
+     * @param ruleId the rule that produced the decision, recorded in the audit log
+     * @return {@code true} if the file was actually deleted (or recorded as would-be-deleted under
+     *     {@code dryRun}); {@code false} if {@link FileSystem#delete} returned {@code false}
+     *     (deletion silently failed — e.g. permissions, transient remote-store error) or threw.
+     *     Callers should track {@code false} returns as delete failures in their run summary.
+     */
+    public boolean deleteFile(FsPath file, Decision decision, RuleId ruleId) {
+        checkArgument(
+                decision == Decision.DELETE,
+                "deleteFile must only be called for Decision.DELETE, got %s",
+                decision);
+        if (dryRun) {
+            audit.logWouldDelete(file, ruleId);
+            return true;
+        }
+        remoteFsOpRateLimiter.acquire();
+        try {
+            boolean ok = fs.delete(file, false);
+            audit.logDeleted(file, ruleId, ok);
+            return ok;
+        } catch (IOException e) {
+            LOG.warn("Failed to delete file: {}", file, e);
+            audit.logDeleted(file, ruleId, false);
+            return false;
+        }
+    }
+
+    /**
+     * Delete a directory only if it is currently empty.
+     *
+     * <p>The directory is listed first, also in dry-run mode; if the listing fails or reports any
+     * child, nothing is deleted. The delete itself is issued with {@code recursive = false}.
+     *
+     * @param dir the directory to delete
+     * @return {@code true} if the directory was actually deleted (or recorded as would-be-deleted
+     *     under {@code dryRun}); {@code false} if the directory was non-empty / unreadable, or if
+     *     {@link FileSystem#delete} returned {@code false} or threw. Callers should not increment
+     *     a "deleted directory" counter when this returns {@code false}.
+     */
+    public boolean deleteEmptyDir(FsPath dir) {
+        FileStatus[] children = listChildrenSilently(dir);
+        if (children == null || children.length > 0) {
+            return false;
+        }
+        if (dryRun) {
+            audit.logWouldDeleteDir(dir);
+            return true;
+        }
+        remoteFsOpRateLimiter.acquire();
+        try {
+            boolean ok = fs.delete(dir, false);
+            if (ok) {
+                audit.logDirDeleted(dir);
+            }
+            return ok;
+        } catch (IOException e) {
+            LOG.warn("Failed to delete empty directory: {}", dir, e);
+            return false;
+        }
+    }
+
+    /**
+     * Lists the children of {@code dir} without propagating I/O failures.
+     *
+     * @return the listing as returned by {@link FileSystem#listStatus}, or {@code null} when the
+     *     listing threw an {@link IOException}
+     */
+    private FileStatus[] listChildrenSilently(FsPath dir) {
+        try {
+            remoteFsOpRateLimiter.acquire();
+            return fs.listStatus(dir);
+        } catch (IOException e) {
+            // Still best-effort (the caller treats null as "do not delete"), but leave a trace so
+            // an unreadable directory can be told apart from a non-empty one.
+            LOG.warn("Failed to list directory before empty-directory delete: {}", dir, e);
+            return null;
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleanTask.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleanTask.java
new file mode 100644
index 0000000000..70499fd285
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleanTask.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+
+import javax.annotation.Nullable;
+
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Work item for a single bucket's file-level cleanup. Carries everything needed to execute cleanup
+ * without coordinator interaction: FS paths, manifest locations for second-read, and the
+ * already-resolved KV active snapshot directory names.
+ *
+ * <p>The three sets passed to the constructor are copied, so later changes to the caller's sets do
+ * not affect the task. The accessors return the task's own sets directly.
+ */
+@Internal
+public final class BucketCleanTask implements CleanTask {
+
+    private static final long serialVersionUID = 1L;
+
+    @Nullable private final String logTabletDir;
+    @Nullable private final String kvTabletDir;
+    private final Set<String> logSegmentRelativePaths;
+    private final Set<String> logActiveManifestPaths;
+    private final Set<String> kvActiveSnapDirs;
+    private final long cutoffMillis;
+    private final boolean dryRun;
+    private final boolean allowDeleteManifest;
+
+    /**
+     * @param logTabletDir the bucket's log tablet directory, or {@code null} if there is none
+     * @param kvTabletDir the bucket's KV tablet directory, or {@code null} if there is none
+     * @param logSegmentRelativePaths active log segment relative paths; copied
+     * @param logActiveManifestPaths active manifest paths; copied
+     * @param kvActiveSnapDirs active KV snapshot directory names; copied
+     * @param cutoffMillis the cutoff timestamp in milliseconds used by this task
+     * @param dryRun whether the task runs in dry-run mode
+     * @param allowDeleteManifest whether manifest files may be deleted
+     */
+    public BucketCleanTask(
+            @Nullable String logTabletDir,
+            @Nullable String kvTabletDir,
+            Set<String> logSegmentRelativePaths,
+            Set<String> logActiveManifestPaths,
+            Set<String> kvActiveSnapDirs,
+            long cutoffMillis,
+            boolean dryRun,
+            boolean allowDeleteManifest) {
+        this.logTabletDir = logTabletDir;
+        this.kvTabletDir = kvTabletDir;
+        // Plain HashSet copies: detached from the caller's collections.
+        this.logSegmentRelativePaths = new HashSet<>(logSegmentRelativePaths);
+        this.logActiveManifestPaths = new HashSet<>(logActiveManifestPaths);
+        this.kvActiveSnapDirs = new HashSet<>(kvActiveSnapDirs);
+        this.cutoffMillis = cutoffMillis;
+        this.dryRun = dryRun;
+        this.allowDeleteManifest = allowDeleteManifest;
+    }
+
+    /** Returns the log tablet directory, or {@code null} if the task has none. */
+    @Nullable
+    public String logTabletDir() {
+        return logTabletDir;
+    }
+
+    /** Returns the KV tablet directory, or {@code null} if the task has none. */
+    @Nullable
+    public String kvTabletDir() {
+        return kvTabletDir;
+    }
+
+    /** Active log segment relative paths (already resolved from manifests in Stage 1). */
+    public Set<String> logSegmentRelativePaths() {
+        return logSegmentRelativePaths;
+    }
+
+    /** Active manifest paths (already resolved from RPC in Stage 1). */
+    public Set<String> logActiveManifestPaths() {
+        return logActiveManifestPaths;
+    }
+
+    /**
+     * KV active snapshot directory names (already resolved from RPC, no further FS read needed).
+     */
+    public Set<String> kvActiveSnapDirs() {
+        return kvActiveSnapDirs;
+    }
+
+    /** Returns the cutoff timestamp in milliseconds carried by this task. */
+    public long cutoffMillis() {
+        return cutoffMillis;
+    }
+
+    /** Returns whether this task runs in dry-run mode. */
+    public boolean dryRun() {
+        return dryRun;
+    }
+
+    /** Returns whether this task is allowed to delete manifest files. */
+    public boolean allowDeleteManifest() {
+        return allowDeleteManifest;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleaner.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleaner.java
new file mode 100644
index 0000000000..a1e13cf424
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/BucketCleaner.java
@@ -0,0 +1,180 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.fs.SafeDeleter;
+import org.apache.fluss.flink.action.orphan.rule.BucketActiveRefs;
+import org.apache.fluss.flink.action.orphan.rule.Decision;
+import org.apache.fluss.flink.action.orphan.rule.FileMeta;
+import org.apache.fluss.flink.action.orphan.rule.FileRule;
+import org.apache.fluss.flink.action.orphan.rule.RuleDispatcher;
+import org.apache.fluss.fs.FileStatus;
+import org.apache.fluss.fs.FileSystem;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluss.utils.FlussPaths;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+
+/**
+ * Per-bucket orphan cleanup for live buckets: walks the provided bucket directories and dispatches
+ * each file to the appropriate {@link FileRule} using the caller-supplied active reference set.
+ *
+ * <p>All deletions go through {@link SafeDeleter} (no recursive deletes). Unknown file types are
+ * skipped with an audit warning per the design's "unknown-types-not-deleted" principle.
+ */
+@Internal
+public final class BucketCleaner {
+
+    private static final Logger LOG = LoggerFactory.getLogger(BucketCleaner.class);
+
+    private final RuleDispatcher dispatcher;
+    private final SafeDeleter safeDeleter;
+    private final AuditLogger audit;
+    private final long cutoffMillis;
+    private final RateLimiter remoteFsOpRateLimiter;
+
+    /**
+     * @param dispatcher selects the {@link FileRule} for each scanned file
+     * @param safeDeleter performs (or, in dry-run, records) every deletion
+     * @param audit receives the "skipped unknown file" records
+     * @param cutoffMillis the cutoff timestamp in milliseconds, passed to every rule and compared
+     *     against directory modification times
+     * @param remoteFsOpRateLimiter acquired once before each exists/list call made by this class
+     */
+    public BucketCleaner(
+            RuleDispatcher dispatcher,
+            SafeDeleter safeDeleter,
+            AuditLogger audit,
+            long cutoffMillis,
+            RateLimiter remoteFsOpRateLimiter) {
+        this.dispatcher = dispatcher;
+        this.safeDeleter = safeDeleter;
+        this.audit = audit;
+        this.cutoffMillis = cutoffMillis;
+        this.remoteFsOpRateLimiter = remoteFsOpRateLimiter;
+    }
+
+    /**
+     * Cleans one bucket's log/kv subtrees using the caller-supplied active reference set.
+     *
+     * @param activeRefs the active references handed to every rule evaluation
+     * @param bucketDirs the directories to walk, in order; {@code null} entries are skipped
+     * @return the counters accumulated over all walked directories
+     * @throws IOException if resolving a root's file system or checking its existence fails;
+     *     failures to list a directory during the walk are logged and skipped instead
+     */
+    public BucketCleanStats clean(BucketActiveRefs activeRefs, FsPath... bucketDirs)
+            throws IOException {
+        BucketCleanStats stats = BucketCleanStats.empty();
+        for (FsPath bucketDir : bucketDirs) {
+            if (bucketDir != null) {
+                walkAndCleanDir(bucketDir, activeRefs, stats);
+            }
+        }
+        return stats;
+    }
+
+    /**
+     * Walks {@code root} iteratively with an explicit stack, evaluating every file and removing
+     * directories that end up empty.
+     *
+     * <p>Each non-root directory is visited twice. On the first visit its files are evaluated and
+     * its sub-directories are pushed; a post-order marker for the directory itself is pushed
+     * beneath them, so the marker is popped only after all descendants were processed. On the
+     * marker visit the directory is removed if it is empty and its modification time (as seen when
+     * its parent was listed) was before the cutoff. The root itself is never removed.
+     */
+    private void walkAndCleanDir(FsPath root, BucketActiveRefs activeRefs, BucketCleanStats stats)
+            throws IOException {
+        FileSystem fs = root.getFileSystem();
+        remoteFsOpRateLimiter.acquire();
+        if (!fs.exists(root)) {
+            return;
+        }
+        // Loop-invariant: render the root once for the "is this the root?" check below.
+        String rootPath = root.toString();
+        Deque<DirVisit> stack = new ArrayDeque<>();
+        stack.push(new DirVisit(root, false, false));
+        while (!stack.isEmpty()) {
+            DirVisit visit = stack.pop();
+            if (visit.postOrder) {
+                // A removed directory is counted in both "deleted" and "emptyDirsRemoved".
+                if (visit.oldEnough && safeDeleter.deleteEmptyDir(visit.dir)) {
+                    stats.deleted++;
+                    stats.emptyDirsRemoved++;
+                }
+                continue;
+            }
+            FileStatus[] children;
+            try {
+                remoteFsOpRateLimiter.acquire();
+                children = fs.listStatus(visit.dir);
+            } catch (IOException e) {
+                // Best-effort: an unreadable directory is skipped, the rest of the walk goes on.
+                LOG.warn("Failed to list directory: {}", visit.dir, e);
+                continue;
+            }
+            if (children == null) {
+                continue;
+            }
+            if (!visit.dir.toString().equals(rootPath)) {
+                // Pushed before the children so that it is popped after all of them (LIFO).
+                stack.push(new DirVisit(visit.dir, true, visit.oldEnough));
+            }
+            for (FileStatus child : children) {
+                FsPath childPath = child.getPath();
+                if (!child.isDir()) {
+                    cleanFile(childPath, child, activeRefs, stats);
+                    continue;
+                }
+                // The shared KV snapshot directory is left untouched: not descended, not removed.
+                if (FlussPaths.REMOTE_KV_SNAPSHOT_SHARED_DIR.equals(childPath.getName())) {
+                    continue;
+                }
+                boolean oldEnough = child.getModificationTime() < cutoffMillis;
+                stack.push(new DirVisit(childPath, false, oldEnough));
+            }
+        }
+    }
+
+    /** Evaluates one file against its rule and applies the resulting decision. */
+    private void cleanFile(
+            FsPath path, FileStatus status, BucketActiveRefs activeRefs, BucketCleanStats stats)
+            throws IOException {
+        FileMeta meta = new FileMeta(path, status.getLen(), status.getModificationTime());
+        FileRule rule = dispatcher.dispatch(meta);
+        Decision decision = rule.evaluate(meta, activeRefs, cutoffMillis);
+        stats.scanned++;
+        switch (decision) {
+            case DELETE:
+                if (safeDeleter.deleteFile(meta.path(), decision, rule.id())) {
+                    stats.deleted++;
+                    stats.bytesReclaimed += meta.size();
+                } else {
+                    stats.deleteFailures++;
+                }
+                break;
+            case SKIP_UNKNOWN:
+                audit.logSkipUnknown(meta.path(), rule.id());
+                break;
+            case KEEP_ACTIVE:
+            case DEFER:
+                // no-op
+                break;
+            default:
+                // unknown decision — skip defensively
+                break;
+        }
+    }
+
+    /** Per-bucket cleanup statistics; a plain mutable holder updated in place during the walk. */
+    public static final class BucketCleanStats {
+        /** Files that were evaluated by a rule. */
+        public long scanned;
+
+        /** Successful deletions: files plus removed empty directories. */
+        public long deleted;
+
+        /** Empty directories that were removed. */
+        public long emptyDirsRemoved;
+
+        /** File deletions that were attempted but reported as failed. */
+        public long deleteFailures;
+
+        /** Sum of the sizes of the successfully deleted files. */
+        public long bytesReclaimed;
+
+        /** Returns a new instance with all counters at zero. */
+        public static BucketCleanStats empty() {
+            return new BucketCleanStats();
+        }
+    }
+
+    /** One entry of the traversal stack: a directory plus the phase in which to visit it. */
+    private static final class DirVisit {
+        private final FsPath dir;
+        /** {@code true} for the second (post-order) visit, when removal is attempted. */
+        private final boolean postOrder;
+        /** Whether the directory's modification time was before the cutoff. */
+        private final boolean oldEnough;
+
+        private DirVisit(FsPath dir, boolean postOrder, boolean oldEnough) {
+            this.dir = dir;
+            this.postOrder = postOrder;
+            this.oldEnough = oldEnough;
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanStats.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanStats.java
new file mode 100644
index 0000000000..cfecb0096e
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanStats.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+
+import java.io.Serializable;
+
+/**
+ * Immutable bundle of cleanup counters produced per task by each {@link ScanAndCleanFunction}
+ * subtask. {@link StatsAggregateOperator} combines instances by adding the scalar counters.
+ */
+@Internal
+public final class CleanStats implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    private final long scanned;
+    private final long deleted;
+    private final long emptyDirsRemoved;
+    private final long deleteFailures;
+    private final long bytesReclaimed;
+
+    /** Creates stats with {@code emptyDirsRemoved} set to zero. */
+    public CleanStats(long scanned, long deleted, long deleteFailures, long bytesReclaimed) {
+        this(scanned, deleted, 0L, deleteFailures, bytesReclaimed);
+    }
+
+    /** Creates stats with every counter given explicitly. */
+    public CleanStats(
+            long scanned,
+            long deleted,
+            long emptyDirsRemoved,
+            long deleteFailures,
+            long bytesReclaimed) {
+        this.scanned = scanned;
+        this.deleted = deleted;
+        this.emptyDirsRemoved = emptyDirsRemoved;
+        this.deleteFailures = deleteFailures;
+        this.bytesReclaimed = bytesReclaimed;
+    }
+
+    /** Returns stats in which all counters are zero. */
+    public static CleanStats empty() {
+        return new CleanStats(0L, 0L, 0L, 0L, 0L);
+    }
+
+    /** Returns the {@code scanned} counter. */
+    public long scanned() {
+        return scanned;
+    }
+
+    /** Returns the {@code deleted} counter. */
+    public long deleted() {
+        return deleted;
+    }
+
+    /** Returns the {@code emptyDirsRemoved} counter. */
+    public long emptyDirsRemoved() {
+        return emptyDirsRemoved;
+    }
+
+    /** Returns the {@code deleteFailures} counter. */
+    public long deleteFailures() {
+        return deleteFailures;
+    }
+
+    /** Returns the {@code bytesReclaimed} counter. */
+    public long bytesReclaimed() {
+        return bytesReclaimed;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanTask.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanTask.java
new file mode 100644
index 0000000000..69f691ce99
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/CleanTask.java
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+
+import java.io.Serializable;
+
+/**
+ * Marker interface for work items emitted by {@link ScopeEnumeratorFunction} and consumed by {@link
+ * ScanAndCleanFunction}. Implementations carry enough context for a single subtask to execute
+ * cleanup independently (no further coordinator interaction needed).
+ *
+ * <p>The interface declares no methods; it extends {@link Serializable}, so every implementation
+ * (such as {@link BucketCleanTask} and {@link OrphanDirCleanTask}) is serializable as well.
+ */
+@Internal
+public interface CleanTask extends Serializable {}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanDirCleanTask.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanDirCleanTask.java
new file mode 100644
index 0000000000..cd564e5b78
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanDirCleanTask.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+
+/**
+ * Work item describing one orphan table or partition directory to clean. It is emitted by {@link
+ * ScopeEnumeratorFunction} for a directory already flagged as an orphan candidate (ID guard passed).
+ */
+@Internal
+public final class OrphanDirCleanTask implements CleanTask {
+
+    private static final long serialVersionUID = 1L;
+
+    private final String dirPath; // directory to clean, carried as a plain string
+    private final long cutoffMillis; // entries modified at or after this instant are left alone
+    private final boolean dryRun;
+    private final boolean allowDeleteManifest;
+
+    public OrphanDirCleanTask(
+            String dirPath, long cutoffMillis, boolean dryRun, boolean allowDeleteManifest) {
+        this.allowDeleteManifest = allowDeleteManifest;
+        this.dryRun = dryRun;
+        this.cutoffMillis = cutoffMillis;
+        this.dirPath = dirPath;
+    }
+
+    public String dirPath() {
+        return this.dirPath;
+    }
+
+    public long cutoffMillis() {
+        return this.cutoffMillis;
+    }
+
+    public boolean dryRun() {
+        return this.dryRun;
+    }
+
+    public boolean allowDeleteManifest() {
+        return this.allowDeleteManifest;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanFilesCleanJob.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanFilesCleanJob.java
new file mode 100644
index 0000000000..3008715c14
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/OrphanFilesCleanJob.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.config.OrphanCleanConfig;
+
+import org.apache.flink.api.common.RuntimeExecutionMode;
+import org.apache.flink.api.common.typeinfo.TypeHint;
+import org.apache.flink.api.common.typeinfo.TypeInformation;
+import org.apache.flink.streaming.api.datastream.DataStream;
+import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
+import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
+
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * Builds and executes the 3-stage Flink Batch DAG for orphan files cleanup.
+ *
+ * <pre>
+ * Stage 1: ScopeEnumerator (p=1) — coordinator RPCs, emits CleanTask
+ * Stage 2: ScanAndClean (p=N) — FS scan + rate-limited delete, emits CleanStats
+ * Stage 3: StatsAggregate (p=1) — merge stats, emits final CleanStats
+ * </pre>
+ */
+@Internal
+public final class OrphanFilesCleanJob {
+
+    private OrphanFilesCleanJob() {}
+
+    /**
+     * Builds the DAG, executes it in batch mode, and returns the final aggregated cleanup
+     * statistics. Returns {@link CleanStats#empty()} when the job produced no result record.
+     *
+     * @param env the Flink execution environment (caller configures classpath, etc.)
+     * @param config parsed orphan cleanup configuration
+     * @param parallelism the parallelism for Stage 2 (ScanAndClean); null uses env default
+     * @return the final cleanup statistics
+     */
+    public static CleanStats execute(
+            StreamExecutionEnvironment env, OrphanCleanConfig config, Integer parallelism)
+            throws Exception {
+        env.setRuntimeMode(RuntimeExecutionMode.BATCH);
+
+        // Stage 1: ScopeEnumerator (parallelism=1), driven by a single dummy trigger element
+        DataStream<Integer> trigger =
+                env.fromCollection(Collections.singletonList(1), TypeInformation.of(Integer.class));
+
+        SingleOutputStreamOperator<CleanTask> tasks =
+                trigger.process(new ScopeEnumeratorFunction(config))
+                        .returns(TypeInformation.of(new TypeHint<CleanTask>() {}))
+                        .setParallelism(1)
+                        .setMaxParallelism(1)
+                        .name("ScopeEnumerator");
+
+        // Stage 2: ScanAndClean (parallelism=N)
+        SingleOutputStreamOperator<CleanStats> stats =
+                tasks.rebalance()
+                        .process(
+                                new ScanAndCleanFunction(
+                                        config.remoteFsOpRateLimitPerSecond(),
+                                        config.extraConfigs()))
+                        .returns(TypeInformation.of(new TypeHint<CleanStats>() {}))
+                        .name("ScanAndClean");
+        if (parallelism != null) {
+            stats = stats.setParallelism(parallelism);
+        }
+
+        // Stage 3: StatsAggregate (parallelism=1)
+        SingleOutputStreamOperator<CleanStats> result =
+                stats.transform(
+                                "StatsAggregate",
+                                TypeInformation.of(new TypeHint<CleanStats>() {}),
+                                new StatsAggregateOperator(config.dryRun()))
+                        .setParallelism(1)
+                        .setMaxParallelism(1);
+
+        // Execute and collect the single result
+        List<CleanStats> collected = collectResults(result);
+        return collected.isEmpty() ? CleanStats.empty() : collected.get(0);
+    }
+
+    /** Runs the job and drains the result {@link Iterator}, closing it to free job resources. */
+    @SuppressWarnings("deprecation")
+    private static List<CleanStats> collectResults(DataStream<CleanStats> result) throws Exception {
+        List<CleanStats> results = new java.util.ArrayList<>();
+        try (org.apache.flink.util.CloseableIterator<CleanStats> iterator =
+                result.executeAndCollect("OrphanFilesClean")) {
+            while (iterator.hasNext()) {
+                results.add(iterator.next());
+            }
+        }
+        return results;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScanAndCleanFunction.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScanAndCleanFunction.java
new file mode 100644
index 0000000000..85a64d349b
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScanAndCleanFunction.java
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.fs.SafeDeleter;
+import org.apache.fluss.flink.action.orphan.rule.BucketActiveRefs;
+import org.apache.fluss.flink.action.orphan.rule.Decision;
+import org.apache.fluss.flink.action.orphan.rule.FileMeta;
+import org.apache.fluss.flink.action.orphan.rule.FileRule;
+import org.apache.fluss.flink.action.orphan.rule.RuleDispatcher;
+import org.apache.fluss.fs.FileStatus;
+import org.apache.fluss.fs.FileSystem;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+
+import org.apache.flink.streaming.api.functions.ProcessFunction;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.util.ArrayDeque;
+import java.util.Deque;
+import java.util.Map;
+
+/**
+ * Stage 2 of the orphan files cleanup job. Runs at user-configured parallelism (N) and performs
+ * pure FS operations — no coordinator RPC interaction.
+ *
+ * <p>Each subtask processes assigned {@link CleanTask} items serially:
+ *
+ * <ul>
+ *   <li>{@link BucketCleanTask}: second-reads manifests from object storage to build the active
+ *       reference set, then walks log/kv directories and deletes orphan files and old empty child
+ *       directories.
+ *   <li>{@link OrphanDirCleanTask}: recursively walks the orphan directory and deletes all files
+ *       older than the cutoff, then removes old empty directories bottom-up.
+ * </ul>
+ *
+ * <p>Each task emits a single {@link CleanStats} containing scalar counters. The remote filesystem
+ * operation rate is split across subtasks ({@code configuredRate / runtimeParallelism}, at least
+ * 1/s each); tasks run serially within a subtask, so the limiter is never used concurrently.
+ */
+@Internal
+public final class ScanAndCleanFunction extends ProcessFunction<CleanTask, CleanStats> {
+
+    private static final long serialVersionUID = 1L;
+    private static final Logger LOG = LoggerFactory.getLogger(ScanAndCleanFunction.class);
+
+    private final long remoteFsOpRateLimitPerSecond;
+    private final Map<String, String> extraConfigs;
+
+    private transient AuditLogger audit;
+    private transient RateLimiter remoteFsOpRateLimiter;
+
+    public ScanAndCleanFunction(
+            long remoteFsOpRateLimitPerSecond, Map<String, String> extraConfigs) {
+        this.remoteFsOpRateLimitPerSecond = remoteFsOpRateLimitPerSecond;
+        this.extraConfigs = extraConfigs;
+    }
+
+    @Override
+    public void open(org.apache.flink.api.common.functions.OpenContext openContext)
+            throws Exception {
+        super.open(openContext);
+        if (!extraConfigs.isEmpty()) {
+            FileSystem.initialize(Configuration.fromMap(extraConfigs), null);
+        }
+        audit = new AuditLogger();
+        int parallelism = getRuntimeContext().getTaskInfo().getNumberOfParallelSubtasks();
+        int subtaskIndex = getRuntimeContext().getTaskInfo().getIndexOfThisSubtask();
+        // Distribute the configured rate as base + 1 extra for the first `remainder` subtasks.
+        // Flink does not provide a cross-JVM limiter here, so this is a best-effort job-level
+        // target. Each subtask gets at least 1/s; if parallelism exceeds the configured rate, the
+        // effective aggregate can exceed the target by that floor.
+        remoteFsOpRateLimiter =
+                RateLimiter.create(
+                        perSubtaskRate(remoteFsOpRateLimitPerSecond, parallelism, subtaskIndex));
+    }
+
+    @Override
+    public void processElement(CleanTask task, Context ctx, Collector<CleanStats> out)
+            throws Exception {
+        if (task instanceof BucketCleanTask) {
+            out.collect(processBucketTask((BucketCleanTask) task));
+        } else if (task instanceof OrphanDirCleanTask) {
+            out.collect(processOrphanDirTask((OrphanDirCleanTask) task));
+        }
+    }
+
+    // -------------------------------------------------------------------------
+    // BucketCleanTask processing
+    // -------------------------------------------------------------------------
+    /** Cleans one live bucket's log/kv tablet directories against its active reference set. */
+    private CleanStats processBucketTask(BucketCleanTask task) throws IOException {
+        FsPath logDir = task.logTabletDir() != null ? new FsPath(task.logTabletDir()) : null;
+        FsPath kvDir = task.kvTabletDir() != null ? new FsPath(task.kvTabletDir()) : null;
+
+        FsPath anyDir = logDir != null ? logDir : kvDir;
+        if (anyDir == null) {
+            return CleanStats.empty();
+        }
+
+        BucketActiveRefs activeRefs =
+                new BucketActiveRefs(
+                        task.logSegmentRelativePaths(),
+                        task.kvActiveSnapDirs(),
+                        task.logActiveManifestPaths());
+        RuleDispatcher dispatcher = new RuleDispatcher(task.allowDeleteManifest());
+        SafeDeleter safeDeleter = createSafeDeleter(anyDir.getFileSystem(), task.dryRun());
+        BucketCleaner cleaner =
+                new BucketCleaner(
+                        dispatcher, safeDeleter, audit, task.cutoffMillis(), remoteFsOpRateLimiter);
+
+        BucketCleaner.BucketCleanStats bucketStats = cleaner.clean(activeRefs, logDir, kvDir);
+
+        return new CleanStats(
+                bucketStats.scanned,
+                bucketStats.deleted,
+                bucketStats.emptyDirsRemoved,
+                bucketStats.deleteFailures,
+                bucketStats.bytesReclaimed);
+    }
+
+    // -------------------------------------------------------------------------
+    // OrphanDirCleanTask processing
+    // -------------------------------------------------------------------------
+    /** Walks an orphan directory depth-first, deleting old files and then old empty directories. */
+    private CleanStats processOrphanDirTask(OrphanDirCleanTask task) throws IOException {
+        FsPath dirPath = new FsPath(task.dirPath());
+        FileSystem fs = dirPath.getFileSystem();
+        remoteFsOpRateLimiter.acquire();
+        if (!fs.exists(dirPath)) {
+            return CleanStats.empty();
+        }
+
+        SafeDeleter safeDeleter = createSafeDeleter(fs, task.dryRun());
+        RuleDispatcher dispatcher = new RuleDispatcher(task.allowDeleteManifest(), true);
+
+        long scanned = 0L;
+        long deleted = 0L;
+        long emptyDirsRemoved = 0L;
+        long deleteFailures = 0L;
+        long bytesReclaimed = 0L;
+
+        remoteFsOpRateLimiter.acquire();
+        FileStatus rootStatus = fs.getFileStatus(dirPath);
+        Deque<DirVisit> stack = new ArrayDeque<>();
+        stack.push(
+                new DirVisit(
+                        dirPath,
+                        false,
+                        rootStatus.isDir()
+                                && rootStatus.getModificationTime() < task.cutoffMillis()));
+        while (!stack.isEmpty()) {
+            DirVisit visit = stack.pop();
+            if (visit.postOrder) {
+                if (visit.oldEnough && safeDeleter.deleteEmptyDir(visit.dir)) {
+                    deleted++;
+                    emptyDirsRemoved++;
+                }
+                continue;
+            }
+            FileStatus[] children;
+            try {
+                remoteFsOpRateLimiter.acquire();
+                children = fs.listStatus(visit.dir);
+            } catch (IOException e) {
+                LOG.warn("Failed to list directory: {}", visit.dir, e);
+                continue;
+            }
+            if (children == null) {
+                continue;
+            }
+            stack.push(new DirVisit(visit.dir, true, visit.oldEnough));
+            for (FileStatus child : children) {
+                FsPath childPath = child.getPath();
+                if (child.isDir()) {
+                    stack.push(
+                            new DirVisit(
+                                    childPath,
+                                    false,
+                                    child.getModificationTime() < task.cutoffMillis()));
+                    continue;
+                }
+                scanned++;
+                if (child.getModificationTime() >= task.cutoffMillis()) {
+                    continue;
+                }
+                FileMeta meta =
+                        new FileMeta(childPath, child.getLen(), child.getModificationTime());
+                FileRule rule = dispatcher.dispatch(meta);
+                Decision decision =
+                        rule.evaluate(meta, BucketActiveRefs.empty(), task.cutoffMillis());
+                switch (decision) {
+                    case DELETE:
+                        if (safeDeleter.deleteFile(meta.path(), decision, rule.id())) {
+                            deleted++;
+                            bytesReclaimed += meta.size();
+                        } else {
+                            deleteFailures++;
+                        }
+                        break;
+                    case SKIP_UNKNOWN:
+                        audit.logSkipUnknown(meta.path(), rule.id());
+                        break;
+                    case KEEP_ACTIVE:
+                    case DEFER:
+                    default:
+                        break;
+                }
+            }
+        }
+
+        return new CleanStats(scanned, deleted, emptyDirsRemoved, deleteFailures, bytesReclaimed);
+    }
+
+    // -------------------------------------------------------------------------
+    // Helpers
+    // -------------------------------------------------------------------------
+
+    private SafeDeleter createSafeDeleter(FileSystem fs, boolean dryRun) {
+        return new SafeDeleter(fs, dryRun, audit, remoteFsOpRateLimiter);
+    }
+
+    private static double perSubtaskRate(long totalRate, int parallelism, int subtaskIndex) {
+        long base = totalRate / parallelism;
+        long remainder = totalRate % parallelism;
+        long quota = base + (subtaskIndex < remainder ? 1L : 0L); // spread the remainder
+        return Math.max(1.0, (double) quota); // never below one operation per second
+    }
+
+    private static final class DirVisit {
+        private final FsPath dir;
+        private final boolean postOrder; // true once the children have been scheduled
+        private final boolean oldEnough; // directory modification time is before the cutoff
+
+        private DirVisit(FsPath dir, boolean postOrder, boolean oldEnough) {
+            this.dir = dir;
+            this.postOrder = postOrder;
+            this.oldEnough = oldEnough;
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScopeEnumeratorFunction.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScopeEnumeratorFunction.java
new file mode 100644
index 0000000000..eede19cc68
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/ScopeEnumeratorFunction.java
@@ -0,0 +1,662 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.exception.DisconnectException;
+import org.apache.fluss.exception.NetworkException;
+import org.apache.fluss.exception.UnsupportedVersionException;
+import org.apache.fluss.flink.action.orphan.OrphanCleanUtils;
+import org.apache.fluss.flink.action.orphan.RpcErrorClassifier;
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.build.ActiveRefsFetcher;
+import org.apache.fluss.flink.action.orphan.build.KvActiveRefsFetchResult;
+import org.apache.fluss.flink.action.orphan.build.LogActiveRefsFetchResult;
+import org.apache.fluss.flink.action.orphan.build.MaxKnownIdsTracker;
+import org.apache.fluss.flink.action.orphan.config.OrphanCleanConfig;
+import org.apache.fluss.flink.action.orphan.rule.OrphanDirDetector;
+import org.apache.fluss.fs.FileStatus;
+import org.apache.fluss.fs.FileSystem;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.metadata.PartitionInfo;
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluss.utils.ExceptionUtils;
+import org.apache.fluss.utils.FlussPaths;
+
+import org.apache.flink.streaming.api.functions.ProcessFunction;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.function.Consumer;
+import java.util.function.Predicate;
+
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.enumerateBuckets;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.fetchClusterConfigMap;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.normalizeRoot;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.physicalPath;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.remoteSubDir;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.resolveClusterRemoteDataDir;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.resolveClusterRemoteDataDirs;
+import static org.apache.fluss.flink.action.orphan.OrphanCleanUtils.resolveRemoteDataDir;
+
+/**
+ * Stage 1 of the orphan files cleanup job. Runs at parallelism=1 and concentrates all coordinator
+ * RPC interaction in a single subtask.
+ *
+ * <p>For each live bucket, emits a {@link BucketCleanTask} containing the FS paths and manifest
+ * locations needed for Stage 2 to execute cleanup without coordinator access. For each detected
+ * orphan directory, emits an {@link OrphanDirCleanTask}.
+ */
+@Internal
+public final class ScopeEnumeratorFunction extends ProcessFunction {
+
+ private static final long serialVersionUID = 1L;
+ private static final Logger LOG = LoggerFactory.getLogger(ScopeEnumeratorFunction.class);
+ private static final String[] TOP_LEVEL_DIRS = {
+ FlussPaths.REMOTE_LOG_DIR_NAME, FlussPaths.REMOTE_KV_DIR_NAME
+ };
+
+ private final OrphanCleanConfig config;
+
+ public ScopeEnumeratorFunction(OrphanCleanConfig config) {
+ this.config = config; // read later by processElement and the scope-enumeration helpers
+ }
+
+    /** Connects to the cluster, enumerates the cleanup scope and emits the resulting tasks. */
+    @Override
+    public void processElement(Integer trigger, Context ctx, Collector<CleanTask> out)
+            throws Exception {
+        if (!config.extraConfigs().isEmpty()) {
+            FileSystem.initialize(Configuration.fromMap(config.extraConfigs()), null);
+        }
+
+        Configuration flussConfig = new Configuration();
+        flussConfig.setString(ConfigOptions.BOOTSTRAP_SERVERS.key(), config.bootstrapServer());
+        // Pass through client-related extra configs (e.g. security/auth).
+        for (Map.Entry<String, String> entry : config.extraConfigs().entrySet()) {
+            if (entry.getKey().startsWith("client.")) {
+                flussConfig.setString(entry.getKey(), entry.getValue());
+            }
+        }
+
+        try (Connection connection = ConnectionFactory.createConnection(flussConfig);
+                Admin admin = connection.getAdmin()) {
+            // Fail fast on incompatible servers: the action jar may be deployed against an older
+            // cluster that does not implement ListRemoteLogManifests / ListKvSnapshots. Without
+            // this guard every per-target fetch would degrade to skip_log_target / skip_kv_target
+            // audit events and the job would exit "successfully" with deleted=0, hiding the issue.
+            verifyServerSupportsRequiredApis(admin);
+
+            AuditLogger audit = new AuditLogger();
+            audit.logCutoff(config.olderThanMillis());
+
+            RateLimiter remoteFsOpRateLimiter =
+                    RateLimiter.create((double) config.remoteFsOpRateLimitPerSecond());
+            ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, 3, remoteFsOpRateLimiter);
+            MaxKnownIdsTracker tracker = new MaxKnownIdsTracker();
+            Map<String, String> clusterConfigMap = fetchClusterConfigMap(admin);
+            String clusterRemoteDataDir = resolveClusterRemoteDataDir(clusterConfigMap);
+            List<String> clusterRoots =
+                    normalizeRoots(resolveClusterRemoteDataDirs(clusterConfigMap));
+
+            Map<String, DbScanState> dbStates = enumerateActiveScope(admin, audit, tracker);
+
+            for (DbScanState dbState : dbStates.values()) {
+                for (LiveTableScope liveTable : dbState.liveTables) {
+                    emitBucketTasks(
+                            liveTable, fetcher, audit, clusterRemoteDataDir, clusterRoots, out);
+                    emitOrphanPartitionDirTasks(
+                            liveTable, tracker, clusterRoots, audit, remoteFsOpRateLimiter, out);
+                }
+                emitOrphanTableDirTasks(
+                        dbState, tracker, clusterRoots, audit, remoteFsOpRateLimiter, out);
+            }
+        }
+    }
+
+    /** Normalizes each root in the list and returns a deduplicated, insertion-ordered list. */
+    private static List<String> normalizeRoots(List<String> roots) {
+        LinkedHashSet<String> normalized = new LinkedHashSet<>();
+        for (String root : roots) {
+            normalized.add(normalizeRoot(root));
+        }
+        return new ArrayList<>(normalized);
+    }
+
+    /**
+     * Verifies up front that the connected server implements both RPCs this action relies on.
+     *
+     * <p>Each RPC is called with a sentinel {@code tableId} of {@link Long#MAX_VALUE}. A compatible
+     * server simply fails the call with a benign error (typically table-not-found), whereas an
+     * incompatible server raises {@link UnsupportedVersionException} during ApiVersions
+     * negotiation. Any other outcome is treated as proof that the RPC is recognized.
+     */
+    private static void verifyServerSupportsRequiredApis(Admin admin) {
+        final long sentinel = Long.MAX_VALUE;
+        ThrowingProbe logProbe = () -> admin.listRemoteLogManifests(sentinel, null).get();
+        ThrowingProbe kvProbe = () -> admin.listKvSnapshots(sentinel, null).get();
+        probeApi("ListRemoteLogManifests", logProbe);
+        probeApi("ListKvSnapshots", kvProbe);
+    }
+
+    /** Runs {@code probe}; fails fast if the server lacks {@code apiName} or is unreachable. */
+    private static void probeApi(String apiName, ThrowingProbe probe) {
+        try {
+            probe.run();
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            throw new IllegalStateException("Interrupted while probing " + apiName + " RPC.", e);
+        } catch (Exception e) {
+            if (isUnsupportedVersion(e)) {
+                throw new UnsupportedOperationException(
+                        "Orphan files cleanup requires the Fluss server to support the " + apiName
+                                + " RPC, which the connected cluster does not. Upgrade the"
+                                + " cluster to a version that exposes this RPC, or run an"
+                                + " older orphan-files-cleanup action that targets this server.",
+                        e);
+            }
+            if (isConnectionFailure(e)) {
+                throw new IllegalStateException(
+                        "Failed to connect to Fluss cluster while probing " + apiName
+                                + " RPC. The bootstrap server may be unreachable.", e);
+            }
+            // Any other exception means the RPC is recognized; only the sentinel id was rejected.
+        }
+    }
+
+    /** True if the failure chain of {@code t} contains a network, disconnect or I/O exception. */
+    private static boolean isConnectionFailure(Throwable t) {
+        // Start from whatever ExceptionUtils.stripExecutionException yields, then follow causes.
+        Throwable link = ExceptionUtils.stripExecutionException(t);
+        for (; link != null; link = link.getCause()) {
+            boolean net = link instanceof NetworkException || link instanceof DisconnectException;
+            if (net || link instanceof IOException) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** True if {@code t} or any of its causes is an {@link UnsupportedVersionException}. */
+    private static boolean isUnsupportedVersion(Throwable t) {
+        for (Throwable link = t; link != null; link = link.getCause()) {
+            if (link instanceof UnsupportedVersionException) {
+                return true;
+            }
+        }
+        // Reached the end of the cause chain without a match.
+        return false;
+    }
+
+ @FunctionalInterface
+ private interface ThrowingProbe {
+ void run() throws Exception; // may throw any checked exception; probeApi inspects the failure
+ }
+
+ // -------------------------------------------------------------------------
+ // Scope enumeration (coordinator RPCs only)
+ // -------------------------------------------------------------------------
+
+    private Map<String, DbScanState> enumerateActiveScope(
+            Admin admin, AuditLogger audit, MaxKnownIdsTracker tracker) {
+        List<String> dbs = resolveDatabasesToScan(admin, audit);
+        Map<String, DbScanState> result = new LinkedHashMap<>(); // keeps database order
+        for (String dbName : dbs) {
+            DbScanState dbState = new DbScanState(dbName);
+            result.put(dbName, dbState);
+            if (config.table().isPresent()) {
+                dbState.tableInfosComplete = false; // only one table is resolved for this db
+                resolveTable(admin, audit, tracker, dbState, config.table().get(), true);
+                continue;
+            }
+            List<String> tableNames;
+            try {
+                tableNames = admin.listTables(dbName).get();
+            } catch (Exception e) {
+                audit.logSkipDb(dbName, classifyName(e));
+                dbState.tableInfosComplete = false; // listing failed, table set is unknown
+                continue;
+            }
+            for (String tableName : tableNames) {
+                resolveTable(admin, audit, tracker, dbState, tableName, false);
+            }
+        }
+        return result;
+    }
+
+    /** Resolves the databases in scope: all of them, or the configured one if it exists. */
+    private List<String> resolveDatabasesToScan(Admin admin, AuditLogger audit) {
+        if (config.allDatabases()) {
+            try {
+                return admin.listDatabases().get();
+            } catch (Exception e) {
+                audit.logSkipDb("*", classifyName(e));
+                throw new IllegalStateException(
+                        "Failed to list databases from Fluss cluster. "
+                                + "The coordinator server may be unreachable.",
+                        e);
+            }
+        }
+        String databaseName = config.database().get();
+        try {
+            if (admin.databaseExists(databaseName).get()) {
+                return Collections.singletonList(databaseName);
+            }
+        } catch (Exception e) {
+            audit.logSkipDb(databaseName, classifyName(e));
+            throw new IllegalStateException(
+                    "Failed to check existence of database '"
+                            + databaseName
+                            + "'. The coordinator server may be unreachable.",
+                    e);
+        }
+        audit.logSkipDb(databaseName, RpcErrorClassifier.Category.NOT_FOUND.name());
+        return Collections.emptyList();
+    }
+
+    private void resolveTable(
+            Admin admin,
+            AuditLogger audit,
+            MaxKnownIdsTracker tracker,
+            DbScanState dbState,
+            String tableName,
+            boolean explicitTableTarget) {
+        TablePath tablePath = TablePath.of(dbState.dbName, tableName);
+        TableInfo tableInfo;
+        try {
+            tableInfo = admin.getTableInfo(tablePath).get();
+        } catch (Exception e) {
+            RpcErrorClassifier.Category category = RpcErrorClassifier.classify(e);
+            // A NOT_FOUND for a table taken from a listing is skipped without an audit entry.
+            if (category != RpcErrorClassifier.Category.NOT_FOUND || explicitTableTarget) {
+                audit.logSkipTable(dbState.dbName, tableName, category.name());
+                dbState.tableInfosComplete = false;
+            }
+            return;
+        }
+        tracker.observeTableId(tableInfo.getTableId());
+        dbState.activeTableIds.add(tableInfo.getTableId());
+        LiveTableScope liveTable = new LiveTableScope(dbState.dbName, tableName, tableInfo);
+        dbState.liveTables.add(liveTable);
+        if (!tableInfo.isPartitioned()) {
+            return;
+        }
+        try {
+            List<PartitionInfo> partitions = admin.listPartitionInfos(tablePath).get();
+            // Re-read the table: a changed id means the partition list belongs to another table.
+            TableInfo confirm = admin.getTableInfo(tablePath).get();
+            if (confirm.getTableId() != tableInfo.getTableId()) {
+                audit.logSkipTable(dbState.dbName, tableName, "table-recreated-during-enumeration");
+                liveTable.partitionInfosComplete = false;
+                return;
+            }
+            for (PartitionInfo partition : partitions) {
+                liveTable.partitions.add(partition);
+                liveTable.activePartitionIds.add(partition.getPartitionId());
+                tracker.observePartitionId(partition.getPartitionId());
+            }
+        } catch (Exception e) {
+            audit.logSkipPartitionList(dbState.dbName, tableName, classifyName(e));
+            liveTable.partitionInfosComplete = false;
+        }
+    }
+
+ // -------------------------------------------------------------------------
+ // Emit BucketCleanTasks (per-target RPC + per-bucket task emission)
+ // -------------------------------------------------------------------------
+
+    /** Emits bucket tasks for the table itself, or for each of its partitions if partitioned. */
+    private void emitBucketTasks(
+            LiveTableScope liveTable,
+            ActiveRefsFetcher fetcher,
+            AuditLogger audit,
+            @Nullable String clusterRemoteDataDir,
+            List<String> clusterRoots,
+            Collector<CleanTask> out) {
+        if (liveTable.partitioned && !liveTable.partitionInfosComplete) {
+            return;
+        }
+        // A non-partitioned table is handled as a single target with a null partition.
+        List<PartitionInfo> partitionTargets =
+                liveTable.partitioned ? liveTable.partitions : Collections.singletonList(null);
+        for (PartitionInfo partitionInfo : partitionTargets) {
+            emitBucketTasksForTarget(
+                    liveTable,
+                    partitionInfo,
+                    fetcher,
+                    audit,
+                    clusterRemoteDataDir,
+                    clusterRoots,
+                    out);
+        }
+    }
+
+    private void emitBucketTasksForTarget(
+            LiveTableScope liveTable,
+            @Nullable PartitionInfo partitionInfo,
+            ActiveRefsFetcher fetcher,
+            AuditLogger audit,
+            @Nullable String clusterRemoteDataDir,
+            List<String> clusterRoots,
+            Collector<CleanTask> out) {
+        Long partitionId = partitionInfo == null ? null : partitionInfo.getPartitionId();
+
+        String remoteDataDir =
+                resolveRemoteDataDir(liveTable.tableInfo, partitionInfo, clusterRemoteDataDir);
+
+        // Scope guard: skip this target if its metadata-resolved root is not part of the
+        // cluster's configured remote data directories.
+        if (!clusterRoots.contains(normalizeRoot(remoteDataDir))) {
+            audit.logSkipBucketOutOfScope(liveTable.tableId, partitionId, remoteDataDir);
+            return;
+        }
+
+        LogActiveRefsFetchResult logResult =
+                fetcher.fetchLogActiveRefsByBucket(liveTable.tableId, partitionId);
+        if (!logResult.listOk()) {
+            audit.logSkipLogTarget(liveTable.tableId, partitionId, logResult.listFailureReason());
+        }
+
+        Map<Integer, Set<String>> kvActiveByBucket = Collections.emptyMap();
+        boolean kvTargetOk = false;
+        if (liveTable.tableInfo.hasPrimaryKey()) {
+            KvActiveRefsFetchResult kvResult =
+                    fetcher.fetchKvActiveSnapDirs(liveTable.tableId, partitionId);
+            if (kvResult.listOk()) {
+                kvActiveByBucket = kvResult.activeSnapDirsByBucket();
+                kvTargetOk = true;
+            } else {
+                audit.logSkipKvTarget(liveTable.tableId, partitionId, kvResult.listFailureReason());
+            }
+        }
+
+        FsPath remoteLogDir = remoteSubDir(remoteDataDir, FlussPaths.REMOTE_LOG_DIR_NAME);
+        FsPath remoteKvDir = remoteSubDir(remoteDataDir, FlussPaths.REMOTE_KV_DIR_NAME);
+
+        for (TableBucket tableBucket : enumerateBuckets(liveTable.tableInfo, partitionInfo)) {
+            int bucketId = tableBucket.getBucket();
+
+            String logTabletDir = null;
+            // Log side: only a RESOLVED bucket gets a log tablet dir and active references.
+            Set<String> logSegmentRelativePaths = Collections.emptySet();
+            Set<String> logActiveManifestPaths = Collections.emptySet();
+
+            if (logResult.listOk()) {
+                switch (logResult.statusFor(bucketId)) {
+                    case RESOLVED:
+                        logTabletDir =
+                                FlussPaths.remoteLogTabletDir(
+                                                remoteLogDir,
+                                                physicalPath(liveTable.tablePath, partitionInfo),
+                                                tableBucket)
+                                        .toString();
+                        logSegmentRelativePaths =
+                                logResult.activeRefsOf(bucketId).logSegmentRelativePaths();
+                        logActiveManifestPaths =
+                                logResult.activeRefsOf(bucketId).logActiveManifestPaths();
+                        break;
+                    case READ_FAILED:
+                        audit.logBucketAborted(
+                                OrphanCleanUtils.bucketScopeKey(
+                                        liveTable.tableId, partitionId, bucketId),
+                                logResult.readFailureReason(bucketId));
+                        break;
+                    case NOT_LISTED:
+                        audit.logSkipLogBucket(
+                                liveTable.tableId, partitionId, bucketId, "no_remote_manifest");
+                        break;
+                    default:
+                        break;
+                }
+            }
+
+            String kvTabletDir = null;
+            Set<String> kvActiveSnaps = Collections.emptySet();
+            if (kvTargetOk && kvActiveByBucket.containsKey(bucketId)) {
+                kvTabletDir =
+                        FlussPaths.remoteKvTabletDir(
+                                        remoteKvDir,
+                                        physicalPath(liveTable.tablePath, partitionInfo),
+                                        tableBucket)
+                                .toString();
+                kvActiveSnaps = kvActiveByBucket.get(bucketId);
+            } else if (kvTargetOk) {
+                audit.logSkipKvBucket(liveTable.tableId, partitionId, bucketId, "empty_active_set");
+            }
+
+            if (logTabletDir == null && kvTabletDir == null) {
+                continue;
+            }
+
+            out.collect(
+                    new BucketCleanTask(
+                            logTabletDir,
+                            kvTabletDir,
+                            logSegmentRelativePaths,
+                            logActiveManifestPaths,
+                            kvActiveSnaps,
+                            config.olderThanMillis(),
+                            config.dryRun(),
+                            config.allowDeleteManifest()));
+        }
+    }
+
+ // -------------------------------------------------------------------------
+ // Emit OrphanDirCleanTasks
+ // -------------------------------------------------------------------------
+
+    /**
+     * Scans the database directory of {@code dbState} under every cluster root and every entry
+     * of {@code TOP_LEVEL_DIRS}, and handles each child directory that {@link
+     * OrphanDirDetector#isOrphanTable} flags as an orphan table directory.
+     *
+     * <p>When {@code config.allowCleanOrphanTables()} is set, an {@link OrphanDirCleanTask} is
+     * emitted per orphan directory; otherwise the directory is only reported through {@code
+     * audit} with reason {@code "default-conservative"}. Nothing is scanned when the table
+     * listing of the database is incomplete.
+     *
+     * <p>NOTE(review): {@code out} carries no type argument in this view and its element type is
+     * not visible from here; confirm and restore it.
+     *
+     * @throws IOException if probing a database directory fails
+     */
+    private void emitOrphanTableDirTasks(
+            DbScanState dbState,
+            MaxKnownIdsTracker tracker,
+            List<String> clusterRoots,
+            AuditLogger audit,
+            RateLimiter remoteFsOpRateLimiter,
+            Collector out)
+            throws IOException {
+        if (!dbState.tableInfosComplete) {
+            audit.logSkipOrphanTableScan(dbState.dbName, "tableInfos-incomplete");
+            return;
+        }
+        Set<Long> activeTableIds = dbState.activeTableIds;
+        long maxKnownTableId = tracker.maxKnownTableId();
+        boolean emit = config.allowCleanOrphanTables();
+
+        Predicate<String> isOrphanTable =
+                dirName ->
+                        OrphanDirDetector.isOrphanTable(dirName, activeTableIds, maxKnownTableId);
+
+        // The scan is identical in both modes; only what happens to a detected orphan differs.
+        Consumer<FsPath> onOrphan;
+        if (emit) {
+            onOrphan =
+                    dir ->
+                            out.collect(
+                                    new OrphanDirCleanTask(
+                                            dir.toString(),
+                                            config.olderThanMillis(),
+                                            config.dryRun(),
+                                            config.allowDeleteManifest()));
+        } else {
+            onOrphan = dir -> audit.logSkipOrphanTable(dir, "default-conservative");
+        }
+
+        for (String root : clusterRoots) {
+            for (String topLevel : TOP_LEVEL_DIRS) {
+                FsPath dbDir = remoteSubDir(root, topLevel + "/" + dbState.dbName);
+                forEachOrphanDirUnderParent(dbDir, isOrphanTable, remoteFsOpRateLimiter, onOrphan);
+            }
+        }
+    }
+
+    /**
+     * Scans the table directory of {@code liveTable} under every cluster root and every entry of
+     * {@code TOP_LEVEL_DIRS}, and handles each child directory that {@link
+     * OrphanDirDetector#isOrphanPartition} flags as an orphan partition directory.
+     *
+     * <p>When {@code config.allowCleanOrphanPartitions()} is set, an {@link OrphanDirCleanTask}
+     * is emitted per orphan directory; otherwise the directory is only reported through {@code
+     * audit} with reason {@code "default-conservative"}. Nothing is scanned for non-partitioned
+     * tables or when the partition listing of the table is incomplete.
+     *
+     * <p>NOTE(review): {@code out} carries no type argument in this view and its element type is
+     * not visible from here; confirm and restore it.
+     *
+     * @throws IOException if probing a table directory fails
+     */
+    private void emitOrphanPartitionDirTasks(
+            LiveTableScope liveTable,
+            MaxKnownIdsTracker tracker,
+            List<String> clusterRoots,
+            AuditLogger audit,
+            RateLimiter remoteFsOpRateLimiter,
+            Collector out)
+            throws IOException {
+        if (!liveTable.partitioned || !liveTable.partitionInfosComplete) {
+            return;
+        }
+        Set<Long> activePartitionIds = liveTable.activePartitionIds;
+        long maxKnownPartitionId = tracker.maxKnownPartitionId();
+        boolean emit = config.allowCleanOrphanPartitions();
+
+        Predicate<String> isOrphanPartition =
+                dirName ->
+                        OrphanDirDetector.isOrphanPartition(
+                                dirName, activePartitionIds, maxKnownPartitionId);
+
+        // The scan is identical in both modes; only what happens to a detected orphan differs.
+        Consumer<FsPath> onOrphan;
+        if (emit) {
+            onOrphan =
+                    dir ->
+                            out.collect(
+                                    new OrphanDirCleanTask(
+                                            dir.toString(),
+                                            config.olderThanMillis(),
+                                            config.dryRun(),
+                                            config.allowDeleteManifest()));
+        } else {
+            onOrphan = dir -> audit.logSkipOrphanPartition(dir, "default-conservative");
+        }
+
+        for (String root : clusterRoots) {
+            for (String topLevel : TOP_LEVEL_DIRS) {
+                FsPath tableDir =
+                        FlussPaths.remoteTableDir(
+                                remoteSubDir(root, topLevel),
+                                liveTable.tablePath,
+                                liveTable.tableId);
+                forEachOrphanDirUnderParent(
+                        tableDir, isOrphanPartition, remoteFsOpRateLimiter, onOrphan);
+            }
+        }
+    }
+
+    /**
+     * Applies {@code action} to every direct child directory of {@code parentDir} whose name
+     * satisfies {@code isOrphan}.
+     *
+     * <p>Returns without doing anything when {@code parentDir} does not exist or when listing it
+     * fails (see {@code listStatuses}). Plain files are ignored.
+     *
+     * @param parentDir directory whose direct children are inspected
+     * @param isOrphan predicate evaluated on the child directory name
+     * @param remoteFsOpRateLimiter limiter consulted before each remote file system call
+     * @param action callback receiving the path of each matching child directory
+     * @throws IOException if the existence probe of {@code parentDir} fails
+     */
+    private void forEachOrphanDirUnderParent(
+            FsPath parentDir,
+            Predicate<String> isOrphan,
+            RateLimiter remoteFsOpRateLimiter,
+            Consumer<FsPath> action)
+            throws IOException {
+        FileSystem fs = getFileSystemIfExists(parentDir, remoteFsOpRateLimiter);
+        if (fs == null) {
+            return;
+        }
+        FileStatus[] children = listStatuses(fs, parentDir, remoteFsOpRateLimiter);
+        if (children == null) {
+            return;
+        }
+        for (FileStatus child : children) {
+            if (!child.isDir()) {
+                continue;
+            }
+            FsPath childPath = child.getPath();
+            if (isOrphan.test(childPath.getName())) {
+                action.accept(childPath);
+            }
+        }
+    }
+
+ // -------------------------------------------------------------------------
+ // Helpers
+ // -------------------------------------------------------------------------
+
+ /**
+ * Returns the {@code name()} of the classification that {@code RpcErrorClassifier.classify}
+ * assigns to {@code e}.
+ */
+ private static String classifyName(Throwable e) {
+ return RpcErrorClassifier.classify(e).name();
+ }
+
+    /**
+     * Resolves the file system of {@code dir} and probes whether {@code dir} exists on it.
+     *
+     * <p>One permit is acquired from {@code remoteFsOpRateLimiter} before the existence probe.
+     *
+     * @return the file system when {@code dir} exists, otherwise {@code null}
+     * @throws IOException if resolving the file system or the existence probe fails
+     */
+    @Nullable
+    private static FileSystem getFileSystemIfExists(FsPath dir, RateLimiter remoteFsOpRateLimiter)
+            throws IOException {
+        FileSystem fileSystem = dir.getFileSystem();
+        remoteFsOpRateLimiter.acquire();
+        if (!fileSystem.exists(dir)) {
+            return null;
+        }
+        return fileSystem;
+    }
+
+    /**
+     * Lists the entries of {@code dir}, acquiring one permit from {@code remoteFsOpRateLimiter}
+     * first.
+     *
+     * <p>Best effort: an {@link IOException} is logged at WARN level and reported to the caller
+     * as {@code null} instead of being propagated.
+     *
+     * @return the listing, or {@code null} when it failed
+     */
+    @Nullable
+    private static FileStatus[] listStatuses(
+            FileSystem fs, FsPath dir, RateLimiter remoteFsOpRateLimiter) {
+        FileStatus[] statuses;
+        try {
+            remoteFsOpRateLimiter.acquire();
+            statuses = fs.listStatus(dir);
+        } catch (IOException e) {
+            LOG.warn("Failed to list directory: {}", dir, e);
+            statuses = null;
+        }
+        return statuses;
+    }
+
+ // -------------------------------------------------------------------------
+ // Internal state classes
+ // -------------------------------------------------------------------------
+
+    /**
+     * Mutable per-database scan state.
+     *
+     * <p>{@code tableInfosComplete} starts as {@code true}; the orphan-table scan is skipped
+     * once it has been cleared.
+     */
+    private static final class DbScanState {
+        final String dbName;
+        boolean tableInfosComplete = true;
+        // IDs of the tables seen as live; consulted by the orphan-table check.
+        final Set<Long> activeTableIds = new LinkedHashSet<>();
+        // NOTE(review): element type inferred from the field name; confirm against the code
+        // that fills this list.
+        final List<LiveTableScope> liveTables = new ArrayList<>();
+
+        DbScanState(String dbName) {
+            this.dbName = dbName;
+        }
+    }
+
+    /**
+     * Mutable per-table scan state for a table that is live in the cluster.
+     *
+     * <p>{@code tablePath}, {@code tableId} and {@code partitioned} are copied from the given
+     * {@link TableInfo} at construction. {@code partitionInfosComplete} starts as {@code true};
+     * the orphan-partition scan is skipped once it has been cleared.
+     */
+    private static final class LiveTableScope {
+        final String dbName;
+        final String tableName;
+        final TablePath tablePath;
+        final long tableId;
+        final TableInfo tableInfo;
+        final boolean partitioned;
+        boolean partitionInfosComplete = true;
+        // NOTE(review): element type inferred from how partitions are handed to the bucket-task
+        // emitter; confirm against the code that fills this list.
+        final List<PartitionInfo> partitions = new ArrayList<>();
+        // IDs of the partitions seen as live; consulted by the orphan-partition check.
+        final Set<Long> activePartitionIds = new LinkedHashSet<>();
+
+        LiveTableScope(String dbName, String tableName, TableInfo tableInfo) {
+            this.dbName = dbName;
+            this.tableName = tableName;
+            this.tablePath = tableInfo.getTablePath();
+            this.tableId = tableInfo.getTableId();
+            this.tableInfo = tableInfo;
+            this.partitioned = tableInfo.isPartitioned();
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/StatsAggregateOperator.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/StatsAggregateOperator.java
new file mode 100644
index 0000000000..2e1686ddbf
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/job/StatsAggregateOperator.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+
+import org.apache.flink.streaming.api.operators.AbstractStreamOperator;
+import org.apache.flink.streaming.api.operators.BoundedOneInput;
+import org.apache.flink.streaming.api.operators.OneInputStreamOperator;
+import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
+
+/**
+ * Stage 3 of the orphan files cleanup job. Runs at parallelism=1 to aggregate per-subtask {@link
+ * CleanStats} records.
+ *
+ * <p>Implemented as a custom operator (not ProcessFunction) because {@code ProcessOperator} does
+ * not implement {@link BoundedOneInput} — the {@code endInput()} callback would never fire.
+ *
+ * <p>Scalar counters are accumulated into longs and the final summary is emitted in {@link
+ * #endInput()}.
+ */
+@Internal
+public final class StatsAggregateOperator extends AbstractStreamOperator<CleanStats>
+        implements OneInputStreamOperator<CleanStats, CleanStats>, BoundedOneInput {
+
+    private static final long serialVersionUID = 2L;
+
+    private final boolean dryRun;
+
+    // Running totals; transient, and reset in open().
+    private transient long scanned;
+    private transient long deleted;
+    private transient long emptyDirsRemoved;
+    private transient long deleteFailures;
+    private transient long bytesReclaimed;
+
+    /**
+     * @param dryRun forwarded unchanged to the audit summary written in {@link #endInput()}
+     */
+    public StatsAggregateOperator(boolean dryRun) {
+        this.dryRun = dryRun;
+    }
+
+    /** Resets all running totals to zero. */
+    @Override
+    public void open() throws Exception {
+        super.open();
+        scanned = 0L;
+        deleted = 0L;
+        emptyDirsRemoved = 0L;
+        deleteFailures = 0L;
+        bytesReclaimed = 0L;
+    }
+
+    /** Adds the counters of one incoming {@link CleanStats} record to the running totals. */
+    @Override
+    public void processElement(StreamRecord<CleanStats> element) {
+        CleanStats stats = element.getValue();
+        scanned += stats.scanned();
+        deleted += stats.deleted();
+        emptyDirsRemoved += stats.emptyDirsRemoved();
+        deleteFailures += stats.deleteFailures();
+        bytesReclaimed += stats.bytesReclaimed();
+    }
+
+    /**
+     * Writes the aggregated summary to the audit log and emits a single {@link CleanStats} record
+     * holding the totals.
+     */
+    @Override
+    public void endInput() {
+        AuditLogger audit = new AuditLogger();
+        CleanStats finalStats =
+                new CleanStats(scanned, deleted, emptyDirsRemoved, deleteFailures, bytesReclaimed);
+
+        // NOTE(review): the subtraction assumes CleanStats.deleted() also counts removed empty
+        // directories, so the second argument is the file-only count — confirm against
+        // CleanStats and AuditLogger#logSummary.
+        audit.logSummary(
+                scanned,
+                deleted - emptyDirsRemoved,
+                emptyDirsRemoved,
+                deleteFailures,
+                bytesReclaimed,
+                dryRun);
+
+        output.collect(new StreamRecord<>(finalStats));
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/BucketActiveRefs.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/BucketActiveRefs.java
new file mode 100644
index 0000000000..73a847dd75
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/BucketActiveRefs.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Immutable view of all active references for a single bucket / table partition.
+ *
+ * <p>The constructor copies each given set, so later changes to the caller's sets are not
+ * reflected here, and every getter returns an unmodifiable set.
+ */
+@Internal
+public final class BucketActiveRefs {
+
+    private static final BucketActiveRefs EMPTY =
+            new BucketActiveRefs(
+                    Collections.emptySet(), Collections.emptySet(), Collections.emptySet());
+
+    private final Set<String> logSegmentRelativePaths;
+    private final Set<String> kvActiveSnapDirs;
+    private final Set<String> logActiveManifestPaths;
+
+    /**
+     * Creates a view backed by defensive copies of the given sets.
+     *
+     * @param logSegmentRelativePaths active log segment files, matched by {@link LogSegmentRule}
+     *     as {@code <segment-dir-name>/<file-name>}
+     * @param kvActiveSnapDirs active KV snapshot directory names
+     * @param logActiveManifestPaths active log manifest paths
+     */
+    public BucketActiveRefs(
+            Set<String> logSegmentRelativePaths,
+            Set<String> kvActiveSnapDirs,
+            Set<String> logActiveManifestPaths) {
+        this.logSegmentRelativePaths =
+                Collections.unmodifiableSet(new HashSet<>(logSegmentRelativePaths));
+        this.kvActiveSnapDirs = Collections.unmodifiableSet(new HashSet<>(kvActiveSnapDirs));
+        this.logActiveManifestPaths =
+                Collections.unmodifiableSet(new HashSet<>(logActiveManifestPaths));
+    }
+
+    /** Returns the shared instance whose three sets are all empty. */
+    public static BucketActiveRefs empty() {
+        return EMPTY;
+    }
+
+    /** Returns the set of active log segment relative paths for the bucket. */
+    public Set<String> logSegmentRelativePaths() {
+        return logSegmentRelativePaths;
+    }
+
+    /**
+     * Returns the set of active {@code snap-<id>} directory names for the bucket.
+     *
+     * <p>The set is the union of two server-side categories the {@code ListKvSnapshots} RPC emits
+     * as one flat list (client does not distinguish):
+     *
+     * <ul>
+     *   <li>RETAINED — the most recent N completed snapshots kept per the retention window.
+     *   <li>STILL_IN_USE — snapshots pinned by an active lease; emitted unconditionally even when
+     *       the corresponding ZK znode has been removed, on the principle "may over-count active,
+     *       must never under-count."
+     * </ul>
+     *
+     * <p>A KV snap-private file is preserved iff its parent directory's name is in this set.
+     */
+    public Set<String> kvActiveSnapDirs() {
+        return kvActiveSnapDirs;
+    }
+
+    /**
+     * Returns the set of active log manifest paths reported by {@code ListRemoteLogManifests}. The
+     * "current" manifest for a bucket is always also a member of this set, so {@link
+     * LogManifestRule} only needs to check this single collection.
+     */
+    public Set<String> logActiveManifestPaths() {
+        return logActiveManifestPaths;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/Decision.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/Decision.java
new file mode 100644
index 0000000000..491281a22e
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/Decision.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+
+/**
+ * Decision returned by a {@link FileRule} for a given file.
+ *
+ * <p>Only {@link #DELETE} marks a file for removal; the other three values all leave the file in
+ * place and differ only in the reason.
+ */
+@Internal
+public enum Decision {
+
+ /** File is orphan and should be deleted. */
+ DELETE,
+
+ /** File is referenced by an active object (manifest, snapshot, etc.). */
+ KEEP_ACTIVE,
+
+ /**
+ * File is not in the active set but its age is under the {@code --older-than} threshold; the
+ * deletion verdict is deferred to a future cleanup round, by which time the file will either
+ * have entered the active set (KEEP_ACTIVE) or aged past the threshold (DELETE). The grace
+ * window prevents racing in-flight writes whose manifest entry has not yet been committed.
+ */
+ DEFER,
+
+ /** File path or extension is not recognized; skip without deletion. */
+ SKIP_UNKNOWN
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileMeta.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileMeta.java
new file mode 100644
index 0000000000..74072de4fa
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileMeta.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+
+/**
+ * Immutable metadata describing a candidate file evaluated by {@link FileRule}.
+ *
+ * <p>Plain value holder: the constructor stores its arguments as-is, without validation.
+ */
+@Internal
+public final class FileMeta {
+
+ private final FsPath path;
+ // NOTE(review): presumably the file length in bytes — confirm against the code that builds
+ // FileMeta instances.
+ private final long size;
+ // Compared by the rules in this package against their epoch-millisecond cutoff.
+ private final long modificationTime;
+
+ /**
+ * @param path location of the candidate file
+ * @param size size of the candidate file
+ * @param modificationTime modification time of the candidate file
+ */
+ public FileMeta(FsPath path, long size, long modificationTime) {
+ this.path = path;
+ this.size = size;
+ this.modificationTime = modificationTime;
+ }
+
+ /** Returns the path given at construction. */
+ public FsPath path() {
+ return path;
+ }
+
+ /** Returns the size given at construction. */
+ public long size() {
+ return size;
+ }
+
+ /** Returns the modification time given at construction. */
+ public long modificationTime() {
+ return modificationTime;
+ }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileRule.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileRule.java
new file mode 100644
index 0000000000..af9a01468a
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/FileRule.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+
+/** Rule that decides whether a single file is orphan. */
+@Internal
+public interface FileRule {
+
+ /** Stable identifier used in audit logs. */
+ RuleId id();
+
+ /**
+ * Decide what to do with the given file.
+ *
+ * @param file metadata of the candidate file
+ * @param activeRefs active references of the bucket the file belongs to
+ * @param cutoffMillis absolute epoch-ms cutoff: a file whose mtime is {@code < cutoffMillis} is
+ * age-eligible for deletion (a {@link Decision#DELETE}); a file whose mtime is {@code >=
+ * cutoffMillis} is {@link Decision#DEFER}red. Pre-frozen at action start; does not slide
+ * during a run.
+ * @return the verdict for {@code file}
+ */
+ Decision evaluate(FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis);
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRule.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRule.java
new file mode 100644
index 0000000000..8fc1e5b2c0
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRule.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.utils.FlussPaths;
+
+/**
+ * Rule for shared SST files under the {@code shared/} KV directory.
+ *
+ * <p>Always returns {@link Decision#KEEP_ACTIVE}. The true active set for shared SSTs lives inside
+ * the engine's {@code SharedKvFileRegistry}; orphan cleanup has no read path into that registry, so
+ * any deletion here would be a guess. Per the action's hard constraint "prefer leak over
+ * mis-delete," the rule never deletes, and as a consequence orphan PK-table / orphan-partition
+ * directories permanently retain their {@code shared/} subtree as accepted residue (recovering that
+ * residue would require a registry-backed GC channel that is out of scope for this action).
+ */
+@Internal
+public final class KvSharedSstRule implements FileRule {
+
+    @Override
+    public RuleId id() {
+        return RuleId.KV_SHARED_SST;
+    }
+
+    /**
+     * Returns {@link Decision#KEEP_ACTIVE} for a {@code .sst} file whose parent directory is the
+     * shared KV directory, and {@link Decision#SKIP_UNKNOWN} for anything else. Neither {@code
+     * activeRefs} nor {@code cutoffMillis} is consulted.
+     */
+    @Override
+    public Decision evaluate(FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis) {
+        FsPath sstPath = file.path();
+        FsPath parentDir = sstPath.getParent();
+        boolean underSharedDir =
+                parentDir != null
+                        && FlussPaths.REMOTE_KV_SNAPSHOT_SHARED_DIR.equals(parentDir.getName());
+        boolean sharedSst = underSharedDir && sstPath.getName().endsWith(".sst");
+        return sharedSst ? Decision.KEEP_ACTIVE : Decision.SKIP_UNKNOWN;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRule.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRule.java
new file mode 100644
index 0000000000..0700b9563f
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRule.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.utils.FlussPaths;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Rule for files under a {@code snap-<id>/} KV snapshot directory.
+ *
+ * <p>Match key is the file's parent {@code snap-<id>} directory name: if that name is in {@link
+ * BucketActiveRefs#kvActiveSnapDirs()} (which carries the per-bucket union of RETAINED +
+ * STILL_IN_USE entries from {@code ListKvSnapshots}, see that getter's javadoc) the file is {@link
+ * Decision#KEEP_ACTIVE}.
+ *
+ * <p>The set-based check is what prevents retained non-latest snapshots from being misclassified as
+ * orphan — e.g. with {@code kv.snapshot.num-retained=2}, {@code snap-9} is still active while
+ * {@code snap-10} is the latest.
+ *
+ * <p>The id part of the directory name must consist of ASCII digits {@code 0-9} only. Anything
+ * else, including other Unicode decimal digits, is treated as an unrecognized directory and
+ * yields {@link Decision#SKIP_UNKNOWN}, so a look-alike directory is never a deletion candidate.
+ */
+@Internal
+public final class KvSnapshotFileRule implements FileRule {
+
+    private static final String SNAP_DIR_PREFIX = FlussPaths.REMOTE_KV_SNAPSHOT_DIR_PREFIX;
+
+    private static final Set<String> KNOWN_FIXED_NAMES =
+            new HashSet<>(Arrays.asList("_METADATA", "CURRENT", "LOG", "IDENTITY"));
+
+    @Override
+    public RuleId id() {
+        return RuleId.KV_SNAPSHOT_FILE;
+    }
+
+    /**
+     * Classifies a file located directly inside a {@code snap-<id>} directory.
+     *
+     * @return {@link Decision#SKIP_UNKNOWN} when the parent is not a {@code snap-<id>} directory
+     *     or the file name is not a known snapshot file; {@link Decision#KEEP_ACTIVE} when the
+     *     parent directory name is in the active set; otherwise {@link Decision#DELETE} when the
+     *     file's modification time is before {@code cutoffMillis}, else {@link Decision#DEFER}
+     */
+    @Override
+    public Decision evaluate(FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis) {
+        FsPath parent = file.path().getParent();
+        if (parent == null) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        String parentName = parent.getName();
+        if (!parentName.startsWith(SNAP_DIR_PREFIX)) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        // Parent must be snap-<digits>; reject e.g. snap-, snap-abc.
+        if (!isAsciiDigits(parentName.substring(SNAP_DIR_PREFIX.length()))) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        if (!isKnownSnapshotFile(file.path().getName())) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        if (activeRefs.kvActiveSnapDirs().contains(parentName)) {
+            return Decision.KEEP_ACTIVE;
+        }
+
+        return file.modificationTime() < cutoffMillis ? Decision.DELETE : Decision.DEFER;
+    }
+
+    /**
+     * Returns {@code true} iff {@code s} is non-empty and every character is in {@code '0'..'9'}.
+     * {@link Character#isDigit(char)} is deliberately not used: it also accepts non-ASCII
+     * Unicode decimal digits.
+     */
+    private static boolean isAsciiDigits(String s) {
+        if (s.isEmpty()) {
+            return false;
+        }
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            if (c < '0' || c > '9') {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    /** Returns whether {@code fileName} matches one of the recognized snapshot file names. */
+    private static boolean isKnownSnapshotFile(String fileName) {
+        if (KNOWN_FIXED_NAMES.contains(fileName)) {
+            return true;
+        }
+        if (fileName.startsWith("MANIFEST-") || fileName.startsWith("OPTIONS-")) {
+            return true;
+        }
+        return fileName.endsWith(".sst") || fileName.endsWith(".log");
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRule.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRule.java
new file mode 100644
index 0000000000..23fb5d5edd
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRule.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.utils.FlussPaths;
+
+/**
+ * Rule for manifest files under the {@code metadata/} directory of a log bucket.
+ *
+ * <p>Default behavior is to return {@link Decision#KEEP_ACTIVE} for every manifest. The asymmetry
+ * is the reason: mis-deleting an active manifest leaves the coordinator's manifest pointer dangling
+ * and breaks the bucket's metadata chain entirely, while keeping orphan manifests is structurally
+ * harmless (KB-sized files). Operators opt into the destructive path via {@code
+ * allowDeleteManifest=true} (driven by the {@code --allow-delete-manifest} CLI flag); only then
+ * does the rule consult the active-manifest set and apply the file-level age threshold.
+ */
+@Internal
+public final class LogManifestRule implements FileRule {
+
+    private final boolean allowDeleteManifest;
+
+    /** Default-conservative constructor: {@code allowDeleteManifest=false}. */
+    public LogManifestRule() {
+        this(false);
+    }
+
+    /**
+     * @param allowDeleteManifest whether manifests outside the active set may be deleted
+     */
+    public LogManifestRule(boolean allowDeleteManifest) {
+        this.allowDeleteManifest = allowDeleteManifest;
+    }
+
+    @Override
+    public RuleId id() {
+        return RuleId.LOG_MANIFEST;
+    }
+
+    /**
+     * Classifies a {@code .manifest} file located directly inside the log metadata directory;
+     * any other file yields {@link Decision#SKIP_UNKNOWN}.
+     */
+    @Override
+    public Decision evaluate(FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis) {
+        FsPath manifestPath = file.path();
+        FsPath metadataDir = manifestPath.getParent();
+        if (metadataDir == null) {
+            return Decision.SKIP_UNKNOWN;
+        }
+        if (!FlussPaths.REMOTE_LOG_METADATA_DIR_NAME.equals(metadataDir.getName())) {
+            return Decision.SKIP_UNKNOWN;
+        }
+        if (!manifestPath.getName().endsWith(".manifest")) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        // Conservative default: a manifest is never deleted. A leftover orphan is harmless,
+        // whereas removing an active one leaves the coordinator's manifest pointer dangling and
+        // breaks the bucket's metadata chain.
+        if (!allowDeleteManifest) {
+            return Decision.KEEP_ACTIVE;
+        }
+
+        // Opt-in path: active-set membership first, then the age cutoff. The bucket's current
+        // manifest is always part of logActiveManifestPaths, so one lookup is enough.
+        boolean active = activeRefs.logActiveManifestPaths().contains(manifestPath.toString());
+        if (active) {
+            return Decision.KEEP_ACTIVE;
+        }
+        if (file.modificationTime() < cutoffMillis) {
+            return Decision.DELETE;
+        }
+        return Decision.DEFER;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRule.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRule.java
new file mode 100644
index 0000000000..1ac4156e8f
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRule.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.utils.FlussPaths;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Rule for log-segment files under a remote log bucket.
+ *
+ * <p>{@code .writer_snapshot} files are only eligible for deletion in orphan-directory mode. In
+ * active-bucket mode the engine's own TTL cleanup handles them; the orphan tool conservatively
+ * keeps them to avoid any risk of racing a concurrent write.
+ */
+@Internal
+public final class LogSegmentRule implements FileRule {
+
+    // A segment directory is named by a UUID in canonical 8-4-4-4-12 hex form.
+    private static final Pattern SEGMENT_DIR_PATTERN =
+            Pattern.compile(
+                    "[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}"
+                            + "-[0-9a-fA-F]{12}");
+
+    private static final Set<String> KNOWN_SUFFIXES =
+            new HashSet<>(Arrays.asList(".log", ".index", ".timeindex", ".writer_snapshot"));
+
+    private final boolean orphanDirMode;
+
+    /** Creates the rule in active-bucket mode ({@code orphanDirMode=false}). */
+    public LogSegmentRule() {
+        this(false);
+    }
+
+    /**
+     * @param orphanDirMode when {@code true}, writer-snapshot files outside the active set are
+     *     subject to the age cutoff like any other segment file instead of always being kept
+     */
+    public LogSegmentRule(boolean orphanDirMode) {
+        this.orphanDirMode = orphanDirMode;
+    }
+
+    @Override
+    public RuleId id() {
+        return RuleId.LOG_SEGMENT;
+    }
+
+    /**
+     * Classifies a file located directly inside a segment directory.
+     *
+     * @return {@link Decision#SKIP_UNKNOWN} when the parent is not a segment directory or the
+     *     file suffix is not recognized; {@link Decision#KEEP_ACTIVE} when {@code
+     *     <segment-dir>/<file>} is in the active set, or when the file is a writer snapshot and
+     *     the rule is not in orphan-directory mode; otherwise {@link Decision#DELETE} when the
+     *     file's modification time is before {@code cutoffMillis}, else {@link Decision#DEFER}
+     */
+    @Override
+    public Decision evaluate(FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis) {
+        FsPath path = file.path();
+        FsPath parent = path.getParent();
+        if (parent == null || !isSegmentDir(parent.getName()) || !hasKnownSuffix(path.getName())) {
+            return Decision.SKIP_UNKNOWN;
+        }
+
+        String relativePath = parent.getName() + "/" + path.getName();
+        if (activeRefs.logSegmentRelativePaths().contains(relativePath)) {
+            return Decision.KEEP_ACTIVE;
+        }
+
+        if (path.getName().endsWith(FlussPaths.WRITER_SNAPSHOT_FILE_SUFFIX) && !orphanDirMode) {
+            return Decision.KEEP_ACTIVE;
+        }
+
+        return file.modificationTime() < cutoffMillis ? Decision.DELETE : Decision.DEFER;
+    }
+
+    /** Returns whether {@code dirName} has the shape of a segment directory name. */
+    static boolean isSegmentDir(String dirName) {
+        return SEGMENT_DIR_PATTERN.matcher(dirName).matches();
+    }
+
+    /**
+     * Returns whether {@code fileName} ends with one of the known segment suffixes, ignoring one
+     * trailing {@code FlussPaths.DELETED_FILE_SUFFIX} if present.
+     */
+    private static boolean hasKnownSuffix(String fileName) {
+        String name = fileName;
+        if (name.endsWith(FlussPaths.DELETED_FILE_SUFFIX)) {
+            name = name.substring(0, name.length() - FlussPaths.DELETED_FILE_SUFFIX.length());
+        }
+        for (String suffix : KNOWN_SUFFIXES) {
+            if (name.endsWith(suffix)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/OrphanDirDetector.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/OrphanDirDetector.java
new file mode 100644
index 0000000000..5762ff51c2
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/OrphanDirDetector.java
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.annotation.VisibleForTesting;
+
+import javax.annotation.Nullable;
+
+import java.util.Set;
+
+/**
+ * Detects orphan table and partition directories by ID guard.
+ *
+ * <p>A directory is an orphan candidate iff its parsed ID is not in the active set and does not
+ * exceed the last-known maximum (conservatively treating IDs above the max as freshly allocated).
+ * Unrecognizable directory names are never flagged.
+ */
+@Internal
+public final class OrphanDirDetector {
+
+    /** Separator in front of the ID in a {@code {name}-{tableId}} directory name. */
+    private static final String TABLE_ID_SEPARATOR = "-";
+
+    /** Separator in front of the ID in a {@code {name}-p{partitionId}} directory name. */
+    private static final String PARTITION_ID_SEPARATOR = "-p";
+
+    private OrphanDirDetector() {}
+
+    /**
+     * Returns {@code true} if the directory name matches {@code {name}-{tableId}} and the parsed ID
+     * is not in {@code activeTableIds} and is {@code <= maxKnownTableId}.
+     *
+     * @param dirName name of the directory (last path component)
+     * @param activeTableIds IDs that must never be flagged
+     * @param maxKnownTableId largest ID that may be flagged; larger IDs are never orphans
+     */
+    public static boolean isOrphanTable(
+            String dirName, Set<Long> activeTableIds, long maxKnownTableId) {
+        return isOrphan(parseTableId(dirName), activeTableIds, maxKnownTableId);
+    }
+
+    /**
+     * Returns {@code true} if the directory name matches {@code {name}-p{partitionId}} and the
+     * parsed ID is not in {@code activePartitionIds} and is {@code <= maxKnownPartitionId}.
+     *
+     * @param dirName name of the directory (last path component)
+     * @param activePartitionIds IDs that must never be flagged
+     * @param maxKnownPartitionId largest ID that may be flagged; larger IDs are never orphans
+     */
+    public static boolean isOrphanPartition(
+            String dirName, Set<Long> activePartitionIds, long maxKnownPartitionId) {
+        return isOrphan(parsePartitionId(dirName), activePartitionIds, maxKnownPartitionId);
+    }
+
+    /**
+     * Parses the table ID from a {@code {name}-{tableId}} directory name.
+     *
+     * @return the ID, or {@code null} if the name is empty before the last {@code -}, has nothing
+     *     after it, contains a non ASCII-digit character in the ID part, or overflows a long
+     */
+    @VisibleForTesting
+    @Nullable
+    static Long parseTableId(String dirName) {
+        return parseIdSuffix(dirName, TABLE_ID_SEPARATOR);
+    }
+
+    /**
+     * Parses the partition ID from a {@code {name}-p{partitionId}} directory name.
+     *
+     * @return the ID, or {@code null} if the name is empty before the last {@code -p}, has nothing
+     *     after it, contains a non ASCII-digit character in the ID part, or overflows a long
+     */
+    @VisibleForTesting
+    @Nullable
+    static Long parsePartitionId(String dirName) {
+        return parseIdSuffix(dirName, PARTITION_ID_SEPARATOR);
+    }
+
+    /** Applies the ID guard: parseable, not active, and not above the last-known maximum. */
+    private static boolean isOrphan(
+            @Nullable Long parsedId, Set<Long> activeIds, long maxKnownId) {
+        return parsedId != null && !activeIds.contains(parsedId) && parsedId <= maxKnownId;
+    }
+
+    /** Parses the all-digit text that follows the last occurrence of {@code separator}. */
+    @Nullable
+    private static Long parseIdSuffix(String dirName, String separator) {
+        int separatorAt = dirName.lastIndexOf(separator);
+        int idStart = separatorAt + separator.length();
+        // Require a non-empty name before the separator and a non-empty ID after it.
+        if (separatorAt <= 0 || idStart == dirName.length()) {
+            return null;
+        }
+        for (int i = idStart; i < dirName.length(); i++) {
+            char c = dirName.charAt(i);
+            // ASCII digits only: Character.isDigit and Long.parseLong would also accept other
+            // Unicode digits, which must count as unrecognizable so they are never flagged.
+            if (c < '0' || c > '9') {
+                return null;
+            }
+        }
+        try {
+            return Long.parseLong(dirName.substring(idStart));
+        } catch (NumberFormatException e) {
+            // Only reachable when the digits overflow a long; treat as unrecognizable.
+            return null;
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleDispatcher.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleDispatcher.java
new file mode 100644
index 0000000000..9880c6e64d
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleDispatcher.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.utils.FlussPaths;
+
+/**
+ * Selects the orphan-cleanup {@link FileRule} for a candidate file, based solely on the name of
+ * the file's parent directory.
+ */
+@Internal
+public final class RuleDispatcher {
+
+    /** Fallback for files no rule claims; it always answers {@link Decision#SKIP_UNKNOWN}. */
+    private static final FileRule UNKNOWN_RULE =
+            new FileRule() {
+                @Override
+                public RuleId id() {
+                    return RuleId.UNKNOWN;
+                }
+
+                @Override
+                public Decision evaluate(
+                        FileMeta file, BucketActiveRefs activeRefs, long cutoffMillis) {
+                    return Decision.SKIP_UNKNOWN;
+                }
+            };
+
+    private final FileRule logSegmentRule;
+    private final FileRule logManifestRule;
+    private final FileRule kvSnapshotFileRule = new KvSnapshotFileRule();
+    private final FileRule kvSharedSstRule = new KvSharedSstRule();
+
+    /** Creates a dispatcher with both {@code allowDeleteManifest} and {@code orphanDirMode} off. */
+    public RuleDispatcher() {
+        this(false, false);
+    }
+
+    /**
+     * Creates a dispatcher with {@code orphanDirMode} off.
+     *
+     * @param allowDeleteManifest forwarded to the {@link LogManifestRule}
+     */
+    public RuleDispatcher(boolean allowDeleteManifest) {
+        this(allowDeleteManifest, false);
+    }
+
+    /**
+     * Creates a dispatcher.
+     *
+     * @param allowDeleteManifest forwarded to the {@link LogManifestRule}
+     * @param orphanDirMode forwarded to the {@link LogSegmentRule}
+     */
+    public RuleDispatcher(boolean allowDeleteManifest, boolean orphanDirMode) {
+        this.logSegmentRule = new LogSegmentRule(orphanDirMode);
+        this.logManifestRule = new LogManifestRule(allowDeleteManifest);
+    }
+
+    /**
+     * Returns the rule that should evaluate {@code file}.
+     *
+     * <p>The parent directory name is tested, in this order, against the remote log metadata
+     * directory name, the shared KV snapshot directory name, the KV snapshot directory prefix and
+     * the segment directory pattern. A file without a parent, or whose parent matches none of
+     * these, gets the unknown rule.
+     *
+     * @param file metadata of the candidate file
+     * @return the matching rule, never {@code null}
+     */
+    public FileRule dispatch(FileMeta file) {
+        FsPath parentDir = file.path().getParent();
+        if (parentDir == null) {
+            return UNKNOWN_RULE;
+        }
+
+        String dirName = parentDir.getName();
+        final FileRule matched;
+        if (FlussPaths.REMOTE_LOG_METADATA_DIR_NAME.equals(dirName)) {
+            matched = logManifestRule;
+        } else if (FlussPaths.REMOTE_KV_SNAPSHOT_SHARED_DIR.equals(dirName)) {
+            matched = kvSharedSstRule;
+        } else if (dirName.startsWith(FlussPaths.REMOTE_KV_SNAPSHOT_DIR_PREFIX)) {
+            matched = kvSnapshotFileRule;
+        } else if (LogSegmentRule.isSegmentDir(dirName)) {
+            matched = logSegmentRule;
+        } else {
+            matched = UNKNOWN_RULE;
+        }
+        return matched;
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleId.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleId.java
new file mode 100644
index 0000000000..a27ef07624
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/action/orphan/rule/RuleId.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.annotation.Internal;
+
+/**
+ * Enumeration of all file-level rule identifiers used in orphan cleanup audit logs.
+ *
+ * <p>Each constant carries a lower-case, hyphenated tag that {@link #toString()} returns in place
+ * of the constant name.
+ */
+@Internal
+public enum RuleId {
+ LOG_SEGMENT("log-segment"),
+ LOG_MANIFEST("log-manifest"),
+ KV_SNAPSHOT_FILE("kv-snapshot-file"),
+ KV_SHARED_SST("kv-shared-sst"),
+ UNKNOWN("unknown");
+
+ // Text returned by toString() for this identifier.
+ private final String auditTag;
+
+ RuleId(String auditTag) {
+ this.auditTag = auditTag;
+ }
+
+ /** Returns the tag of this identifier (for example {@code log-segment}). */
+ @Override
+ public String toString() {
+ return auditTag;
+ }
+}
diff --git a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
index 9c1972ce6c..21a8fb3afc 100644
--- a/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
+++ b/fluss-flink/fluss-flink-common/src/main/java/org/apache/fluss/flink/adapter/MultipleParameterToolAdapter.java
@@ -19,6 +19,9 @@
import org.apache.flink.api.java.utils.MultipleParameterTool;
+import javax.annotation.Nullable;
+
+import java.util.Collection;
import java.util.Map;
/**
@@ -43,4 +46,23 @@ public static MultipleParameterToolAdapter fromArgs(String[] args) {
public Map toMap() {
return this.multipleParameterTool.toMap();
}
+
+ /**
+ * Returns whether the given key is present in the parsed arguments.
+ *
+ * @param key the argument key, passed unchanged to the wrapped {@code MultipleParameterTool}
+ */
+ public boolean has(String key) {
+ return this.multipleParameterTool.has(key);
+ }
+
+ /**
+ * Returns the value for the given key, or {@code null} if the key is not found.
+ *
+ * @param key the argument key, passed unchanged to the wrapped {@code MultipleParameterTool}
+ */
+ @Nullable
+ public String get(String key) {
+ return this.multipleParameterTool.get(key);
+ }
+
+ /**
+  * Returns all values associated with the given key, or {@code null} if the key is not found.
+  *
+  * @param key the argument key, passed unchanged to the wrapped {@code MultipleParameterTool}
+  * @return the values given for {@code key}, or {@code null}
+  */
+ @Nullable
+ public Collection<String> getMultiParameter(String key) {
+     return this.multipleParameterTool.getMultiParameter(key);
+ }
}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanITCase.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanITCase.java
new file mode 100644
index 0000000000..6128495b2f
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/OrphanFilesCleanITCase.java
@@ -0,0 +1,1209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.client.Connection;
+import org.apache.fluss.client.ConnectionFactory;
+import org.apache.fluss.client.admin.Admin;
+import org.apache.fluss.config.ConfigOptions;
+import org.apache.fluss.config.Configuration;
+import org.apache.fluss.flink.action.orphan.config.OrphanCleanConfig;
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.metadata.DatabaseDescriptor;
+import org.apache.fluss.metadata.PartitionInfo;
+import org.apache.fluss.metadata.PartitionSpec;
+import org.apache.fluss.metadata.PhysicalTablePath;
+import org.apache.fluss.metadata.Schema;
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.metadata.TableDescriptor;
+import org.apache.fluss.metadata.TableInfo;
+import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.server.testutils.FlussClusterExtension;
+import org.apache.fluss.server.zk.ZooKeeperClient;
+import org.apache.fluss.server.zk.data.BucketSnapshot;
+import org.apache.fluss.server.zk.data.RemoteLogManifestHandle;
+import org.apache.fluss.server.zk.data.ZkData.BucketSnapshotsZNode;
+import org.apache.fluss.server.zk.data.ZkData.PartitionZNode;
+import org.apache.fluss.types.DataTypes;
+import org.apache.fluss.utils.FlussPaths;
+
+import org.apache.flink.test.util.AbstractTestBase;
+import org.apache.logging.log4j.Level;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.core.LogEvent;
+import org.apache.logging.log4j.core.LoggerContext;
+import org.apache.logging.log4j.core.appender.AbstractAppender;
+import org.apache.logging.log4j.core.config.LoggerConfig;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.extension.RegisterExtension;
+
+import java.net.URI;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.attribute.FileTime;
+import java.time.Duration;
+import java.time.OffsetDateTime;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.UUID;
+import java.util.concurrent.CopyOnWriteArrayList;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** End-to-end tests for orphan files cleanup safety scenarios. */
+abstract class OrphanFilesCleanITCase extends AbstractTestBase {
+
+ // Test cluster shared by every test of this class: one tablet server, configured by
+ // buildClusterConf().
+ @RegisterExtension
+ static final FlussClusterExtension FLUSS_CLUSTER_EXTENSION =
+ FlussClusterExtension.builder()
+ .setClusterConf(buildClusterConf())
+ .setNumOfTabletServers(1)
+ .build();
+
+ /** Builds the cluster configuration; it sets {@code KV_MAX_RETAINED_SNAPSHOTS} to 2. */
+ private static Configuration buildClusterConf() {
+ Configuration clusterConf = new Configuration();
+ clusterConf.set(ConfigOptions.KV_MAX_RETAINED_SNAPSHOTS, 2);
+ return clusterConf;
+ }
+
+ // Cluster client state shared by all tests; initialized in beforeAll().
+ private static Connection connection;
+ private static Admin admin;
+ private static String bootstrapServers;
+
+ // Audit-log capture state; set by attachAuditAppender() and read by detachAuditAppender().
+ private CapturingAppender auditAppender;
+ private LoggerConfig auditLoggerConfig;
+ private Level previousAuditLevel;
+
+ /**
+ * Opens a client connection to the test cluster's bootstrap servers and obtains the {@link
+ * Admin} used by the tests.
+ */
+ @BeforeAll
+ static void beforeAll() {
+ bootstrapServers = FLUSS_CLUSTER_EXTENSION.getBootstrapServers();
+ Configuration clientConfig = new Configuration();
+ clientConfig.setString(ConfigOptions.BOOTSTRAP_SERVERS.key(), bootstrapServers);
+ connection = ConnectionFactory.createConnection(clientConfig);
+ admin = connection.getAdmin();
+ }
+
+ /**
+ * Closes the admin client first and the connection second, clearing each reference afterwards.
+ * References that are already {@code null} are skipped.
+ */
+ @AfterAll
+ static void afterAll() throws Exception {
+ if (admin != null) {
+ admin.close();
+ admin = null;
+ }
+ if (connection != null) {
+ connection.close();
+ connection = null;
+ }
+ }
+
+ /** Starts capturing audit-log events before each test. */
+ @BeforeEach
+ void setUp() {
+ attachAuditAppender();
+ }
+
+ /** Stops capturing audit-log events after each test. */
+ @AfterEach
+ void tearDown() {
+ detachAuditAppender();
+ }
+
+ /**
+ * Returns the cluster's remote data directory as a {@link Path}, built from the URI string
+ * reported by {@code getRemoteDataDir()}.
+ */
+ // NOTE(review): Paths.get(URI) presumably needs a local file URI here — confirm.
+ private Path remoteDataRoot() {
+ return Paths.get(URI.create(FLUSS_CLUSTER_EXTENSION.getRemoteDataDir()));
+ }
+
+ /** Returns the audit-log messages captured so far by the appender of the current test. */
+ private List<String> auditMessages() {
+     return auditAppender.messages();
+ }
+
+ /**
+ * Installs a fresh {@code CapturingAppender} on the logger configuration resolved for {@code
+ * fluss.orphan.audit} and lowers that configuration's level to DEBUG. The previous level is
+ * remembered so that {@link #detachAuditAppender()} can restore it.
+ */
+ private void attachAuditAppender() {
+ LoggerContext context = (LoggerContext) LogManager.getContext(false);
+ org.apache.logging.log4j.core.config.Configuration config = context.getConfiguration();
+ auditAppender = new CapturingAppender("orphan-clean-it-audit");
+ auditAppender.start();
+ // NOTE(review): getLoggerConfig presumably falls back to a parent or the root configuration
+ // when no logger named "fluss.orphan.audit" is configured; the level change and the
+ // appender would then apply to that broader configuration — confirm against the test
+ // logging setup.
+ auditLoggerConfig = config.getLoggerConfig("fluss.orphan.audit");
+ previousAuditLevel = auditLoggerConfig.getLevel();
+ auditLoggerConfig.setLevel(Level.DEBUG);
+ auditLoggerConfig.addAppender(auditAppender, Level.DEBUG, null);
+ // Propagate the modified configuration to the loggers.
+ context.updateLoggers();
+ }
+
+ /**
+ * Undoes {@link #attachAuditAppender()}: removes the capturing appender, restores the
+ * remembered level, refreshes the loggers and stops the appender. Does nothing when the
+ * appender or the logger configuration was never set.
+ */
+ private void detachAuditAppender() {
+ if (auditLoggerConfig != null && auditAppender != null) {
+ auditLoggerConfig.removeAppender(auditAppender.getName());
+ auditLoggerConfig.setLevel(previousAuditLevel);
+ ((LoggerContext) LogManager.getContext(false)).updateLoggers();
+ auditAppender.stop();
+ }
+ }
+
+ // NOTE(review): presumably the age by which the makeOld(...) helper back-dates files so they
+ // fall before the cleanup cutoff — confirm against the helper, which is not visible here.
+ private static final Duration OLD_ENOUGH = Duration.ofDays(2);
+
+ /**
+ * One bucket holds two segments listed in the registered manifest and two segments that are
+ * not. After a cleanup run the listed segments must still exist, the unlisted ones must be
+ * gone, and the audit log must report {@code action=deleted} for the unlisted ones only.
+ */
+ @Test
+ void mixedOrphanAndActiveFilesInSameBucket() throws Exception {
+ String dbName = newDatabaseName("mixed");
+ TablePath tablePath = createLogTable(dbName, "mixed_bucket");
+ TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+ TableBucket tableBucket = new TableBucket(tableInfo.getTableId(), 0);
+ FsPath remoteLogTabletDir =
+ FlussPaths.remoteLogTabletDir(
+ new FsPath(remoteDataRoot().resolve("log").toUri().toString()),
+ PhysicalTablePath.of(tablePath),
+ tableBucket);
+
+ // Two active segments registered in manifest
+ String activeId1 = UUID.randomUUID().toString();
+ String activeId2 = UUID.randomUUID().toString();
+ FsPath manifestPath =
+ new FsPath(
+ localPath(remoteLogTabletDir)
+ .resolve("metadata/p0.manifest")
+ .toUri()
+ .toString());
+ Path manifest = localPath(manifestPath);
+ Files.createDirectories(manifest.getParent());
+ // Hand-written manifest JSON that lists exactly the two active segment ids.
+ String manifestContent =
+ "{\"version\":1,"
+ + "\"database\":\"db\","
+ + "\"table\":\"t\","
+ + "\"table_id\":0,"
+ + "\"bucket_id\":0,"
+ + "\"remote_log_segments\":["
+ + "{\"segment_id\":\""
+ + activeId1
+ + "\",\"start_offset\":0,\"end_offset\":99,"
+ + "\"max_timestamp\":0,\"size_in_bytes\":1},"
+ + "{\"segment_id\":\""
+ + activeId2
+ + "\",\"start_offset\":100,\"end_offset\":199,"
+ + "\"max_timestamp\":0,\"size_in_bytes\":1}"
+ + "]}";
+ Files.write(manifest, manifestContent.getBytes(StandardCharsets.UTF_8));
+ makeOld(manifest);
+ upsertManifest(tableBucket, manifestPath, 199L);
+
+ Path activeFile1 = writeSegmentFile(remoteLogTabletDir, activeId1, 0L);
+ Path activeFile2 = writeSegmentFile(remoteLogTabletDir, activeId2, 100L);
+
+ // Two orphan segments NOT in manifest
+ String orphanId1 = UUID.randomUUID().toString();
+ String orphanId2 = UUID.randomUUID().toString();
+ Path orphanFile1 = writeSegmentFile(remoteLogTabletDir, orphanId1, 500L);
+ Path orphanFile2 = writeSegmentFile(remoteLogTabletDir, orphanId2, 600L);
+
+ runCleanerForDatabase(false, dbName);
+
+ // Active files must survive
+ assertThat(Files.exists(activeFile1)).as("active segment 1 must survive cleanup").isTrue();
+ assertThat(Files.exists(activeFile2)).as("active segment 2 must survive cleanup").isTrue();
+
+ // Orphan files must be deleted
+ assertThat(Files.exists(orphanFile1)).as("orphan segment 1 must be deleted").isFalse();
+ assertThat(Files.exists(orphanFile2)).as("orphan segment 2 must be deleted").isFalse();
+
+ // Audit confirms deletions for both orphans
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(orphanFile1.toString()));
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(orphanFile2.toString()));
+
+ // No deletion audit for active files
+ assertThat(auditMessages())
+ .noneMatch(m -> m.contains("action=deleted") && m.contains(activeFile1.toString()));
+ assertThat(auditMessages())
+ .noneMatch(m -> m.contains("action=deleted") && m.contains(activeFile2.toString()));
+ }
+
+ /**
+ * With the cleaner's first argument set to {@code true}, neither the orphan nor the active
+ * segment is removed. The audit log must report {@code action=would_delete} for the orphan,
+ * no {@code action=deleted} at all, and no {@code would_delete} for the active segment.
+ */
+ @Test
+ void dryRunDoesNotDeleteFiles() throws Exception {
+ String dbName = newDatabaseName("dryrun");
+ TablePath tablePath = createLogTable(dbName, "dry_run");
+ Path activeSegment = seedActiveBucketManifest(tablePath);
+ Path orphan = createOldSegmentFile(tablePath, "99999999999999999999.log");
+
+ runCleanerForDatabase(true, dbName);
+
+ assertThat(Files.exists(orphan)).isTrue();
+ assertThat(Files.exists(activeSegment)).isTrue();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=would_delete")
+ && m.contains("rule=log-segment")
+ && m.contains(orphan.toString()));
+ assertThat(auditMessages()).noneMatch(m -> m.contains("action=deleted"));
+ // Catch a regression that targets the active segment with a would_delete intent: the
+ // file-existence checks above would silently pass under dry-run even if the planner
+ // mis-marked the active segment, because dry-run never touches disk.
+ assertThat(auditMessages())
+ .noneMatch(
+ m ->
+ m.contains("action=would_delete")
+ && m.contains(activeSegment.toString()));
+ }
+
+ /**
+ * Seeds a remote log manifest + matching active segment under a freshly-allocated UUID so the
+ * active-file cleanup reaches {@code ManifestReadStatus.RESOLVED} for bucket 0 of the given log
+ * table. Returns the active segment's {@code .log} path so callers can assert it survives
+ * cleanup.
+ *
+ * <p>Without a manifest the bucket falls back to {@code ManifestReadStatus.NOT_LISTED} and the
+ * active-file cleanup skips the entire bucket (see §4.3.1 of the design doc) — which would
+ * prevent any orphan file under the bucket from being visited at all.
+ *
+ * @param tablePath path of an existing log table
+ * @return the path returned by {@code seedManifestAndSegment} for the seeded active segment
+ */
+ private Path seedActiveBucketManifest(TablePath tablePath) throws Exception {
+ TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+ TableBucket tableBucket = new TableBucket(tableInfo.getTableId(), 0);
+ FsPath remoteLogTabletDir =
+ FlussPaths.remoteLogTabletDir(
+ new FsPath(remoteDataRoot().resolve("log").toUri().toString()),
+ PhysicalTablePath.of(tablePath),
+ tableBucket);
+ FsPath manifestPath =
+ new FsPath(
+ localPath(remoteLogTabletDir)
+ .resolve("metadata/p0.manifest")
+ .toUri()
+ .toString());
+ String activeSegmentId = UUID.randomUUID().toString();
+ Path activeSegment =
+ seedManifestAndSegment(remoteLogTabletDir, manifestPath, activeSegmentId, 0L, 0L);
+ // Register the manifest for the bucket so that the cleaner can find it.
+ upsertManifest(tableBucket, manifestPath, 0L);
+ return activeSegment;
+ }
+
+ /**
+ * Without any opt-in flag, a directory laid out for a dropped table id is left untouched: the
+ * directory and the file inside it still exist, and the audit log reports {@code
+ * action=skip_orphan_table} with {@code default-conservative} for that directory.
+ */
+ @Test
+ void defaultDoesNotEnterOrphanTableDir() throws Exception {
+ String dbName = newDatabaseName("defaultskip");
+ long tableId = allocateDroppedTableId(dbName, "seed_table");
+ createLogTable(dbName, "live_anchor");
+ OrphanTableLayout layout =
+ createOldOrphanTableLayout(
+ remoteDataRoot(),
+ dbName,
+ tableId,
+ "ghost_table",
+ "99999999999999999999.log");
+
+ runCleanerForAllDatabases(false);
+
+ assertThat(Files.exists(layout.orphanFile)).isTrue();
+ assertThat(Files.exists(layout.tableDir)).isTrue();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=skip_orphan_table")
+ && m.contains("default-conservative")
+ && m.contains(layout.tableDir.toString()));
+ }
+
+ /**
+ * With {@code --allow-clean-orphan-tables}, the same orphan table layout is cleaned: both the
+ * file and the table directory are gone, and the audit log reports {@code action=deleted}
+ * under {@code rule=log-segment} for the file.
+ */
+ @Test
+ void optInCleansOrphanTableDirWhenEnabled() throws Exception {
+ String dbName = newDatabaseName("optin");
+ long tableId = allocateDroppedTableId(dbName, "seed_table");
+ createLogTable(dbName, "live_anchor");
+ OrphanTableLayout layout =
+ createOldOrphanTableLayout(
+ remoteDataRoot(),
+ dbName,
+ tableId,
+ "ghost_table",
+ "99999999999999999999.log");
+
+ runCleanerForAllDatabases(false, "--allow-clean-orphan-tables");
+
+ assertThat(Files.exists(layout.orphanFile)).isFalse();
+ assertThat(Files.exists(layout.tableDir)).isFalse();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(layout.orphanFile.toString()));
+ }
+
+ /**
+ * Even with {@code --allow-clean-orphan-tables}, the {@code .sst} file planted in the KV
+ * layout of a dropped primary-key table must survive together with its table directory, and
+ * the audit log must contain no {@code rule=kv-shared-sst} event for that file.
+ */
+ @Test
+ void pkOrphanTableRetainsSharedSstEvenWithOptIn() throws Exception {
+ String dbName = newDatabaseName("orphankv");
+ long tableId = allocateDroppedPrimaryKeyTableId(dbName, "seed_pk_table");
+ createLogTable(dbName, "live_anchor");
+ OrphanTableLayout layout =
+ createOldOrphanKvTableLayout(
+ remoteDataRoot(),
+ dbName,
+ tableId,
+ "ghost_pk_table",
+ "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-orphan.sst");
+
+ runCleanerForDatabase(false, dbName, "--allow-clean-orphan-tables");
+
+ assertThat(Files.exists(layout.orphanFile)).isTrue();
+ assertThat(Files.exists(layout.tableDir)).isTrue();
+ assertThat(auditMessages())
+ .noneMatch(
+ m ->
+ m.contains("rule=kv-shared-sst")
+ && m.contains(layout.orphanFile.toString()));
+ }
+
+ /**
+ * A manifest file created by {@code createOldLogManifestFile} survives a cleanup run without
+ * extra flags, and the audit log contains no {@code rule=log-manifest} event for it.
+ */
+ @Test
+ void manifestPreservedByDefault() throws Exception {
+ String dbName = newDatabaseName("manifest");
+ TablePath tablePath = createLogTable(dbName, "manifest_default");
+ Path orphanManifest = createOldLogManifestFile(tablePath, "orphan.manifest");
+
+ runCleanerForDatabase(false, dbName);
+
+ assertThat(Files.exists(orphanManifest)).isTrue();
+ assertThat(auditMessages())
+ .noneMatch(
+ m ->
+ m.contains("rule=log-manifest")
+ && m.contains(orphanManifest.toString()));
+ }
+
+ /**
+ * Seeds KV snapshots 1 to 4 through {@code seedKvSnapshots} and adds an extra snapshot
+ * directory (id 99) on disk only. After cleanup the four seeded snapshot directories must
+ * still exist and the extra one must be gone.
+ */
+ @Test
+ void retainedNonLatestSnapshotPreserved() throws Exception {
+ String dbName = newDatabaseName("retained");
+ TablePath tablePath = createPrimaryKeyTable(dbName, "retained_pk");
+ TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+ TableBucket tableBucket = new TableBucket(tableInfo.getTableId(), 0);
+ FsPath remoteKvTabletDir =
+ FlussPaths.remoteKvTabletDir(
+ new FsPath(remoteDataRoot().resolve("kv").toUri().toString()),
+ PhysicalTablePath.of(tablePath),
+ tableBucket);
+
+ seedKvSnapshots(tableBucket, remoteKvTabletDir, new long[] {1L, 2L, 3L, 4L});
+
+ // Drop a snapshot directory locally without registering it in ZK to model a
+ // crash-leftover. The active set is derived from ZK references, so this
+ // unreferenced snapshot must still be cleaned — guarding the assertions below
+ // from passing trivially when the cleaner fails to scan at all.
+ long unreferencedSnapshotId = 99L;
+ Path unreferencedSnapshotDir =
+ localPath(
+ FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, unreferencedSnapshotId));
+ Files.createDirectories(unreferencedSnapshotDir);
+ Path unreferencedMeta = unreferencedSnapshotDir.resolve("_METADATA");
+ Files.write(unreferencedMeta, new byte[] {0x33});
+ makeOld(unreferencedMeta);
+ makeOld(unreferencedSnapshotDir);
+
+ runCleanerForDatabase(false, dbName);
+
+ // Every snapshot still referenced in ZK is preserved, regardless of recency.
+ assertThat(Files.exists(localPath(FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, 1L))))
+ .isTrue();
+ assertThat(Files.exists(localPath(FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, 2L))))
+ .isTrue();
+ assertThat(Files.exists(localPath(FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, 3L))))
+ .isTrue();
+ assertThat(Files.exists(localPath(FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, 4L))))
+ .isTrue();
+ // The directory that exists on disk only must have been removed.
+ assertThat(Files.exists(unreferencedSnapshotDir)).isFalse();
+ }
+
+ /**
+ * Corrupts the ZooKeeper data of one partition of table A and runs the cleaner with {@code
+ * --allow-clean-orphan-partitions}. The orphan partition layout under table A must be left
+ * alone while the one under table B is removed, and the audit log must report {@code
+ * action=skip_partition_list} for table A.
+ */
+ @Test
+ void listPartitionInfosFailureScopesToSingleTable() throws Exception {
+ String dbName = newDatabaseName("partfail");
+ PartitionedTableLayout tableA = createPartitionedLogTable(dbName, "table_a", "pa");
+ PartitionedTableLayout tableB = createPartitionedLogTable(dbName, "table_b", "pb");
+
+ // Table A's orphan gets the larger of the two live partition ids, table B's the smaller.
+ long orphanPartitionIdForA =
+ Math.max(
+ tableA.partitionInfo.getPartitionId(),
+ tableB.partitionInfo.getPartitionId());
+ long orphanPartitionIdForB =
+ Math.min(
+ tableA.partitionInfo.getPartitionId(),
+ tableB.partitionInfo.getPartitionId());
+
+ OrphanPartitionLayout orphanA =
+ createOldOrphanPartitionLayout(
+ remoteDataRoot(),
+ tableA.tablePath,
+ tableA.tableId,
+ "ghost-a",
+ orphanPartitionIdForA,
+ "99999999999999999999.log");
+ OrphanPartitionLayout orphanB =
+ createOldOrphanPartitionLayout(
+ remoteDataRoot(),
+ tableB.tablePath,
+ tableB.tableId,
+ "ghost-b",
+ orphanPartitionIdForB,
+ "99999999999999999999.log");
+
+ ZooKeeperClient zk = FLUSS_CLUSTER_EXTENSION.getZooKeeperClient();
+ String brokenPartitionPath =
+ PartitionZNode.path(tableA.tablePath, tableA.partitionInfo.getPartitionName());
+ byte[] originalPartitionBytes =
+ zk.getCuratorClient().getData().forPath(brokenPartitionPath);
+ // Overwrite the partition znode with bytes that are not JSON; the original bytes are
+ // written back in the finally block even if the cleaner throws.
+ zk.getCuratorClient()
+ .setData()
+ .forPath(brokenPartitionPath, "not-json".getBytes(StandardCharsets.UTF_8));
+ try {
+ runCleanerForDatabase(false, dbName, "--allow-clean-orphan-partitions");
+ } finally {
+ zk.getCuratorClient().setData().forPath(brokenPartitionPath, originalPartitionBytes);
+ }
+
+ assertThat(Files.exists(orphanA.partitionDir)).isTrue();
+ assertThat(Files.exists(orphanA.orphanFile)).isTrue();
+ assertThat(Files.exists(orphanB.partitionDir)).isFalse();
+ assertThat(Files.exists(orphanB.orphanFile)).isFalse();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=skip_partition_list")
+ && m.contains("table=" + tableA.tablePath.getTableName()));
+ }
+
+ /**
+ * Runs the cleaner twice. The first run happens while manifest {@code p0} is registered and
+ * must keep the segment seeded with it. After manifest {@code p1} and a second segment are
+ * seeded and registered, the second run must delete the first segment, keep the second one
+ * and log {@code action=deleted} for the first.
+ */
+ @Test
+ void multipleRoundsConvergeAfterManifestUpsert() throws Exception {
+ String dbName = newDatabaseName("converge");
+ TablePath tablePath = createLogTable(dbName, "converge_log");
+ TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+ TableBucket tableBucket = new TableBucket(tableInfo.getTableId(), 0);
+ FsPath remoteLogTabletDir =
+ FlussPaths.remoteLogTabletDir(
+ new FsPath(remoteDataRoot().resolve("log").toUri().toString()),
+ PhysicalTablePath.of(tablePath),
+ tableBucket);
+
+ String segmentId = UUID.randomUUID().toString();
+ FsPath manifest0 =
+ new FsPath(
+ localPath(remoteLogTabletDir)
+ .resolve("metadata/p0.manifest")
+ .toUri()
+ .toString());
+ Path oldSegment = seedManifestAndSegment(remoteLogTabletDir, manifest0, segmentId, 0L, 0L);
+ upsertManifest(tableBucket, manifest0, 0L);
+
+ // Round 1: the segment is seeded together with the registered manifest and must stay.
+ runCleanerForDatabase(false, dbName);
+
+ assertThat(Files.exists(oldSegment)).isTrue();
+
+ FsPath manifest1 =
+ new FsPath(
+ localPath(remoteLogTabletDir)
+ .resolve("metadata/p1.manifest")
+ .toUri()
+ .toString());
+ // The second manifest reuses the same segment id with the offset arguments set to 100.
+ Path newSegment =
+ seedManifestAndSegment(remoteLogTabletDir, manifest1, segmentId, 100L, 100L);
+ upsertManifest(tableBucket, manifest1, 100L);
+
+ // Round 2: runs after manifest1 has been registered for the bucket.
+ runCleanerForDatabase(false, dbName);
+
+ assertThat(Files.exists(oldSegment)).isFalse();
+ assertThat(Files.exists(newSegment)).isTrue();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(oldSegment.toString()));
+ }
+
+ /**
+ * When the cleaner is restricted to one table via {@code --table}, an orphan table layout in
+ * the same database must be preserved even though {@code --allow-clean-orphan-tables} is
+ * passed. The audit log must report {@code action=skip_orphan_table_scan} with {@code
+ * reason=tableInfos-incomplete} for the database, and no {@code action=skip_db} event.
+ */
+ @Test
+ void singleTableModeSkipsOrphanTableScan() throws Exception {
+ String dbName = newDatabaseName("singletable");
+ long orphanTableId = allocateDroppedTableId(dbName, "orphan_seed");
+ TablePath liveTable = createLogTable(dbName, "live_target");
+ OrphanTableLayout layout =
+ createOldOrphanTableLayout(
+ remoteDataRoot(),
+ dbName,
+ orphanTableId,
+ "ghost_table",
+ "99999999999999999999.log");
+
+ runCleanerForDatabase(
+ false, dbName, "--table", liveTable.getTableName(), "--allow-clean-orphan-tables");
+
+ // The orphan-table scan must skip because tableInfosComplete=false in --table
+ // single-table mode.
+ // Sibling orphan must be preserved even with --allow-clean-orphan-tables set.
+ assertThat(Files.exists(layout.orphanFile)).isTrue();
+ assertThat(Files.exists(layout.tableDir)).isTrue();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=skip_orphan_table_scan")
+ && m.contains("reason=tableInfos-incomplete")
+ && m.contains("db=" + dbName));
+ // Must use the dedicated event, not the older skip_db.
+ assertThat(auditMessages())
+ .noneMatch(m -> m.contains("action=skip_db") && m.contains("db=" + dbName));
+ }
+
+ /**
+ * Verifies that a failure while resolving the KV active set of a table does not stop log
+ * cleanup for that same table.
+ *
+ * Phase 1 runs the cleaner with no fault injected and expects both the orphan KV snapshot
+ * files (snap-99) and the orphan log segment to be deleted. Phase 2 re-plants orphans, injects
+ * a non-numeric child znode under the bucket's snapshots znode, and expects a
+ * {@code skip_kv_target} audit event, the KV orphan files (snap-77) to be preserved, and the
+ * re-planted orphan log segment to be deleted again.
+ */
+ @Test
+ void kvUnitFailureDoesNotBlockLogCleanup() throws Exception {
+ String dbName = newDatabaseName("crossflow");
+ TablePath tablePath = createPrimaryKeyTable(dbName, "fail_kv_keep_log");
+ TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+ TableBucket tableBucket = new TableBucket(tableInfo.getTableId(), 0);
+
+ // Seed a valid KV snapshot in ZK so listBucketSnapshots returns a child to decode.
+ FsPath remoteKvTabletDir =
+ FlussPaths.remoteKvTabletDir(
+ new FsPath(remoteDataRoot().resolve("kv").toUri().toString()),
+ PhysicalTablePath.of(tablePath),
+ tableBucket);
+ long activeSnapshotId = 1L;
+ seedKvSnapshots(tableBucket, remoteKvTabletDir, new long[] {activeSnapshotId});
+
+ // Seed a log manifest + active segment so the log bucket reaches RESOLVED in the
+ // active-file cleanup.
+ Path activeLogSegment = seedActiveBucketManifest(tablePath);
+
+ // -----------------------------------------------------------------
+ // Phase 1 — baseline (no fault injection)
+ // Plant an orphan KV snapshot dir under snap-99 (NOT registered in ZK) plus an
+ // orphan log segment. With the cluster wired normally, cleanup MUST delete them:
+ // this establishes the negative control that proves the phase-2 preservation
+ // claim is meaningful and not just an accidental no-op.
+ // -----------------------------------------------------------------
+ long baselineOrphanSnapshotId = 99L;
+ FsPath baselineOrphanKvDir =
+ FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, baselineOrphanSnapshotId);
+ Path baselineOrphanKvMetadata = localPath(baselineOrphanKvDir).resolve("_METADATA");
+ Path baselineOrphanKvSst =
+ localPath(baselineOrphanKvDir).resolve(baselineOrphanSnapshotId + ".sst");
+ Files.createDirectories(localPath(baselineOrphanKvDir));
+ Files.write(baselineOrphanKvMetadata, new byte[] {0x55});
+ Files.write(baselineOrphanKvSst, new byte[] {0x66});
+ makeOld(baselineOrphanKvMetadata);
+ makeOld(baselineOrphanKvSst);
+
+ Path baselineOrphanLogSegment = createOldSegmentFile(tablePath, "99999999999999999999.log");
+
+ runCleanerForDatabase(false, dbName);
+
+ // Baseline: snap-99 files were DELETED, proving normal cleanup would have killed
+ // them. Path-specific assertions guarantee these audit events refer to phase 1.
+ assertThat(Files.exists(baselineOrphanKvMetadata))
+ .as(
+ "phase 1 baseline: snap-99/_METADATA must be DELETED "
+ + "(cleanup would normally remove orphan KV files)")
+ .isFalse();
+ assertThat(Files.exists(baselineOrphanKvSst))
+ .as("phase 1 baseline: snap-99/.sst must be DELETED")
+ .isFalse();
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=kv-snapshot-file")
+ && m.contains(baselineOrphanKvMetadata.toString()));
+ assertThat(auditMessages())
+ .anyMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=kv-snapshot-file")
+ && m.contains(baselineOrphanKvSst.toString()));
+ // Baseline: orphan log segment was DELETED and the active segment survived. Phase 1's
+ // log deletion is asserted both via Files.exists and via the audit stream so the final
+ // phase-2 assertion can require TWO deletion events on the same path (one per phase).
+ assertThat(Files.exists(baselineOrphanLogSegment))
+ .as("phase 1 baseline: orphan log segment must be DELETED")
+ .isFalse();
+ assertThat(Files.exists(activeLogSegment))
+ .as("phase 1: active log segment must survive cleanup")
+ .isTrue();
+ assertThat(auditMessages())
+ .filteredOn(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(baselineOrphanLogSegment.toString()))
+ .as("phase 1 baseline: orphan log segment deletion must appear in audit stream")
+ .hasSizeGreaterThanOrEqualTo(1);
+
+ // -----------------------------------------------------------------
+ // Phase 2 — fault injection
+ // Re-plant orphan KV files under a DIFFERENT snap-77 dir so path-specific audit
+ // assertions are unambiguous (phase-1 audits target snap-99, phase-2 audits
+ // target snap-77). Re-plant the orphan log segment at its original path (phase 1
+ // deleted it) so we can verify log cleanup still proceeds when the KV unit fails.
+ // -----------------------------------------------------------------
+ long faultInjectionOrphanSnapshotId = 77L;
+ FsPath faultInjectionOrphanKvDir =
+ FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, faultInjectionOrphanSnapshotId);
+ Path faultInjectionOrphanKvMetadata =
+ localPath(faultInjectionOrphanKvDir).resolve("_METADATA");
+ Path faultInjectionOrphanKvSst =
+ localPath(faultInjectionOrphanKvDir)
+ .resolve(faultInjectionOrphanSnapshotId + ".sst");
+ Files.createDirectories(localPath(faultInjectionOrphanKvDir));
+ Files.write(faultInjectionOrphanKvMetadata, new byte[] {0x55});
+ Files.write(faultInjectionOrphanKvSst, new byte[] {0x66});
+ makeOld(faultInjectionOrphanKvMetadata);
+ makeOld(faultInjectionOrphanKvSst);
+
+ // Re-planted at the SAME path as baselineOrphanLogSegment (createOldSegmentFile uses a
+ // fixed UUID + filename), so the audit stream will contain TWO delete events targeting
+ // this path -- one from each phase. The final
+ // filteredOn(...).hasSizeGreaterThanOrEqualTo(2) assertion below verifies both.
+ Path faultInjectionOrphanLogSegment =
+ createOldSegmentFile(tablePath, "99999999999999999999.log");
+
+ // Inject a non-numeric child znode under BucketSnapshotsZNode so server-side
+ // listBucketSnapshotIds throws NumberFormatException on Long.parseLong. Client-side
+ // fetchKvActiveSnapDirs propagates the exception and cleanActiveTableFiles catches it
+ // to emit skip_kv_target.
+ ZooKeeperClient zk = FLUSS_CLUSTER_EXTENSION.getZooKeeperClient();
+ String invalidChildPath = BucketSnapshotsZNode.path(tableBucket) + "/not-a-long";
+ zk.getCuratorClient().create().forPath(invalidChildPath, new byte[0]);
+ try {
+ runCleanerForDatabase(false, dbName);
+ } finally {
+ // Always remove the injected znode so the fault does not outlive this test.
+ zk.getCuratorClient().delete().forPath(invalidChildPath);
+ }
+
+ // KV target was skipped: skip_kv_target audit fires AND snap-77 orphan files preserved.
+ assertThat(auditMessages())
+ .as("phase 2: skip_kv_target audit must fire when LIST_KV_SNAPSHOTS RPC fails")
+ .anyMatch(
+ m ->
+ m.contains("action=skip_kv_target")
+ && m.contains("table_id=" + tableInfo.getTableId()));
+ assertThat(Files.exists(faultInjectionOrphanKvMetadata))
+ .as(
+ "phase 2: snap-77/_METADATA must be PRESERVED "
+ + "(KV target failure must short-circuit cleanup)")
+ .isTrue();
+ assertThat(Files.exists(faultInjectionOrphanKvSst))
+ .as("phase 2: snap-77/.sst must be PRESERVED")
+ .isTrue();
+ // Defensive: nothing in the audit stream ever marked snap-77 files for deletion.
+ assertThat(auditMessages())
+ .noneMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=kv-snapshot-file")
+ && m.contains(faultInjectionOrphanKvMetadata.toString()));
+ assertThat(auditMessages())
+ .noneMatch(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=kv-snapshot-file")
+ && m.contains(faultInjectionOrphanKvSst.toString()));
+
+ // Log cleanup proceeded independently: orphan log segment DELETED, active preserved.
+ // The re-planted segment lives at the same path as baselineOrphanLogSegment, so the audit
+ // stream must contain >=2 deletion events for this path: one from phase 1, one from
+ // phase 2. anyMatch alone could be satisfied by phase 1's event in isolation, which is
+ // why we count instead.
+ assertThat(Files.exists(faultInjectionOrphanLogSegment))
+ .as("phase 2: orphan log segment must be re-deleted (log cleanup is independent)")
+ .isFalse();
+ assertThat(Files.exists(activeLogSegment))
+ .as("phase 2: active log segment must still survive cleanup")
+ .isTrue();
+ assertThat(auditMessages())
+ .filteredOn(
+ m ->
+ m.contains("action=deleted")
+ && m.contains("rule=log-segment")
+ && m.contains(faultInjectionOrphanLogSegment.toString()))
+ .as(
+ "orphan log segment must be deleted in both phase 1 (baseline) and "
+ + "phase 2 (with KV fault) -- two events on the same path")
+ .hasSizeGreaterThanOrEqualTo(2);
+ }
+
+    /**
+     * With {@code --allow-clean-orphan-partitions} set, an old orphan partition directory planted
+     * under a live partitioned table is removed together with the segment file inside it, and
+     * the deletion is reported in the audit stream.
+     */
+    @Test
+    void optInCleansOrphanPartitionDir() throws Exception {
+        String database = newDatabaseName("orphanpart");
+        // Two partitioned tables, one partition each. The orphan reuses the smaller of the two
+        // partition IDs but is planted under the table that owns the larger one; this is meant
+        // to satisfy the cleaner's guard: orphanId <= maxKnownPartitionId.
+        PartitionedTableLayout first = createPartitionedLogTable(database, "table_a", "pa");
+        PartitionedTableLayout second = createPartitionedLogTable(database, "table_b", "pb");
+        long firstPartitionId = first.partitionInfo.getPartitionId();
+        long secondPartitionId = second.partitionInfo.getPartitionId();
+
+        long ghostPartitionId;
+        PartitionedTableLayout hostTable;
+        if (firstPartitionId <= secondPartitionId) {
+            // table_a owns the lower ID, so the orphan goes under table_b.
+            ghostPartitionId = firstPartitionId;
+            hostTable = second;
+        } else {
+            ghostPartitionId = secondPartitionId;
+            hostTable = first;
+        }
+
+        OrphanPartitionLayout ghost =
+                createOldOrphanPartitionLayout(
+                        remoteDataRoot(),
+                        hostTable.tablePath,
+                        hostTable.tableId,
+                        "ghost",
+                        ghostPartitionId,
+                        "99999999999999999999.log");
+
+        runCleanerForDatabase(false, database, "--allow-clean-orphan-partitions");
+
+        assertThat(Files.exists(ghost.orphanFile))
+                .as("orphan partition file must be deleted")
+                .isFalse();
+        assertThat(Files.exists(ghost.partitionDir))
+                .as("orphan partition dir must be removed")
+                .isFalse();
+
+        String deletedPath = ghost.orphanFile.toString();
+        assertThat(auditMessages())
+                .anyMatch(
+                        msg ->
+                                msg.contains("action=deleted")
+                                        && msg.contains("rule=log-segment")
+                                        && msg.contains(deletedPath));
+    }
+
+    /**
+     * After the only file in a segment directory is deleted as an orphan, the now-empty directory
+     * is removed as well, while the active segment stays in place.
+     */
+    @Test
+    void emptyDirsSweptAfterOrphanFileDeletion() throws Exception {
+        String database = newDatabaseName("emptydir");
+        TablePath table = createLogTable(database, "emptydir_table");
+        Path liveSegment = seedActiveBucketManifest(table);
+
+        // The orphan file is the sole content of its UUID directory.
+        Path orphanFile = createOldSegmentFile(table, "99999999999999999999.log");
+        Path orphanDir = orphanFile.getParent();
+
+        // Sanity check before running the cleaner: the directory is really there.
+        assertThat(Files.exists(orphanDir)).isTrue();
+
+        runCleanerForDatabase(false, database);
+
+        assertThat(Files.exists(orphanFile)).as("orphan file must be deleted").isFalse();
+        // With its only file gone, the UUID directory must have been swept too.
+        assertThat(Files.exists(orphanDir))
+                .as("empty segment dir must be swept after cleanup")
+                .isFalse();
+        assertThat(Files.exists(liveSegment)).as("active segment must survive").isTrue();
+
+        String deletedPath = orphanFile.toString();
+        assertThat(auditMessages())
+                .anyMatch(
+                        msg ->
+                                msg.contains("action=deleted")
+                                        && msg.contains("rule=log-segment")
+                                        && msg.contains(deletedPath));
+    }
+
+    /**
+     * Issues a create-database call followed by a create-table call for a one-bucket table with
+     * columns {@code id INT} and {@code value STRING}, bucketed by {@code id}. Both calls pass
+     * {@code true} as their last argument (presumably "ignore if exists" -- confirm against the
+     * admin API).
+     *
+     * @param databaseName database to create the table in
+     * @param tableName name of the table
+     * @return the path of the table
+     */
+    private TablePath createLogTable(String databaseName, String tableName) throws Exception {
+        admin.createDatabase(databaseName, DatabaseDescriptor.EMPTY, true).get();
+        TableDescriptor logTable =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("id", DataTypes.INT())
+                                        .column("value", DataTypes.STRING())
+                                        .build())
+                        .distributedBy(1, "id")
+                        .build();
+        TablePath target = TablePath.of(databaseName, tableName);
+        admin.createTable(target, logTable, true).get();
+        return target;
+    }
+
+    /**
+     * Issues a create-database call followed by a create-table call for a one-bucket table with
+     * columns {@code id INT} and {@code value STRING}, primary key {@code id}, bucketed by
+     * {@code id}. Both calls pass {@code true} as their last argument (presumably "ignore if
+     * exists" -- confirm against the admin API).
+     *
+     * @param databaseName database to create the table in
+     * @param tableName name of the table
+     * @return the path of the table
+     */
+    private TablePath createPrimaryKeyTable(String databaseName, String tableName)
+            throws Exception {
+        admin.createDatabase(databaseName, DatabaseDescriptor.EMPTY, true).get();
+        TableDescriptor pkTable =
+                TableDescriptor.builder()
+                        .schema(
+                                Schema.newBuilder()
+                                        .column("id", DataTypes.INT())
+                                        .column("value", DataTypes.STRING())
+                                        .primaryKey("id")
+                                        .build())
+                        .distributedBy(1, "id")
+                        .build();
+        TablePath target = TablePath.of(databaseName, tableName);
+        admin.createTable(target, pkTable, true).get();
+        return target;
+    }
+
+    /**
+     * Creates a log table, reads its table id, drops the table again and returns the id.
+     *
+     * @return the id the dropped table had
+     */
+    private long allocateDroppedTableId(String databaseName, String tableName) throws Exception {
+        TablePath shortLived = createLogTable(databaseName, tableName);
+        TableInfo info = admin.getTableInfo(shortLived).get();
+        long droppedId = info.getTableId();
+        admin.dropTable(shortLived, false).get();
+        return droppedId;
+    }
+
+    /**
+     * Creates a primary-key table, reads its table id, drops the table again and returns the id.
+     *
+     * @return the id the dropped table had
+     */
+    private long allocateDroppedPrimaryKeyTableId(String databaseName, String tableName)
+            throws Exception {
+        TablePath shortLived = createPrimaryKeyTable(databaseName, tableName);
+        TableInfo info = admin.getTableInfo(shortLived).get();
+        long droppedId = info.getTableId();
+        admin.dropTable(shortLived, false).get();
+        return droppedId;
+    }
+
+    /**
+     * Writes a one-byte file named {@code fileName} into a fixed-UUID segment directory under
+     * bucket 0 of the table's remote log tablet directory, then back-dates the file and the
+     * segment directory via {@link #makeOld(Path)}.
+     *
+     * <p>The UUID is constant, so repeated calls with the same arguments yield the same path.
+     *
+     * @return the local path of the written file
+     */
+    private Path createOldSegmentFile(TablePath tablePath, String fileName) throws Exception {
+        long tableId = admin.getTableInfo(tablePath).get().getTableId();
+        FsPath remoteLogRoot =
+                new FsPath(
+                        FLUSS_CLUSTER_EXTENSION.getRemoteDataDir()
+                                + "/"
+                                + FlussPaths.REMOTE_LOG_DIR_NAME);
+        FsPath bucketDir =
+                FlussPaths.remoteLogTabletDir(
+                        remoteLogRoot, PhysicalTablePath.of(tablePath), new TableBucket(tableId, 0));
+
+        String segmentId = UUID.fromString("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa").toString();
+        Path orphanDir = localPath(bucketDir).resolve(segmentId);
+        Files.createDirectories(orphanDir);
+
+        Path orphanFile = orphanDir.resolve(fileName);
+        Files.write(orphanFile, new byte[] {0x42});
+        makeOld(orphanFile);
+        makeOld(orphanDir);
+        return orphanFile;
+    }
+
+    /**
+     * Writes a one-byte file named {@code fileName} into the {@code metadata} directory under
+     * bucket 0 of the table's remote log tablet directory and back-dates the file via
+     * {@link #makeOld(Path)}.
+     *
+     * @return the local path of the written file
+     */
+    private Path createOldLogManifestFile(TablePath tablePath, String fileName) throws Exception {
+        long tableId = admin.getTableInfo(tablePath).get().getTableId();
+        FsPath remoteLogRoot =
+                new FsPath(
+                        FLUSS_CLUSTER_EXTENSION.getRemoteDataDir()
+                                + "/"
+                                + FlussPaths.REMOTE_LOG_DIR_NAME);
+        FsPath bucketDir =
+                FlussPaths.remoteLogTabletDir(
+                        remoteLogRoot, PhysicalTablePath.of(tablePath), new TableBucket(tableId, 0));
+
+        Path manifestDir = localPath(bucketDir).resolve("metadata");
+        Files.createDirectories(manifestDir);
+
+        Path manifestFile = manifestDir.resolve(fileName);
+        Files.write(manifestFile, new byte[] {0x11});
+        makeOld(manifestFile);
+        return manifestFile;
+    }
+
+    /**
+     * Creates a one-bucket log table partitioned by the string column {@code pt}, adds a single
+     * partition with the given value, and waits until that partition is reported ready.
+     *
+     * @param databaseName database to create the table in
+     * @param tableName name of the table
+     * @param partitionValue value of the {@code pt} partition key for the partition to create
+     * @return the table path, table id and the single {@link PartitionInfo} of the new table
+     * @throws Exception if any admin call or readiness wait fails
+     */
+    private PartitionedTableLayout createPartitionedLogTable(
+            String databaseName, String tableName, String partitionValue) throws Exception {
+        admin.createDatabase(databaseName, DatabaseDescriptor.EMPTY, true).get();
+        TablePath tablePath = TablePath.of(databaseName, tableName);
+        Schema schema =
+                Schema.newBuilder()
+                        .column("id", DataTypes.INT())
+                        .column("value", DataTypes.STRING())
+                        .column("pt", DataTypes.STRING())
+                        .build();
+        TableDescriptor descriptor =
+                TableDescriptor.builder()
+                        .schema(schema)
+                        .distributedBy(1, "id")
+                        .partitionedBy("pt")
+                        .build();
+        admin.createTable(tablePath, descriptor, true).get();
+        admin.createPartition(tablePath, partitionSpec("pt", partitionValue), false).get();
+
+        // Parameterized types are required here: with a raw Map the lookup below yields Object
+        // and cannot be assigned to a numeric id.
+        Map<String, Long> partitionIds =
+                FLUSS_CLUSTER_EXTENSION.waitUntilPartitionAllReady(tablePath, 1);
+        TableInfo tableInfo = admin.getTableInfo(tablePath).get();
+        Long partitionId = partitionIds.get(partitionValue);
+        // Fail with a readable message instead of an unboxing NullPointerException.
+        assertThat(partitionId)
+                .as("partition '%s' must be present in %s", partitionValue, partitionIds)
+                .isNotNull();
+        FLUSS_CLUSTER_EXTENSION.waitUntilTablePartitionReady(tableInfo.getTableId(), partitionId);
+
+        List<PartitionInfo> partitionInfos = admin.listPartitionInfos(tablePath).get();
+        assertThat(partitionInfos).hasSize(1);
+        return new PartitionedTableLayout(tablePath, tableInfo.getTableId(), partitionInfos.get(0));
+    }
+
+    /**
+     * For every id in {@code snapshotIds}, creates the snapshot directory under
+     * {@code remoteKvTabletDir} with a one-byte {@code _METADATA} file and a one-byte
+     * {@code <id>.sst} file, back-dates all three via {@link #makeOld(Path)}, and registers the
+     * snapshot for {@code tableBucket} in ZooKeeper.
+     */
+    private void seedKvSnapshots(
+            TableBucket tableBucket, FsPath remoteKvTabletDir, long[] snapshotIds)
+            throws Exception {
+        ZooKeeperClient zkClient = FLUSS_CLUSTER_EXTENSION.getZooKeeperClient();
+        for (int i = 0; i < snapshotIds.length; i++) {
+            long id = snapshotIds[i];
+            FsPath remoteDir = FlussPaths.remoteKvSnapshotDir(remoteKvTabletDir, id);
+            Path dir = localPath(remoteDir);
+            Files.createDirectories(dir);
+
+            Path metadata = dir.resolve("_METADATA");
+            Files.write(metadata, new byte[] {0x33});
+            makeOld(metadata);
+
+            Path sst = dir.resolve(id + ".sst");
+            Files.write(sst, new byte[] {0x44});
+            makeOld(sst);
+
+            // The directory is back-dated only after its files have been written.
+            makeOld(dir);
+
+            BucketSnapshot registered = new BucketSnapshot(id, id, remoteDir + "/_METADATA");
+            zkClient.registerTableBucketSnapshot(tableBucket, registered);
+        }
+    }
+
+    /**
+     * Writes a back-dated manifest JSON file at {@code manifestPath} that lists one segment, then
+     * writes the matching back-dated one-byte {@code .log} file for that segment.
+     *
+     * @return the local path of the segment's {@code .log} file
+     */
+    private Path seedManifestAndSegment(
+            FsPath remoteLogTabletDir,
+            FsPath manifestPath,
+            String segmentId,
+            long startOffset,
+            long endOffset)
+            throws Exception {
+        Path localManifest = localPath(manifestPath);
+        Files.createDirectories(localManifest.getParent());
+        byte[] payload =
+                manifestJson(segmentId, startOffset, endOffset).getBytes(StandardCharsets.UTF_8);
+        Files.write(localManifest, payload);
+        makeOld(localManifest);
+
+        // The segment file is laid out exactly as writeSegmentFile does it.
+        return writeSegmentFile(remoteLogTabletDir, segmentId, startOffset);
+    }
+
+    /**
+     * Creates the directory {@code remoteLogTabletDir/segmentId} and writes a back-dated one-byte
+     * {@code .log} file into it, named after {@code startOffset}.
+     *
+     * @return the local path of the written {@code .log} file
+     */
+    private Path writeSegmentFile(FsPath remoteLogTabletDir, String segmentId, long startOffset)
+            throws Exception {
+        Path segmentDir = localPath(new FsPath(remoteLogTabletDir, segmentId));
+        Files.createDirectories(segmentDir);
+
+        String logFileName = FlussPaths.filenamePrefixFromOffset(startOffset) + ".log";
+        Path segmentFile = segmentDir.resolve(logFileName);
+        Files.write(segmentFile, new byte[] {0x55});
+        makeOld(segmentFile);
+        return segmentFile;
+    }
+
+    /**
+     * Upserts, through the cluster's ZooKeeper client, a remote-log manifest handle for
+     * {@code tableBucket} built from {@code manifestPath} and {@code endOffset}.
+     */
+    private void upsertManifest(TableBucket tableBucket, FsPath manifestPath, long endOffset)
+            throws Exception {
+        ZooKeeperClient zkClient = FLUSS_CLUSTER_EXTENSION.getZooKeeperClient();
+        RemoteLogManifestHandle handle = new RemoteLogManifestHandle(manifestPath, endOffset);
+        zkClient.upsertRemoteLogManifestHandle(tableBucket, handle);
+    }
+
+    /**
+     * Runs the orphan-files clean action against a single database.
+     *
+     * <p>The command line is {@code --bootstrap-server <servers> --database <databaseName>}
+     * followed by whatever {@code appendCommonArgs} adds.
+     *
+     * @param dryRun forwarded to {@code appendCommonArgs}
+     * @param databaseName value passed to {@code --database}
+     * @param extraArgs additional command-line tokens, forwarded to {@code appendCommonArgs}
+     * @throws Exception if building the config or running the action fails
+     */
+    private void runCleanerForDatabase(boolean dryRun, String databaseName, String... extraArgs)
+            throws Exception {
+        // Typed list: a raw List would make toArray(...) return Object[], not String[].
+        List<String> args = new ArrayList<>();
+        args.add("--bootstrap-server");
+        args.add(bootstrapServers);
+        args.add("--database");
+        args.add(databaseName);
+        appendCommonArgs(args, dryRun, extraArgs);
+        OrphanCleanConfig config =
+                OrphanCleanConfig.fromParams(
+                        MultipleParameterToolAdapter.fromArgs(args.toArray(new String[0])));
+        new OrphanFilesCleanAction(config).run();
+    }
+
+    /**
+     * Runs the orphan-files clean action with {@code --all-databases}.
+     *
+     * <p>The command line is {@code --bootstrap-server <servers> --all-databases} followed by
+     * whatever {@code appendCommonArgs} adds.
+     *
+     * @param dryRun forwarded to {@code appendCommonArgs}
+     * @param extraArgs additional command-line tokens, forwarded to {@code appendCommonArgs}
+     * @throws Exception if building the config or running the action fails
+     */
+    private void runCleanerForAllDatabases(boolean dryRun, String... extraArgs) throws Exception {
+        // Typed list: a raw List would make toArray(...) return Object[], not String[].
+        List<String> args = new ArrayList<>();
+        args.add("--bootstrap-server");
+        args.add(bootstrapServers);
+        args.add("--all-databases");
+        appendCommonArgs(args, dryRun, extraArgs);
+        OrphanCleanConfig config =
+                OrphanCleanConfig.fromParams(
+                        MultipleParameterToolAdapter.fromArgs(args.toArray(new String[0])));
+        new OrphanFilesCleanAction(config).run();
+    }
+
+    /** Formatter used to render the {@code --older-than} cutoff passed to the cleaner. */
+    private static final DateTimeFormatter CUTOFF_FORMATTER =
+            DateTimeFormatter.ISO_OFFSET_DATE_TIME;
+
+    /**
+     * Appends the arguments shared by every cleaner invocation, in this order: {@code
+     * --older-than <cutoff>}, then {@code extraArgs} verbatim, then {@code --dry-run} if
+     * requested.
+     *
+     * @param args mutable argument list to append to
+     * @param dryRun whether to append {@code --dry-run}
+     * @param extraArgs additional command-line tokens
+     */
+    private static void appendCommonArgs(List<String> args, boolean dryRun, String... extraArgs) {
+        // Tests back-date their orphan files to now - 2d via makeOld(); a cutoff at now - 1d
+        // safely puts those files strictly before the cutoff (mtime < cutoff → DELETE-eligible).
+        String cutoff = OffsetDateTime.now(ZoneOffset.UTC).minusDays(1).format(CUTOFF_FORMATTER);
+        args.add("--older-than");
+        args.add(cutoff);
+        Collections.addAll(args, extraArgs);
+        if (dryRun) {
+            args.add("--dry-run");
+        }
+    }
+
+    /**
+     * Builds {@code log/<db>/<table>-<tableId>/<partitionName>-p<partitionId>/0/<fixed-uuid>/}
+     * under {@code remoteRoot}, writes a one-byte file named {@code fileName} into it, and
+     * back-dates the file and every directory up to and including the partition directory.
+     *
+     * @return the partition directory and the file that was written
+     */
+    private OrphanPartitionLayout createOldOrphanPartitionLayout(
+            Path remoteRoot,
+            TablePath tablePath,
+            long tableId,
+            String partitionName,
+            long partitionId,
+            String fileName)
+            throws Exception {
+        Path partitionDir =
+                remoteRoot
+                        .resolve("log")
+                        .resolve(tablePath.getDatabaseName())
+                        .resolve(tablePath.getTableName() + "-" + tableId)
+                        .resolve(partitionName + "-p" + partitionId);
+        Path bucketDir = partitionDir.resolve("0");
+        Path segmentDir =
+                bucketDir.resolve(
+                        UUID.fromString("bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb").toString());
+        Files.createDirectories(segmentDir);
+
+        Path orphanFile = segmentDir.resolve(fileName);
+        Files.write(orphanFile, new byte[] {0x66});
+
+        // Back-date from the file upwards, after all content has been created.
+        for (Path aged : new Path[] {orphanFile, segmentDir, bucketDir, partitionDir}) {
+            makeOld(aged);
+        }
+        return new OrphanPartitionLayout(partitionDir, orphanFile);
+    }
+
+    /**
+     * Builds {@code log/<dbName>/<tableName>-<tableId>/0/<fixed-uuid>/} under {@code remoteRoot},
+     * writes a one-byte file named {@code fileName} into it, and back-dates the file and every
+     * directory up to and including the table directory.
+     *
+     * @return the table directory and the file that was written
+     */
+    private OrphanTableLayout createOldOrphanTableLayout(
+            Path remoteRoot, String dbName, long tableId, String tableName, String fileName)
+            throws Exception {
+        Path tableDir =
+                remoteRoot.resolve("log").resolve(dbName).resolve(tableName + "-" + tableId);
+        Path bucketDir = tableDir.resolve("0");
+        Path segmentDir =
+                bucketDir.resolve(
+                        UUID.fromString("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa").toString());
+        Files.createDirectories(segmentDir);
+
+        Path orphanFile = segmentDir.resolve(fileName);
+        Files.write(orphanFile, new byte[] {0x42});
+
+        // Back-date from the file upwards, after all content has been created.
+        for (Path aged : new Path[] {orphanFile, segmentDir, bucketDir, tableDir}) {
+            makeOld(aged);
+        }
+        return new OrphanTableLayout(tableDir, orphanFile);
+    }
+
+    /**
+     * Builds {@code kv/<dbName>/<tableName>-<tableId>/0/shared/} under {@code remoteRoot}, writes
+     * a one-byte file named {@code fileName} into it, and back-dates the file and every directory
+     * up to and including the table directory.
+     *
+     * @return the table directory and the file that was written
+     */
+    private OrphanTableLayout createOldOrphanKvTableLayout(
+            Path remoteRoot, String dbName, long tableId, String tableName, String fileName)
+            throws Exception {
+        Path tableDir = remoteRoot.resolve("kv").resolve(dbName).resolve(tableName + "-" + tableId);
+        Path bucketDir = tableDir.resolve("0");
+        Path sharedDir = bucketDir.resolve("shared");
+        Files.createDirectories(sharedDir);
+
+        Path orphanFile = sharedDir.resolve(fileName);
+        Files.write(orphanFile, new byte[] {0x24});
+
+        // Back-date from the file upwards, after all content has been created.
+        for (Path aged : new Path[] {orphanFile, sharedDir, bucketDir, tableDir}) {
+            makeOld(aged);
+        }
+        return new OrphanTableLayout(tableDir, orphanFile);
+    }
+
+    /** Returns {@code prefix} followed by the current {@link System#nanoTime()} value. */
+    private static String newDatabaseName(String prefix) {
+        long suffix = System.nanoTime();
+        return prefix + suffix;
+    }
+
+    /** Builds a {@link PartitionSpec} holding the single entry {@code key -> value}. */
+    private static PartitionSpec partitionSpec(String key, String value) {
+        Map<String, String> singleEntry = Collections.singletonMap(key, value);
+        return new PartitionSpec(singleEntry);
+    }
+
+    /**
+     * Converts an {@link FsPath} into a local {@link Path} by parsing its string form as a URI.
+     */
+    private static Path localPath(FsPath path) {
+        java.net.URI location = java.net.URI.create(path.toString());
+        return Paths.get(location);
+    }
+
+    /**
+     * Renders a version-1 manifest JSON document containing exactly one remote log segment.
+     *
+     * <p>Database, table, table id, bucket id, max timestamp and size are fixed placeholder
+     * values; only the segment id and the two offsets come from the arguments.
+     */
+    private static String manifestJson(String segmentId, long startOffset, long endOffset) {
+        StringBuilder json = new StringBuilder();
+        json.append("{\"version\":1,");
+        json.append("\"database\":\"db\",");
+        json.append("\"table\":\"t\",");
+        json.append("\"table_id\":0,");
+        json.append("\"bucket_id\":0,");
+        json.append("\"remote_log_segments\":[{");
+        json.append("\"segment_id\":\"").append(segmentId);
+        json.append("\",\"start_offset\":").append(startOffset);
+        json.append(",\"end_offset\":").append(endOffset);
+        json.append(",\"max_timestamp\":0,");
+        json.append("\"size_in_bytes\":1");
+        json.append("}]}");
+        return json.toString();
+    }
+
+    /** Sets the last-modified time of {@code path} to the current time minus {@code OLD_ENOUGH}. */
+    private void makeOld(Path path) throws Exception {
+        long backdatedMillis = System.currentTimeMillis() - OLD_ENOUGH.toMillis();
+        Files.setLastModifiedTime(path, FileTime.fromMillis(backdatedMillis));
+    }
+
+    /** Immutable holder for a partitioned test table: its path, its id and one partition. */
+    private static final class PartitionedTableLayout {
+        private final TablePath tablePath;
+        private final long tableId;
+        private final PartitionInfo partitionInfo;
+
+        private PartitionedTableLayout(TablePath path, long id, PartitionInfo partition) {
+            tablePath = path;
+            tableId = id;
+            partitionInfo = partition;
+        }
+    }
+
+    /** Immutable holder for a planted orphan partition: its directory and the file inside it. */
+    private static final class OrphanPartitionLayout {
+        private final Path partitionDir;
+        private final Path orphanFile;
+
+        private OrphanPartitionLayout(Path dir, Path file) {
+            partitionDir = dir;
+            orphanFile = file;
+        }
+    }
+
+    /** Immutable holder for a planted orphan table: its directory and the file inside it. */
+    private static final class OrphanTableLayout {
+        private final Path tableDir;
+        private final Path orphanFile;
+
+        private OrphanTableLayout(Path dir, Path file) {
+            tableDir = dir;
+            orphanFile = file;
+        }
+    }
+
+    /** Log4j appender that records the formatted message of every event it receives. */
+    private static final class CapturingAppender extends AbstractAppender {
+
+        // Typed as List<String> so callers can apply String predicates to the elements.
+        // NOTE(review): CopyOnWriteArrayList presumably guards against events being appended
+        // while a test takes a snapshot -- confirm which threads log to this appender.
+        private final List<String> messages = new CopyOnWriteArrayList<>();
+
+        /**
+         * Creates the appender with no filter and no layout, with {@code ignoreExceptions} set to
+         * {@code true} and no properties.
+         *
+         * @param name appender name
+         */
+        CapturingAppender(String name) {
+            super(
+                    name,
+                    null,
+                    null,
+                    true,
+                    org.apache.logging.log4j.core.config.Property.EMPTY_ARRAY);
+        }
+
+        /** Stores the event's formatted message text. */
+        @Override
+        public void append(LogEvent event) {
+            messages.add(event.getMessage().getFormattedMessage());
+        }
+
+        /**
+         * Returns a snapshot of the messages captured so far.
+         *
+         * @return a new list; changing it does not affect the appender
+         */
+        List<String> messages() {
+            return new ArrayList<>(messages);
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifierTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifierTest.java
new file mode 100644
index 0000000000..8746be4ae5
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/RpcErrorClassifierTest.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan;
+
+import org.apache.fluss.exception.FlussRuntimeException;
+import org.apache.fluss.exception.PartitionNotExistException;
+import org.apache.fluss.exception.TableNotExistException;
+
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.util.concurrent.CompletionException;
+import java.util.concurrent.TimeoutException;
+
+import static org.apache.fluss.flink.action.orphan.RpcErrorClassifier.Category.NOT_FOUND;
+import static org.apache.fluss.flink.action.orphan.RpcErrorClassifier.Category.SERVER_ERROR;
+import static org.apache.fluss.flink.action.orphan.RpcErrorClassifier.Category.TRANSIENT;
+import static org.apache.fluss.flink.action.orphan.RpcErrorClassifier.Category.UNKNOWN;
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for the exception-to-category mapping of {@link RpcErrorClassifier}. */
+class RpcErrorClassifierTest {
+
+    @Test
+    void tableNotExistIsNotFound() {
+        assertClassifiedAs(new TableNotExistException("x"), NOT_FOUND);
+    }
+
+    @Test
+    void partitionNotExistIsNotFound() {
+        assertClassifiedAs(new PartitionNotExistException("x"), NOT_FOUND);
+    }
+
+    @Test
+    void ioExceptionIsTransient() {
+        assertClassifiedAs(new IOException("conn reset"), TRANSIENT);
+    }
+
+    @Test
+    void timeoutIsTransient() {
+        assertClassifiedAs(new TimeoutException("rpc"), TRANSIENT);
+    }
+
+    @Test
+    void unwrapsCompletionException() {
+        Exception wrapped = new CompletionException(new TableNotExistException("x"));
+        assertClassifiedAs(wrapped, NOT_FOUND);
+    }
+
+    @Test
+    void flussServerErrorIsServerError() {
+        assertClassifiedAs(new FlussRuntimeException("internal"), SERVER_ERROR);
+    }
+
+    @Test
+    void otherRuntimeIsUnknown() {
+        assertClassifiedAs(new IllegalStateException("?"), UNKNOWN);
+    }
+
+    /** Asserts that {@link RpcErrorClassifier#classify} maps {@code error} to {@code expected}. */
+    private static void assertClassifiedAs(Exception error, RpcErrorClassifier.Category expected) {
+        assertThat(RpcErrorClassifier.classify(error)).isEqualTo(expected);
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcherTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcherTest.java
new file mode 100644
index 0000000000..7144b4f031
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/build/ActiveRefsFetcherTest.java
@@ -0,0 +1,440 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.build;
+
+import org.apache.fluss.client.metadata.ActiveKvSnapshots;
+import org.apache.fluss.client.metadata.RemoteLogManifestInfo;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.utils.FlussPaths;
+
+import org.junit.jupiter.api.Test;
+
+import javax.annotation.Nullable;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayDeque;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Deque;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.concurrent.atomic.AtomicReference;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Tests for {@link ActiveRefsFetcher}: log and KV active refs from coordinator metadata. */
+class ActiveRefsFetcherTest {
+
+    @Test
+    void emptyManifestListReturnsEmptyResult() {
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        admin.queueEmptyResponse();
+
+        StubManifestReader reader = new StubManifestReader();
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.NOT_LISTED);
+        // Empty success must NOT trigger a retry — lock down call count.
+        assertThat(rpcCalls.get()).isEqualTo(1);
+    }
+
+    @Test
+    void fileNotFoundMarksBucketReadFailedWithoutRetry() {
+        // Locks down "no per-bucket retry": a single FileNotFound on the manifest second-read
+        // immediately marks the bucket READ_FAILED; recovery is left to the next cleanup round.
+        // This prevents N × retries × IO blow-up during cluster-wide manifest upsert turbulence.
+        FsPath p0 = new FsPath("oss://b/log/db/t-7/0/metadata/p0.manifest");
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        admin.queueResponse(p0);
+
+        StubManifestReader reader = new StubManifestReader();
+        reader.failWithNotFound(p0);
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.READ_FAILED);
+        assertThat(result.readFailureReason(0))
+                .contains("Manifest not found (likely upserted concurrently)")
+                .contains("bucketId=0");
+        // Per-target RPC issued exactly once; no per-bucket retry burst.
+        assertThat(rpcCalls.get()).isEqualTo(1);
+    }
+
+    @Test
+    void fetchLogActiveRefsByBucket_abortsOnlyFailedBucket() throws Exception {
+        FsPath p0 = new FsPath("oss://b/log/db/t-7/0/metadata/p0.manifest");
+        FsPath p1 = new FsPath("oss://b/log/db/t-7/1/metadata/p1.manifest");
+        String manifestJson = manifestJson("11111111-1111-1111-1111-111111111111", 7L, 9L);
+
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        admin.queueMultiBucketResponse(p0, p1);
+
+        StubManifestReader reader = new StubManifestReader();
+        reader.returnBytes(p0, manifestJson.getBytes(StandardCharsets.UTF_8));
+        reader.failWithNotFound(p1);
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.RESOLVED);
+        assertThat(result.statusFor(1))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.READ_FAILED);
+        assertThat(result.activeRefsOf(0).logSegmentRelativePaths())
+                .containsExactlyInAnyOrder(
+                        "11111111-1111-1111-1111-111111111111/"
+                                + FlussPaths.filenamePrefixFromOffset(7L)
+                                + ".log",
+                        "11111111-1111-1111-1111-111111111111/"
+                                + FlussPaths.filenamePrefixFromOffset(7L)
+                                + ".index",
+                        "11111111-1111-1111-1111-111111111111/"
+                                + FlussPaths.filenamePrefixFromOffset(7L)
+                                + ".timeindex",
+                        "11111111-1111-1111-1111-111111111111/"
+                                + FlussPaths.filenamePrefixFromOffset(9L)
+                                + ".writer_snapshot");
+        assertThat(result.readFailureReason(1))
+                .contains("Manifest not found (likely upserted concurrently)")
+                .contains("bucketId=1");
+        assertThat(result.statusFor(2))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.NOT_LISTED);
+        // Per-target RPC issued exactly once; per-bucket failure does not trigger any extra RPC.
+        assertThat(rpcCalls.get()).isEqualTo(1);
+    }
+
+    @Test
+    void fetchLogActiveRefsByBucket_targetRpcFailure() {
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+
+        ActiveRefsFetcher fetcher =
+                new ActiveRefsFetcher(admin, new StubManifestReader(), /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isFalse();
+        assertThat(result.listFailureReason()).contains("RPC failure for tableId=7");
+        // Per-bucket queries are not meaningful when listOk=false.
+        assertThatThrownBy(() -> result.statusFor(0)).isInstanceOf(IllegalStateException.class);
+        // Per-target RPC is retried up to maxRetries times before giving up.
+        assertThat(rpcCalls.get()).isEqualTo(3);
+    }
+
+    @Test
+    void manifestParseFailureMarksBucketReadFailed() {
+        FsPath p0 = new FsPath("oss://b/log/db/t-7/0/metadata/p0.manifest");
+        StubAdmin admin = new StubAdmin(new AtomicInteger());
+        admin.queueResponse(p0);
+
+        StubManifestReader reader = new StubManifestReader();
+        reader.returnBytes(p0, "{}".getBytes(StandardCharsets.UTF_8));
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.READ_FAILED);
+        assertThat(result.readFailureReason(0))
+                .contains("Manifest parse failure")
+                .contains("bucketId=0");
+    }
+
+    @Test
+    void ioErrorMarksBucketReadFailed() {
+        FsPath p0 = new FsPath("oss://b/log/db/t-7/0/metadata/p0.manifest");
+        StubAdmin admin = new StubAdmin(new AtomicInteger());
+        admin.queueResponse(p0);
+
+        StubManifestReader reader = new StubManifestReader();
+        reader.failWithIo(p0, new IOException("disk fault"));
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.READ_FAILED);
+        assertThat(result.readFailureReason(0)).contains("IO error reading manifest");
+    }
+
+    @Test
+    void fetchKvActiveSnapDirsAggregatesPerBucket() {
+        StubAdmin admin = new StubAdmin(new AtomicInteger());
+        Map<Integer, Set<Long>> snapshotIds = new HashMap<>();
+        snapshotIds.put(0, new HashSet<>(Arrays.asList(9L, 10L)));
+        snapshotIds.put(1, new HashSet<>(Arrays.asList(5L)));
+        admin.queueKvResponseMultiBucket(snapshotIds);
+
+        ActiveRefsFetcher fetcher =
+                new ActiveRefsFetcher(admin, /* metadataReader */ null, /* maxRetries= */ 3);
+        KvActiveRefsFetchResult result = fetcher.fetchKvActiveSnapDirs(7L, null);
+
+        assertThat(result.listOk()).isTrue();
+        Map<Integer, Set<String>> perBucket = result.activeSnapDirsByBucket();
+        assertThat(perBucket.get(0)).containsExactlyInAnyOrder("snap-9", "snap-10");
+        assertThat(perBucket.get(1)).containsExactly("snap-5");
+    }
+
+    /**
+     * Symmetric with {@link #fetchLogActiveRefsByBucket_targetRpcFailure}: the KV per-target RPC
+     * retries up to {@code maxRetries} times and reports {@code listOk=false} on exhaustion.
+     */
+    @Test
+    void fetchKvActiveSnapDirsRetriesThenReportsListFailure() {
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        // No queued KV response → StubAdmin returns failed CompletableFutures on every attempt.
+
+        ActiveRefsFetcher fetcher =
+                new ActiveRefsFetcher(admin, /* metadataReader */ null, /* maxRetries= */ 3);
+        KvActiveRefsFetchResult result = fetcher.fetchKvActiveSnapDirs(7L, null);
+
+        assertThat(result.listOk()).isFalse();
+        // Reason is classified via RpcErrorClassifier for audit compatibility.
+        assertThat(result.listFailureReason()).isNotEmpty();
+        // Per-target RPC is retried up to maxRetries times before giving up.
+        assertThat(rpcCalls.get()).isEqualTo(3);
+    }
+
+    /**
+     * Verifies that a non-null {@code partitionId} is forwarded to the underlying {@code
+     * listRemoteLogManifests} RPC by {@link ActiveRefsFetcher#fetchLogActiveRefsByBucket}.
+     */
+    @Test
+    void fetchLogActiveRefsByBucketWithPartitionIdRoutesCorrectly() throws Exception {
+        FsPath p0 = new FsPath("oss://b/log/db/t-7/0/metadata/p0.manifest");
+        String manifestJson = manifestJson("11111111-1111-1111-1111-111111111111", 7L, 9L);
+
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        admin.queueResponse(p0);
+
+        StubManifestReader reader = new StubManifestReader();
+        reader.returnBytes(p0, manifestJson.getBytes(StandardCharsets.UTF_8));
+
+        ActiveRefsFetcher fetcher = new ActiveRefsFetcher(admin, reader, /* maxRetries= */ 3);
+        LogActiveRefsFetchResult result = fetcher.fetchLogActiveRefsByBucket(7L, 42L);
+
+        assertThat(result.listOk()).isTrue();
+        assertThat(result.statusFor(0))
+                .isEqualTo(LogActiveRefsFetchResult.ManifestReadStatus.RESOLVED);
+        // Proves partitionId=42 was forwarded to the RPC (sentinel Long.MIN_VALUE would mean
+        // the stub was never invoked).
+        assertThat(admin.lastLogPartitionId.get())
+                .as("partitionId must be forwarded to listRemoteLogManifests RPC")
+                .isEqualTo(42L);
+        assertThat(rpcCalls.get())
+                .as("happy path must issue exactly one listRemoteLogManifests RPC")
+                .isEqualTo(1);
+    }
+
+    /**
+     * Verifies that a non-null {@code partitionId} is forwarded to the underlying {@code
+     * listKvSnapshots} RPC by {@link ActiveRefsFetcher#fetchKvActiveSnapDirs}.
+     */
+    @Test
+    void fetchKvActiveSnapDirsWithPartitionIdRoutesCorrectly() {
+        AtomicInteger rpcCalls = new AtomicInteger(0);
+        StubAdmin admin = new StubAdmin(rpcCalls);
+        admin.queueKvResponse(0, 5L);
+
+        ActiveRefsFetcher fetcher =
+                new ActiveRefsFetcher(admin, /* metadataReader */ null, /* maxRetries= */ 3);
+        KvActiveRefsFetchResult result = fetcher.fetchKvActiveSnapDirs(7L, 99L);
+
+        assertThat(result.listOk()).isTrue();
+        Map<Integer, Set<String>> perBucket = result.activeSnapDirsByBucket();
+        assertThat(perBucket).containsOnlyKeys(0);
+        assertThat(perBucket.get(0)).containsExactly("snap-5");
+        // Proves partitionId=99 was forwarded to the RPC.
+        assertThat(admin.lastKvPartitionId.get())
+                .as("partitionId must be forwarded to listKvSnapshots RPC")
+                .isEqualTo(99L);
+        assertThat(rpcCalls.get())
+                .as("happy path must issue exactly one listKvSnapshots RPC")
+                .isEqualTo(1);
+    }
+
+    // -------------------------------------------------------------------------
+    // Test fixtures
+    // -------------------------------------------------------------------------
+
+    private static String manifestJson(String segmentId, long startOffset, long endOffset) {
+        return "{\"version\":1,"
+                + "\"database\":\"db\","
+                + "\"table\":\"t\","
+                + "\"table_id\":7,"
+                + "\"bucket_id\":0,"
+                + "\"remote_log_segments\":[{"
+                + "\"segment_id\":\""
+                + segmentId
+                + "\",\"start_offset\":"
+                + startOffset
+                + ",\"end_offset\":"
+                + endOffset
+                + ",\"max_timestamp\":0,"
+                + "\"size_in_bytes\":1"
+                + "}]}";
+    }
+
+    /** Queues per-call responses for ListRemoteLogManifests / ListKvSnapshots and tracks calls. */
+    private static final class StubAdmin implements ActiveRefsFetcher.AdminFacade {
+
+        private final Deque<List<RemoteLogManifestInfo>> responses = new ArrayDeque<>();
+        private final Deque<ActiveKvSnapshots> kvResponses = new ArrayDeque<>();
+        private final AtomicInteger callCounter;
+        private final AtomicReference<Long> lastLogPartitionId =
+                new AtomicReference<>(Long.MIN_VALUE);
+        private final AtomicReference<Long> lastKvPartitionId =
+                new AtomicReference<>(Long.MIN_VALUE);
+
+        StubAdmin(AtomicInteger callCounter) {
+            this.callCounter = callCounter;
+        }
+
+        void queueResponse(FsPath manifestPath) {
+            queueResponse(manifestPath, 0);
+        }
+
+        void queueResponse(FsPath manifestPath, int bucketId) {
+            List<RemoteLogManifestInfo> list = new ArrayList<>();
+            list.add(
+                    new RemoteLogManifestInfo(
+                            new TableBucket(7L, bucketId), manifestPath.toString(), 0L));
+            responses.add(list);
+        }
+
+        void queueMultiBucketResponse(FsPath manifestPath0, FsPath manifestPath1) {
+            List<RemoteLogManifestInfo> list = new ArrayList<>();
+            list.add(
+                    new RemoteLogManifestInfo(
+                            new TableBucket(7L, 0), manifestPath0.toString(), 0L));
+            list.add(
+                    new RemoteLogManifestInfo(
+                            new TableBucket(7L, 1), manifestPath1.toString(), 0L));
+            responses.add(list);
+        }
+
+        void queueEmptyResponse() {
+            responses.add(Collections.emptyList());
+        }
+
+        void queueKvResponse(int bucketId, long... snapshotIds) {
+            Map<Integer, Set<Long>> snapshotIdsByBucket = new HashMap<>();
+            Set<Long> ids = new HashSet<>();
+            for (long id : snapshotIds) {
+                ids.add(id);
+            }
+            snapshotIdsByBucket.put(bucketId, ids);
+            kvResponses.add(new ActiveKvSnapshots(7L, null, snapshotIdsByBucket));
+        }
+
+        void queueKvResponseMultiBucket(Map<Integer, Set<Long>> snapshotIdsByBucket) {
+            kvResponses.add(new ActiveKvSnapshots(7L, null, snapshotIdsByBucket));
+        }
+
+        @Override
+        public CompletableFuture<List<RemoteLogManifestInfo>> listRemoteLogManifests(
+                long tableId, @Nullable Long partitionId) {
+            callCounter.incrementAndGet();
+            lastLogPartitionId.set(partitionId);
+            List<RemoteLogManifestInfo> next = responses.poll();
+            if (next == null) {
+                CompletableFuture<List<RemoteLogManifestInfo>> failed = new CompletableFuture<>();
+                failed.completeExceptionally(
+                        new IllegalStateException("StubAdmin: no more queued responses"));
+                return failed;
+            }
+            return CompletableFuture.completedFuture(next);
+        }
+
+        @Override
+        public CompletableFuture<ActiveKvSnapshots> listKvSnapshots(
+                long tableId, @Nullable Long partitionId) {
+            callCounter.incrementAndGet();
+            lastKvPartitionId.set(partitionId);
+            ActiveKvSnapshots next = kvResponses.poll();
+            if (next == null) {
+                CompletableFuture<ActiveKvSnapshots> failed = new CompletableFuture<>();
+                failed.completeExceptionally(
+                        new IllegalStateException("StubAdmin: no more queued kv responses"));
+                return failed;
+            }
+            return CompletableFuture.completedFuture(next);
+        }
+    }
+
+    /** Per-path file-content / failure registry for the second-read step. */
+    private static final class StubManifestReader implements ActiveRefsFetcher.MetadataReader {
+
+        private final Map<String, byte[]> bytesByPath = new HashMap<>();
+        private final Set<String> notFoundPaths = new HashSet<>();
+        private final Map<String, IOException> ioFailuresByPath = new HashMap<>();
+
+        void returnBytes(FsPath path, byte[] data) {
+            bytesByPath.put(path.toString(), data);
+        }
+
+        void failWithNotFound(FsPath path) {
+            notFoundPaths.add(path.toString());
+        }
+
+        void failWithIo(FsPath path, IOException e) {
+            ioFailuresByPath.put(path.toString(), e);
+        }
+
+        @Override
+        public byte[] read(FsPath path) throws IOException {
+            String key = path.toString();
+            if (notFoundPaths.contains(key)) {
+                throw new FileNotFoundException(key);
+            }
+            IOException io = ioFailuresByPath.get(key);
+            if (io != null) {
+                throw io;
+            }
+            byte[] data = bytesByPath.get(key);
+            if (data == null) {
+                throw new FileNotFoundException(key);
+            }
+            return data;
+        }
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfigTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfigTest.java
new file mode 100644
index 0000000000..222d4743e2
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/config/OrphanCleanConfigTest.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.config;
+
+import org.apache.fluss.flink.adapter.MultipleParameterToolAdapter;
+
+import org.junit.jupiter.api.Test;
+
+import java.time.Duration;
+import java.time.OffsetDateTime;
+import java.time.ZoneOffset;
+import java.time.format.DateTimeFormatter;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Tests for command-line parsing and validation in {@link OrphanCleanConfig#fromParams}. */
+class OrphanCleanConfigTest {
+
+    private static final DateTimeFormatter CUTOFF_FORMATTER =
+            DateTimeFormatter.ISO_OFFSET_DATE_TIME;
+
+    @Test
+    void parsesAllDatabasesWithDefaults() {
+        String[] args = {"--bootstrap-server", "h:9123", "--all-databases"};
+        long beforeParse = System.currentTimeMillis();
+        OrphanCleanConfig config =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        long afterParse = System.currentTimeMillis();
+
+        assertThat(config.allDatabases()).isTrue();
+        assertThat(config.database()).isEmpty();
+        // Default cutoff is now minus 3 days; bracket it with clock readings around the parse.
+        long threeDaysMillis = Duration.ofDays(3).toMillis();
+        assertThat(config.olderThanMillis())
+                .isBetween(beforeParse - threeDaysMillis, afterParse - threeDaysMillis);
+        assertThat(config.dryRun()).isFalse();
+        assertThat(config.remoteFsOpRateLimitPerSecond()).isEqualTo(100L);
+        assertThat(config.allowDeleteManifest()).isFalse();
+        assertThat(config.allowCleanOrphanTables()).isFalse();
+        assertThat(config.allowCleanOrphanPartitions()).isFalse();
+    }
+
+    @Test
+    void remoteFsOpRateLimitParsed() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--remote-fs-op-rate-limit-per-second",
+            "42"
+        };
+
+        OrphanCleanConfig parsed =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        assertThat(parsed.remoteFsOpRateLimitPerSecond()).isEqualTo(42L);
+    }
+
+    @Test
+    void remoteFsOpRateLimitMustBePositive() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--remote-fs-op-rate-limit-per-second",
+            "0"
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("--remote-fs-op-rate-limit-per-second must be positive");
+    }
+
+    @Test
+    void databaseAndAllDatabasesAreMutuallyExclusive() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--database",
+            "x",
+            "--all-databases"
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("mutually exclusive");
+    }
+
+    @Test
+    void cutoffCloserThanOneDayRejected() {
+        OffsetDateTime tooClose = OffsetDateTime.now(ZoneOffset.UTC).minusMinutes(30);
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--older-than",
+            tooClose.format(CUTOFF_FORMATTER)
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("at least 1d before now");
+    }
+
+    @Test
+    void cutoffWithoutExplicitOffsetRejected() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--older-than",
+            "2024-01-01 00:00:00"
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("ISO-8601");
+    }
+
+    @Test
+    void cutoffWithExplicitOffsetParsed() {
+        OffsetDateTime cutoff = OffsetDateTime.now(ZoneOffset.UTC).minusDays(2).withNano(0);
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--older-than",
+            cutoff.format(CUTOFF_FORMATTER)
+        };
+
+        OrphanCleanConfig parsed =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        assertThat(parsed.olderThanMillis()).isEqualTo(cutoff.toInstant().toEpochMilli());
+    }
+
+    @Test
+    void tableCannotBeUsedWithAllDatabases() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--table",
+            "t1"
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("--table requires --database");
+    }
+
+    @Test
+    void bootstrapServerRequired() {
+        String[] args = {"--all-databases"};
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("bootstrap-server");
+    }
+
+    @Test
+    void optInFlagsParsed() {
+        String[] args = {
+            "--bootstrap-server",
+            "x:1",
+            "--all-databases",
+            "--allow-delete-manifest",
+            "--allow-clean-orphan-tables",
+            "--allow-clean-orphan-partitions"
+        };
+
+        OrphanCleanConfig parsed =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        assertThat(parsed.allowDeleteManifest()).isTrue();
+        assertThat(parsed.allowCleanOrphanTables()).isTrue();
+        assertThat(parsed.allowCleanOrphanPartitions()).isTrue();
+    }
+
+    @Test
+    void extraConfigsParsed() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--conf",
+            "fs.oss.accessKeyId=myKey",
+            "--conf",
+            "fs.oss.accessKeySecret=mySecret",
+            "--conf",
+            "fs.oss.endpoint=oss-cn-hangzhou.aliyuncs.com"
+        };
+
+        OrphanCleanConfig parsed =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        assertThat(parsed.extraConfigs()).hasSize(3);
+        assertThat(parsed.extraConfigs().get("fs.oss.accessKeyId")).isEqualTo("myKey");
+        assertThat(parsed.extraConfigs().get("fs.oss.accessKeySecret")).isEqualTo("mySecret");
+        assertThat(parsed.extraConfigs().get("fs.oss.endpoint"))
+                .isEqualTo("oss-cn-hangzhou.aliyuncs.com");
+    }
+
+    @Test
+    void extraConfigsEmptyWhenNotProvided() {
+        String[] args = {"--bootstrap-server", "h:9123", "--all-databases"};
+
+        OrphanCleanConfig parsed =
+                OrphanCleanConfig.fromParams(MultipleParameterToolAdapter.fromArgs(args));
+        assertThat(parsed.extraConfigs()).isEmpty();
+    }
+
+    @Test
+    void extraConfigsRejectsMalformedEntry() {
+        String[] args = {
+            "--bootstrap-server",
+            "h:9123",
+            "--all-databases",
+            "--conf",
+            "noEqualsSign"
+        };
+        assertThatThrownBy(
+                        () ->
+                                OrphanCleanConfig.fromParams(
+                                        MultipleParameterToolAdapter.fromArgs(args)))
+                .isInstanceOf(IllegalArgumentException.class)
+                .hasMessageContaining("key=value");
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleterTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleterTest.java
new file mode 100644
index 0000000000..8be4bd3d11
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/fs/SafeDeleterTest.java
@@ -0,0 +1,93 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.fs;
+
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.rule.Decision;
+import org.apache.fluss.flink.action.orphan.rule.RuleId;
+import org.apache.fluss.fs.FileSystem;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.fs.local.LocalFileSystem;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+/** Unit tests for {@link SafeDeleter}, exercised against the local file system. */
+class SafeDeleterTest {
+
+    @TempDir Path tmp;
+
+    @Test
+    void deleteFileRespectsDryRun() throws IOException {
+        Path orphan = Files.createFile(tmp.resolve("orphan.log"));
+        FsPath orphanPath = new FsPath(orphan.toString());
+        newDeleter(localFs(), true).deleteFile(orphanPath, Decision.DELETE, RuleId.LOG_SEGMENT);
+        assertThat(Files.exists(orphan)).isTrue();
+    }
+
+    @Test
+    void deleteFileActuallyDeletesWhenNotDryRun() throws IOException {
+        Path orphan = Files.createFile(tmp.resolve("orphan.log"));
+        FsPath orphanPath = new FsPath(orphan.toString());
+        newDeleter(localFs(), false).deleteFile(orphanPath, Decision.DELETE, RuleId.LOG_SEGMENT);
+        assertThat(Files.exists(orphan)).isFalse();
+    }
+
+    @Test
+    void deleteFileRejectsNonDeleteDecision() {
+        SafeDeleter deleter = newDeleter(null, false);
+        FsPath anyPath = new FsPath("/tmp/x");
+        // The deleter is built without a file system (null), so the rejection is expected to
+        // happen before any file-system access.
+        assertThatThrownBy(() -> deleter.deleteFile(anyPath, Decision.KEEP_ACTIVE, RuleId.UNKNOWN))
+                .isInstanceOf(IllegalArgumentException.class);
+    }
+
+    @Test
+    void deleteEmptyDirNoOpsOnNonEmpty() throws IOException {
+        Path nonEmptyDir = Files.createDirectory(tmp.resolve("d"));
+        Files.createFile(nonEmptyDir.resolve("child"));
+        FsPath dirPath = new FsPath(nonEmptyDir.toString());
+        newDeleter(localFs(), false).deleteEmptyDir(dirPath);
+        assertThat(Files.exists(nonEmptyDir)).isTrue();
+    }
+
+    @Test
+    void deleteEmptyDirActuallyDeletes() throws IOException {
+        Path emptyDir = Files.createDirectory(tmp.resolve("d"));
+        FsPath dirPath = new FsPath(emptyDir.toString());
+        newDeleter(localFs(), false).deleteEmptyDir(dirPath);
+        assertThat(Files.exists(emptyDir)).isFalse();
+    }
+
+    private static SafeDeleter newDeleter(FileSystem fs, boolean dryRun) {
+        return new SafeDeleter(fs, dryRun, new AuditLogger(), RateLimiter.create(1000.0));
+    }
+
+    private static FileSystem localFs() {
+        return LocalFileSystem.getSharedInstance();
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/job/BucketCleanerTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/job/BucketCleanerTest.java
new file mode 100644
index 0000000000..b0fc5484f5
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/job/BucketCleanerTest.java
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.job;
+
+import org.apache.fluss.flink.action.orphan.audit.AuditLogger;
+import org.apache.fluss.flink.action.orphan.fs.SafeDeleter;
+import org.apache.fluss.flink.action.orphan.rule.BucketActiveRefs;
+import org.apache.fluss.flink.action.orphan.rule.RuleDispatcher;
+import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.shaded.guava32.com.google.common.util.concurrent.RateLimiter;
+import org.apache.fluss.utils.FlussPaths;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.attribute.FileTime;
+import java.util.Collections;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link BucketCleaner} against a temporary local directory tree. */
+class BucketCleanerTest {
+
+    @Test
+    void removesOldEmptySegmentDirAfterDeletingExpiredFiles(@TempDir Path tmp) throws IOException {
+        Path bucketRoot = Files.createDirectories(tmp.resolve("bucket"));
+        Path segmentDir =
+                Files.createDirectories(bucketRoot.resolve("11111111-1111-1111-1111-111111111111"));
+        String logFileName = FlussPaths.filenamePrefixFromOffset(0L) + FlussPaths.LOG_FILE_SUFFIX;
+        Path logFile = Files.write(segmentDir.resolve(logFileName), new byte[] {0x42});
+
+        // Back-date the file and both directories to one second before the cutoff.
+        long cutoff = System.currentTimeMillis() - 1000L;
+        long beforeCutoff = cutoff - 1000L;
+        makeOld(logFile, beforeCutoff);
+        makeOld(segmentDir, beforeCutoff);
+        makeOld(bucketRoot, beforeCutoff);
+
+        BucketCleaner.BucketCleanStats stats =
+                createCleaner(bucketRoot, cutoff)
+                        .clean(BucketActiveRefs.empty(), new FsPath(bucketRoot.toString()));
+
+        assertThat(stats.scanned).isEqualTo(1L);
+        assertThat(stats.deleted).isEqualTo(2L);
+        assertThat(stats.emptyDirsRemoved).isEqualTo(1L);
+
+        // The log file and its emptied segment directory are gone; the bucket root remains.
+        assertThat(Files.exists(logFile)).isFalse();
+        assertThat(Files.exists(segmentDir)).isFalse();
+        assertThat(Files.exists(bucketRoot)).isTrue();
+    }
+
+    @Test
+    void keepsFreshEmptySegmentDir(@TempDir Path tmp) throws IOException {
+        Path bucketRoot = Files.createDirectories(tmp.resolve("bucket"));
+        Path segmentDir =
+                Files.createDirectories(bucketRoot.resolve("11111111-1111-1111-1111-111111111111"));
+        long cutoff = System.currentTimeMillis() - 1000L;
+
+        // segmentDir is deliberately not back-dated: it was only just created, so its
+        // modification time is expected to be newer than the cutoff.
+        BucketCleaner cleaner = createCleaner(bucketRoot, cutoff);
+        BucketActiveRefs noActiveRefs =
+                new BucketActiveRefs(
+                        Collections.emptySet(), Collections.emptySet(), Collections.emptySet());
+
+        BucketCleaner.BucketCleanStats stats =
+                cleaner.clean(noActiveRefs, new FsPath(bucketRoot.toString()));
+
+        assertThat(stats.deleted).isEqualTo(0L);
+        assertThat(stats.emptyDirsRemoved).isEqualTo(0L);
+        assertThat(Files.exists(segmentDir)).isTrue();
+    }
+
+    @Test
+    void scansButDoesNotDeleteUnknownDotFiles(@TempDir Path tmp) throws IOException {
+        Path bucketRoot = Files.createDirectories(tmp.resolve("bucket"));
+        Path segmentDir =
+                Files.createDirectories(bucketRoot.resolve("11111111-1111-1111-1111-111111111111"));
+        Path dotFile = Files.write(segmentDir.resolve(".unknown"), new byte[] {0x42});
+        long cutoff = System.currentTimeMillis() - 1000L;
+        long beforeCutoff = cutoff - 1000L;
+        makeOld(dotFile, beforeCutoff);
+        makeOld(segmentDir, beforeCutoff);
+        makeOld(bucketRoot, beforeCutoff);
+
+        BucketCleaner.BucketCleanStats stats =
+                createCleaner(bucketRoot, cutoff)
+                        .clean(BucketActiveRefs.empty(), new FsPath(bucketRoot.toString()));
+
+        assertThat(stats.scanned).isEqualTo(1L);
+        assertThat(stats.deleted).isEqualTo(0L);
+        assertThat(stats.emptyDirsRemoved).isEqualTo(0L);
+        assertThat(Files.exists(dotFile)).isTrue();
+        assertThat(Files.exists(segmentDir)).isTrue();
+    }
+
+    private static void makeOld(Path path, long timestampMillis) throws IOException {
+        Files.setLastModifiedTime(path, FileTime.fromMillis(timestampMillis));
+    }
+
+    /** Creates a cleaner (dry-run off) whose deleter shares its rate limiter. */
+    private static BucketCleaner createCleaner(Path bucketRoot, long cutoff) throws IOException {
+        RateLimiter sharedLimiter = RateLimiter.create(1000.0);
+        SafeDeleter deleter =
+                new SafeDeleter(
+                        new FsPath(bucketRoot.toString()).getFileSystem(),
+                        false,
+                        new AuditLogger(),
+                        sharedLimiter);
+        return new BucketCleaner(
+                new RuleDispatcher(), deleter, new AuditLogger(), cutoff, sharedLimiter);
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRuleTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRuleTest.java
new file mode 100644
index 0000000000..c6267d31c8
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSharedSstRuleTest.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.fs.FsPath;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link KvSharedSstRule}. */
+class KvSharedSstRuleTest {
+
+    private static final long NOW = 1_700_000_000_000L;
+    private static final long DAY_MS = 24L * 60L * 60L * 1000L;
+    private static final long CUTOFF_MS = NOW - DAY_MS;
+
+    private final KvSharedSstRule rule = new KvSharedSstRule();
+
+    @Test
+    void keepsExpiredUnreferencedSharedSst() {
+        FileMeta file = file("/kv/db/t-1/0/shared/abc-001.sst", NOW - 2 * DAY_MS);
+        // Older than the cutoff and absent from the (empty) refs, yet still expected to be kept.
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    @Test
+    void keepsReferencedSharedSst() {
+        FileMeta file = file("/kv/db/t-1/0/shared/abc-001.sst", NOW - 2 * DAY_MS);
+        Set<String> sharedFiles = new HashSet<>();
+        sharedFiles.add("abc-001.sst");
+        BucketActiveRefs activeRefs =
+                new BucketActiveRefs(
+                        Collections.<String>emptySet(),
+                        Collections.<String>emptySet(),
+                        sharedFiles);
+
+        assertThat(rule.evaluate(file, activeRefs, CUTOFF_MS)).isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    @Test
+    void skipsUnknownNonSstFileUnderSharedDirectory() {
+        FileMeta file = file("/kv/db/t-1/0/shared/abc-001.meta", NOW - 2 * DAY_MS);
+
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    @Test
+    void skipsSstOutsideSharedDirectory() {
+        FileMeta file = file("/kv/db/t-1/0/snap-5/abc-001.sst", NOW - 2 * DAY_MS);
+
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    private static FileMeta file(String path, long modificationTime) {
+        return new FileMeta(new FsPath(path), 1L, modificationTime); // size is fixed at 1 byte
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRuleTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRuleTest.java
new file mode 100644
index 0000000000..c056d8e538
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/KvSnapshotFileRuleTest.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.fs.FsPath;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link KvSnapshotFileRule}. */
+class KvSnapshotFileRuleTest {
+
+    private static final long NOW = 1_700_000_000_000L;
+    private static final long DAY_MS = 24L * 60L * 60L * 1000L;
+
+    /** Absolute cutoff = NOW - 1d. Files with mtime strictly less are deletion-eligible. */
+    private static final long CUTOFF_MS = NOW - DAY_MS;
+
+    private final KvSnapshotFileRule rule = new KvSnapshotFileRule();
+
+    @Test
+    void deletesExpiredSnapshotFileOutsideBucketActiveRefs() {
+        FileMeta file = file("/kv/db/t-1/0/snap-5/001.sst", NOW - 2 * DAY_MS);
+        // snap-5 is not among the active directories and the file is older than the cutoff.
+        assertThat(rule.evaluate(file, kvActiveSnapDirs("snap-7", "snap-9"), CUTOFF_MS))
+                .isEqualTo(Decision.DELETE);
+    }
+
+    @Test
+    void keepsActiveSnapshotFile() {
+        FileMeta file = file("/kv/db/t-1/0/snap-5/001.sst", NOW - 2 * DAY_MS);
+
+        assertThat(rule.evaluate(file, kvActiveSnapDirs("snap-5"), CUTOFF_MS))
+                .isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    @Test
+    void defersSnapshotWhenMtimeAtOrAfterCutoff() {
+        FileMeta file = file("/kv/db/t-1/0/snap-5/001.sst", NOW - DAY_MS / 2);
+        // Half a day newer than the cutoff; the mtime == cutoff boundary is not exercised here.
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.DEFER);
+    }
+
+    @Test
+    void skipsUnknownFileNameInsideSnapshotDirectory() {
+        FileMeta file = file("/kv/db/t-1/0/snap-5/data.bloom", NOW - 2 * DAY_MS);
+
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    @Test
+    void skipsUnknownWhenParentIsNotSnapshotDirectory() {
+        FileMeta file = file("/kv/db/t-1/0/random/001.sst", NOW - 2 * DAY_MS);
+
+        assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    @Test
+    void recognizesExactPrefixAndSuffixBasedSnapshotFiles() {
+        String[] fileNames = {
+            "_METADATA", "MANIFEST-001", "OPTIONS-002", "CURRENT", "LOG", "IDENTITY", "001.log"
+        };
+        // Each name is expired and unreferenced, so a recognized name is expected to yield DELETE.
+        for (String fileName : fileNames) {
+            FileMeta file = file("/kv/db/t-1/0/snap-5/" + fileName, NOW - 2 * DAY_MS);
+            assertThat(rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                    .as("file=%s", fileName)
+                    .isEqualTo(Decision.DELETE);
+        }
+    }
+
+    @Test
+    void retainedNonLatestSnapshotIsActive() {
+        // Simulates kv.snapshot.num-retained=2, latest snapId=10, retained={9,10}: the active set
+        // is the full retained set (server emits RETAINED ∪ STILL_IN_USE), so a file under snap-9
+        // MUST be classified as KEEP_ACTIVE even if it's old enough to clear the cutoff. Cutoff is
+        // set to NOW (an aggressive value) to prove the active-set check short-circuits before the
+        // age check.
+        FileMeta file =
+                new FileMeta(new FsPath("oss://b/kv/db/t-7/0/snap-9/_METADATA"), 1024L, 200L);
+
+        Decision decision = rule.evaluate(file, kvActiveSnapDirs("snap-9", "snap-10"), NOW);
+
+        assertThat(decision).isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    private static BucketActiveRefs kvActiveSnapDirs(String... snapDirs) {
+        Set<String> activeDirs = new HashSet<>(Arrays.asList(snapDirs));
+        return new BucketActiveRefs(
+                Collections.<String>emptySet(), activeDirs, Collections.<String>emptySet());
+    }
+
+    private static FileMeta file(String path, long modificationTime) {
+        return new FileMeta(new FsPath(path), 1L, modificationTime); // size is fixed at 1 byte
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRuleTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRuleTest.java
new file mode 100644
index 0000000000..b8d166059a
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogManifestRuleTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.fs.FsPath;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Collections;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link LogManifestRule}. */
+class LogManifestRuleTest {
+
+    private static final long NOW = 1_700_000_000_000L;
+    private static final long DAY_MS = 24L * 60L * 60L * 1000L;
+    private static final long CUTOFF_MS = NOW - DAY_MS;
+
+    /** Default-conservative rule (allowDeleteManifest=false): never deletes manifests. */
+    private final LogManifestRule defaultRule = new LogManifestRule();
+
+    /** Opt-in rule (allowDeleteManifest=true): uses active-set + cutoff semantics. */
+    private final LogManifestRule optInRule = new LogManifestRule(true);
+
+    @Test
+    void deletesExpiredNonActiveManifest() {
+        // The refs name a different manifest and the candidate is older than the cutoff.
+        FileMeta stale = file("/log/db/t-1/0/metadata/old.manifest", NOW - 2 * DAY_MS);
+        BucketActiveRefs refs = activeManifest("/log/db/t-1/0/metadata/current.manifest");
+
+        Decision decision = optInRule.evaluate(stale, refs, CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.DELETE);
+    }
+
+    @Test
+    void keepsManifestListedInActiveManifestPaths() {
+        FileMeta active = file("/log/db/t-1/0/metadata/active.manifest", NOW - 2 * DAY_MS);
+        BucketActiveRefs refs = activeManifest("/log/db/t-1/0/metadata/active.manifest");
+
+        Decision decision = optInRule.evaluate(active, refs, CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    @Test
+    void defersManifestWhenMtimeAtOrAfterCutoff() {
+        // Half a day newer than the cutoff; the mtime == cutoff boundary is not exercised here.
+        FileMeta fresh = file("/log/db/t-1/0/metadata/fresh.manifest", NOW - DAY_MS / 2);
+
+        Decision decision = optInRule.evaluate(fresh, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.DEFER);
+    }
+
+    @Test
+    void skipsUnknownFileInMetadataDirectory() {
+        assertSkippedByBothRules(file("/log/db/t-1/0/metadata/readme.txt", NOW - 2 * DAY_MS));
+    }
+
+    @Test
+    void skipsManifestOutsideMetadataDirectory() {
+        String segmentDir = "/log/db/t-1/0/11111111-1111-1111-1111-111111111111";
+        FileMeta misplaced = file(segmentDir + "/file.manifest", NOW - 2 * DAY_MS);
+
+        assertSkippedByBothRules(misplaced);
+    }
+
+    @Test
+    void defaultRuleNeverDeletesEvenWhenStaleAndOrphan() {
+        // mtime=0L (very old) and the refs name a different manifest; the default-conservative
+        // rule is still expected to answer KEEP_ACTIVE rather than DELETE.
+        FileMeta orphan = file("/log/db/t-1/0/metadata/orphan.manifest", 0L);
+        BucketActiveRefs refs = activeManifest("/log/db/t-1/0/metadata/current.manifest");
+
+        Decision decision = defaultRule.evaluate(orphan, refs, CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    /** Asserts that both rule variants answer SKIP_UNKNOWN for {@code file} with empty refs. */
+    private void assertSkippedByBothRules(FileMeta file) {
+        assertThat(defaultRule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+        assertThat(optInRule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS))
+                .isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    /** Builds refs whose first two sets are empty and whose third holds only the given path. */
+    private static BucketActiveRefs activeManifest(String path) {
+        return new BucketActiveRefs(
+                Collections.emptySet(), Collections.emptySet(), Collections.singleton(path));
+    }
+
+    private static FileMeta file(String path, long modificationTime) {
+        return new FileMeta(new FsPath(path), 1L, modificationTime);
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRuleTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRuleTest.java
new file mode 100644
index 0000000000..bb8249e55d
--- /dev/null
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/action/orphan/rule/LogSegmentRuleTest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.fluss.flink.action.orphan.rule;
+
+import org.apache.fluss.fs.FsPath;
+
+import org.junit.jupiter.api.Test;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+/** Tests for {@link LogSegmentRule}. */
+class LogSegmentRuleTest {
+
+    private static final String SEGMENT_ID = "11111111-1111-1111-1111-111111111111";
+    private static final long NOW = 1_700_000_000_000L;
+    private static final long DAY_MS = 24L * 60L * 60L * 1000L;
+
+    /**
+     * Absolute cutoff = NOW - 1d. Files with mtime strictly less than this are deletion-eligible.
+     */
+    private static final long CUTOFF_MS = NOW - DAY_MS;
+
+    private final LogSegmentRule rule = new LogSegmentRule();
+
+    @Test
+    void deleteWhenKnownExpiredAndNotInBucketActiveRefs() {
+        FileMeta file =
+                file("/log/db/t-1/0/" + SEGMENT_ID + "/00000000000000000000.log", NOW - 2 * DAY_MS);
+
+        Decision decision = rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.DELETE);
+    }
+
+    @Test
+    void keepActiveWhenInBucketActiveRefs() {
+        FileMeta file =
+                file("/log/db/t-1/0/" + SEGMENT_ID + "/00000000000000000000.log", NOW - 2 * DAY_MS);
+        Set<String> liveFiles = new HashSet<>();
+        liveFiles.add(SEGMENT_ID + "/00000000000000000000.log");
+        BucketActiveRefs activeRefs =
+                new BucketActiveRefs(
+                        liveFiles, Collections.<String>emptySet(), Collections.<String>emptySet());
+
+        Decision decision = rule.evaluate(file, activeRefs, CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.KEEP_ACTIVE);
+    }
+
+    @Test
+    void deferWhenMtimeAtOrAfterCutoff() {
+        FileMeta file =
+                file("/log/db/t-1/0/" + SEGMENT_ID + "/00000000000000000000.log", NOW - DAY_MS / 2);
+        // Half a day newer than the cutoff; the mtime == cutoff boundary is not exercised here.
+        Decision decision = rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.DEFER);
+    }
+
+    @Test
+    void skipUnknownExtension() {
+        FileMeta file =
+                file(
+                        "/log/db/t-1/0/" + SEGMENT_ID + "/00000000000000000000.bloom",
+                        NOW - 2 * DAY_MS);
+
+        Decision decision = rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    @Test
+    void skipUnknownWhenParentIsNotSegmentUuid() {
+        FileMeta file = file("/log/db/t-1/0/not-a-uuid/00000000000000000000.log", NOW - 2 * DAY_MS);
+
+        Decision decision = rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.SKIP_UNKNOWN);
+    }
+
+    @Test
+    void deletedSuffixIsRecognizedAsKnownType() {
+        FileMeta file =
+                file(
+                        "/log/db/t-1/0/" + SEGMENT_ID + "/00000000000000000000.log.deleted",
+                        NOW - 2 * DAY_MS);
+
+        Decision decision = rule.evaluate(file, BucketActiveRefs.empty(), CUTOFF_MS);
+
+        assertThat(decision).isEqualTo(Decision.DELETE);
+    }
+
+    private static FileMeta file(String path, long modificationTime) {
+        return new FileMeta(new FsPath(path), 100L, modificationTime); // size is fixed at 100 bytes
+    }
+}
diff --git a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/adapter/FlinkMultipleParameterToolTest.java b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/adapter/FlinkMultipleParameterToolTest.java
index 2cbf282352..321bc177c0 100644
--- a/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/adapter/FlinkMultipleParameterToolTest.java
+++ b/fluss-flink/fluss-flink-common/src/test/java/org/apache/fluss/flink/adapter/FlinkMultipleParameterToolTest.java
@@ -37,4 +37,33 @@ public void testToMap() {
assertThat(adapter.toMap()).containsEntry("multi1", "multiValue3");
assertThat(adapter.toMap()).containsEntry("multi2", "multiValue2");
}
+
+    @Test
+    public void testHas() {
+        String[] argv = {"--key1", "value1", "--key2", "value2"};
+        MultipleParameterToolAdapter tool = MultipleParameterToolAdapter.fromArgs(argv);
+        // Keys supplied in the argument array are reported as present; any other key is not.
+        assertThat(tool.has("key1")).isTrue();
+        assertThat(tool.has("key2")).isTrue();
+        assertThat(tool.has("nonexistent")).isFalse();
+    }
+
+    @Test
+    public void testGet() {
+        String[] argv = {"--key1", "value1", "--key2", "value2"};
+        MultipleParameterToolAdapter tool = MultipleParameterToolAdapter.fromArgs(argv);
+        // A supplied key yields its value; a key that was never supplied yields null.
+        assertThat(tool.get("key1")).isEqualTo("value1");
+        assertThat(tool.get("key2")).isEqualTo("value2");
+        assertThat(tool.get("nonexistent")).isNull();
+    }
+
+    @Test
+    public void testGetMultiParameter() {
+        String[] argv = {"--multi", "val1", "--multi", "val2", "--single", "only"};
+        MultipleParameterToolAdapter tool = MultipleParameterToolAdapter.fromArgs(argv);
+        // A repeated key keeps every value in argument order; a single key yields one element.
+        assertThat(tool.getMultiParameter("multi")).containsExactly("val1", "val2");
+        assertThat(tool.getMultiParameter("single")).containsExactly("only");
+    }
}
diff --git a/fluss-flink/pom.xml b/fluss-flink/pom.xml
index 4f65374352..a0fae789b9 100644
--- a/fluss-flink/pom.xml
+++ b/fluss-flink/pom.xml
@@ -76,4 +76,4 @@
-
\ No newline at end of file
+
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorage.java b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorage.java
index 33f29f2830..56e8e24091 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorage.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorage.java
@@ -25,6 +25,7 @@
import org.apache.fluss.fs.FsPath;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.utils.CloseableRegistry;
import org.apache.fluss.utils.ExceptionUtils;
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/LogTieringTask.java b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/LogTieringTask.java
index 8c7d0d8832..c9cb32215e 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/LogTieringTask.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/LogTieringTask.java
@@ -22,6 +22,7 @@
import org.apache.fluss.fs.FsPath;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.rpc.gateway.CoordinatorGateway;
import org.apache.fluss.rpc.messages.CommitRemoteLogManifestRequest;
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManager.java b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManager.java
index 143ae251a7..f57a6ea81c 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManager.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogManager.java
@@ -25,6 +25,7 @@
import org.apache.fluss.fs.FsPath;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.rpc.gateway.CoordinatorGateway;
import org.apache.fluss.server.log.LogManager;
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogStorage.java b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogStorage.java
index 1a410fcb2c..6e1de16cf3 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogStorage.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogStorage.java
@@ -22,6 +22,7 @@
import org.apache.fluss.fs.FsPath;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import java.io.Closeable;
diff --git a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogTablet.java b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogTablet.java
index c840a0a028..9f0ae6e949 100644
--- a/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogTablet.java
+++ b/fluss-server/src/main/java/org/apache/fluss/server/log/remote/RemoteLogTablet.java
@@ -22,6 +22,7 @@
import org.apache.fluss.metadata.TableBucket;
import org.apache.fluss.metrics.MetricNames;
import org.apache.fluss.metrics.groups.MetricGroup;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.server.metrics.group.BucketMetricGroup;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java
index a99370908b..425b5463e1 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/coordinator/rebalance/RebalanceManagerITCase.java
@@ -28,6 +28,7 @@
import org.apache.fluss.metadata.TableBucketReplica;
import org.apache.fluss.metadata.TableDescriptor;
import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.rpc.gateway.TabletServerGateway;
import org.apache.fluss.rpc.messages.AddServerTagRequest;
import org.apache.fluss.server.coordinator.CoordinatorEventProcessor;
@@ -35,7 +36,6 @@
import org.apache.fluss.server.coordinator.rebalance.model.ClusterModel;
import org.apache.fluss.server.coordinator.statemachine.ReplicaState;
import org.apache.fluss.server.log.remote.RemoteLogManager;
-import org.apache.fluss.server.log.remote.RemoteLogManifest;
import org.apache.fluss.server.log.remote.RemoteLogTablet;
import org.apache.fluss.server.replica.ReplicaManager;
import org.apache.fluss.server.tablet.TabletServer;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorageTest.java b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorageTest.java
index a450295a7f..a8ec544510 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorageTest.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/DefaultRemoteLogStorageTest.java
@@ -21,6 +21,7 @@
import org.apache.fluss.fs.FsPath;
import org.apache.fluss.metadata.PhysicalTablePath;
import org.apache.fluss.metadata.TableBucket;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.server.log.LogTablet;
import org.apache.fluss.server.log.remote.RemoteLogStorage.IndexType;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogITCase.java b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogITCase.java
index 90dfa0914e..872682ad26 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogITCase.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/RemoteLogITCase.java
@@ -27,6 +27,7 @@
import org.apache.fluss.metadata.TableBucket;
import org.apache.fluss.metadata.TableDescriptor;
import org.apache.fluss.metadata.TablePath;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import org.apache.fluss.rpc.entity.FetchLogResultForBucket;
import org.apache.fluss.rpc.gateway.CoordinatorGateway;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/TestingRemoteLogStorage.java b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/TestingRemoteLogStorage.java
index a946e9dd0c..eeba26a54c 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/log/remote/TestingRemoteLogStorage.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/log/remote/TestingRemoteLogStorage.java
@@ -20,6 +20,7 @@
import org.apache.fluss.config.Configuration;
import org.apache.fluss.exception.RemoteStorageException;
import org.apache.fluss.fs.FsPath;
+import org.apache.fluss.remote.RemoteLogManifest;
import org.apache.fluss.remote.RemoteLogSegment;
import java.io.IOException;
diff --git a/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java b/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
index ddf52b8bab..aeb8a848df 100644
--- a/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
+++ b/fluss-server/src/test/java/org/apache/fluss/server/testutils/FlussClusterExtension.java
@@ -772,33 +772,28 @@ public CompletedSnapshot triggerAndWaitSnapshot(TableBucket tableBucket) {
}
private Long triggerSnapshot(TableBucket tableBucket) {
- Long snapshotId = null;
- Long nextSnapshotId = null;
for (TabletServer ts : tabletServers.values()) {
ReplicaManager.HostedReplica replica = ts.getReplicaManager().getReplica(tableBucket);
if (replica instanceof ReplicaManager.OnlineReplica) {
Replica r = ((ReplicaManager.OnlineReplica) replica).getReplica();
PeriodicSnapshotManager kvSnapshotManager = r.getKvSnapshotManager();
if (r.isLeader() && kvSnapshotManager != null) {
- snapshotId = kvSnapshotManager.currentSnapshotId();
+ long snapshotId = kvSnapshotManager.currentSnapshotId();
+ // KvTablet#getGuardedExecutor runs the submitted task synchronously
+ // on the calling thread inside the kv write lock, so initSnapshot()
+ // has already completed by the time triggerSnapshot() returns. The
+ // counter is either bumped (a new snapshot was scheduled) or left
+ // unchanged (no new data since the last snapshot — legitimate no-op).
kvSnapshotManager.triggerSnapshot();
- nextSnapshotId = kvSnapshotManager.currentSnapshotId();
- break;
+ if (kvSnapshotManager.currentSnapshotId() > snapshotId) {
+ return snapshotId;
+ }
+ return null;
}
}
}
-
- if (snapshotId != null) {
- if (nextSnapshotId > snapshotId) {
- // only there is a new snapshot triggered, we return the snapshot id
- return snapshotId;
- } else {
- return null;
- }
- } else {
- fail("No KV snapshot manager found for table bucket " + tableBucket);
- return null;
- }
+ fail("No KV snapshot manager found for table bucket " + tableBucket);
+ return null;
}
public CompletedSnapshot waitUntilSnapshotFinished(TableBucket tableBucket, long snapshotId) {
diff --git a/fluss-test-coverage/pom.xml b/fluss-test-coverage/pom.xml
index d5e91d302d..357b3e98a2 100644
--- a/fluss-test-coverage/pom.xml
+++ b/fluss-test-coverage/pom.xml
@@ -502,6 +502,13 @@
org.apache.fluss.flink.functions.bitmap.RbAndAggFunction.AccumulatorSerializer.AccumulatorSerializerSnapshot
+
+ org.apache.fluss.flink.action.Action
+ org.apache.fluss.flink.action.ActionFactory
+ org.apache.fluss.flink.action.ActionLoader
+ org.apache.fluss.flink.action.FlussActionEntrypoint
+ org.apache.fluss.flink.action.orphan.*
+
org.apache.flink.table.catalog.*