From 2179db0498511b2a8a5b5e96863cc7e522613180 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 22 May 2026 09:51:22 +0800 Subject: [PATCH 01/20] [globalindex] Support multi-column GlobalIndex framework Extend the GlobalIndex SPI, build path, and query path to support one index builder handling multiple columns (e.g. Lucene indexing title + content + tags together). Key changes: - GlobalIndexerFactory/GlobalIndexer: add List create overloads - GlobalIndexMultiColumnWriter: new interface for multi-column writes - GlobalIndexBuilderUtils: toIndexFileMetas/createIndexWriter accept List - GlobalIndexScanner: route extraFieldIds to same reader group - VectorScanImpl/FullTextScanImpl: match against extraFieldIds - GenericIndexTopoBuilder (Flink): multi-column projection and writer dispatch - DefaultGlobalIndexBuilder/TopoBuilder (Spark): multi-column support - All single-column APIs preserved for backward compatibility --- .../GlobalIndexMultiColumnWriter.java | 34 +++++ .../paimon/globalindex/GlobalIndexer.java | 5 + .../globalindex/GlobalIndexerFactory.java | 6 + .../globalindex/GlobalIndexBuilderUtils.java | 53 ++++++- .../globalindex/GlobalIndexScanner.java | 82 +++++++++-- .../paimon/table/source/FullTextScanImpl.java | 12 +- .../paimon/table/source/VectorScanImpl.java | 30 +++- .../globalindex/GenericIndexTopoBuilder.java | 135 +++++++++++++----- .../GenericIndexTopoBuilderTest.java | 2 +- .../DefaultGlobalIndexBuilder.java | 64 ++++++--- .../DefaultGlobalIndexTopoBuilder.java | 24 +++- .../GlobalIndexTopologyBuilder.java | 21 +++ 12 files changed, 393 insertions(+), 75 deletions(-) create mode 100644 paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java new file mode 100644 index 000000000000..a6ded78d33fd --- /dev/null +++ 
b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +import org.apache.paimon.data.InternalRow; + +import javax.annotation.Nullable; + +/** Index writer for global index that accepts multiple column values per row. */ +public interface GlobalIndexMultiColumnWriter extends GlobalIndexWriter { + + /** + * Write a projected row containing all indexed columns for one record. The row layout matches + * the fields order passed to {@link GlobalIndexerFactory#create(java.util.List, + * org.apache.paimon.options.Options)}. 
+ */ + void write(@Nullable InternalRow row); +} diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java index 74d223a60467..6c46415cfeee 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexer.java @@ -41,4 +41,9 @@ static GlobalIndexer create(String type, DataField dataField, Options options) { GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); return globalIndexerFactory.create(dataField, options); } + + static GlobalIndexer create(String type, List fields, Options options) { + GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(type); + return globalIndexerFactory.create(fields, options); + } } diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index 6eabb6d25360..e2497a6f82e3 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -22,10 +22,16 @@ import org.apache.paimon.options.Options; import org.apache.paimon.types.DataField; +import java.util.List; + /** File index factory to construct {@link FileIndexer}. 
*/ public interface GlobalIndexerFactory { String identifier(); GlobalIndexer create(DataField dataField, Options options); + + default GlobalIndexer create(List fields, Options options) { + return create(fields.get(0), options); + } } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 085423efa851..34badf3ed566 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -29,6 +29,8 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; +import javax.annotation.Nullable; + import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -45,12 +47,54 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { + return toIndexFileMetas( + fileIO, indexPathFactory, options, range, indexFieldId, null, indexType, entries); + } + + public static List toIndexFileMetas( + FileIO fileIO, + IndexPathFactory indexPathFactory, + CoreOptions options, + Range range, + List fields, + String indexType, + List entries) + throws IOException { + int indexFieldId = fields.get(0).id(); + int[] extraFieldIds = + fields.size() > 1 + ? 
fields.subList(1, fields.size()).stream() + .mapToInt(DataField::id) + .toArray() + : null; + return toIndexFileMetas( + fileIO, + indexPathFactory, + options, + range, + indexFieldId, + extraFieldIds, + indexType, + entries); + } + + private static List toIndexFileMetas( + FileIO fileIO, + IndexPathFactory indexPathFactory, + CoreOptions options, + Range range, + int indexFieldId, + @Nullable int[] extraFieldIds, + String indexType, + List entries) + throws IOException { List results = new ArrayList<>(); for (ResultEntry entry : entries) { String fileName = entry.fileName(); long fileSize = fileIO.getFileSize(indexPathFactory.toPath(fileName)); GlobalIndexMeta globalIndexMeta = - new GlobalIndexMeta(range.from, range.to, indexFieldId, null, entry.meta()); + new GlobalIndexMeta( + range.from, range.to, indexFieldId, extraFieldIds, entry.meta()); Path externalPathDir = options.globalIndexExternalPath(); String externalPathString = null; @@ -78,6 +122,13 @@ public static GlobalIndexWriter createIndexWriter( return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } + public static GlobalIndexWriter createIndexWriter( + FileStoreTable table, String indexType, List fields, Options options) + throws IOException { + GlobalIndexer globalIndexer = GlobalIndexer.create(indexType, fields, options); + return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); + } + private static GlobalIndexFileReadWrite createGlobalIndexFileReadWrite(FileStoreTable table) { IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); return new GlobalIndexFileReadWrite(table.fileIO(), indexPathFactory); diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 975b28183331..04e16eed2d87 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ 
b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -20,6 +20,7 @@ import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -37,6 +38,7 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -74,26 +76,66 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); + + // Single-column indexes: fieldId -> indexType -> range -> files Map>>> indexMetas = new HashMap<>(); + // Multi-column indexes: fieldIds -> indexType -> range -> files + Map, Map>>> multiColumnMetas = + new HashMap<>(); + // Reverse lookup: fieldId -> its multi-column group + Map> fieldToGroup = new HashMap<>(); + for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - int fieldId = meta.indexFieldId(); String indexType = indexFile.indexType(); - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent( - new Range(meta.rowRangeStart(), meta.rowRangeEnd()), - k -> new ArrayList<>()) - .add(indexFile); + Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); + + if (meta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID + && meta.extraFieldIds() != null) { + // Multi-column index: all participating fields share the same IndexFileMeta, + // so looking up from any fieldId returns identical index files. 
+ List fieldIds = + Arrays.stream(meta.extraFieldIds()) + .boxed() + .collect(Collectors.toList()); + multiColumnMetas + .computeIfAbsent(fieldIds, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + for (int id : fieldIds) { + fieldToGroup.put(id, fieldIds); + } + } else { + // Single-column index + int fieldId = meta.indexFieldId(); + indexMetas + .computeIfAbsent(fieldId, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + } } IntFunction> readersFunction = - fieldId -> - createReaders( + fId -> { + List group = fieldToGroup.get(fId); + if (group != null) { + // Multi-column: resolve full field list + List fields = + group.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + return createReaders( + indexFileReader, multiColumnMetas.get(group), fields); + } else { + // Single-column + return createReaders( indexFileReader, - indexMetas.get(fieldId), - rowType.getField(fieldId)); + indexMetas.get(fId), + Collections.singletonList(rowType.getField(fId))); + } + }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } @@ -127,7 +169,17 @@ public static Optional create( if (globalIndex == null) { return false; } - return filterFieldIds.contains(globalIndex.indexFieldId()); + if (filterFieldIds.contains(globalIndex.indexFieldId())) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (filterFieldIds.contains(id)) { + return true; + } + } + } + return false; }; List indexFiles = @@ -145,7 +197,7 @@ public Optional scan(Predicate predicate) { private Collection createReaders( GlobalIndexFileReader indexFileReadWrite, Map>> indexMetas, - DataField dataField) { + List fields) { if (indexMetas == null) { return Collections.emptyList(); } @@ -155,7 +207,7 @@ private Collection 
createReaders( String indexType = entry.getKey(); Map> metas = entry.getValue(); GlobalIndexerFactory globalIndexerFactory = GlobalIndexerFactoryUtils.load(indexType); - GlobalIndexer globalIndexer = globalIndexerFactory.create(dataField, options); + GlobalIndexer globalIndexer = globalIndexerFactory.create(fields, options); List> futures = new ArrayList<>(metas.size()); for (Map.Entry> rangeMetas : metas.entrySet()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java index cc77d9121ad5..6230b31336a3 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java @@ -61,7 +61,17 @@ public Plan scan() { if (globalIndex == null) { return false; } - return textColumn.id() == globalIndex.indexFieldId(); + if (textColumn.id() == globalIndex.indexFieldId()) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (textColumn.id() == id) { + return true; + } + } + } + return false; }; List allIndexFiles = diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index d3db6dd13d37..1ff3f82852f6 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -82,7 +82,17 @@ public Plan scan() { return false; } int fieldId = globalIndex.indexFieldId(); - return vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId); + if (vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId)) { + return true; + } + if (globalIndex.extraFieldIds() != null) { + for (int id : globalIndex.extraFieldIds()) { + if (vectorColumn.id() == id || 
filterFieldIds.contains(id)) { + return true; + } + } + } + return false; }; List allIndexFiles = @@ -94,7 +104,7 @@ public Plan scan() { Map> vectorByRange = new HashMap<>(); for (IndexFileMeta indexFile : allIndexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - if (meta.indexFieldId() == vectorColumn.id()) { + if (containsField(meta, vectorColumn.id())) { Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); vectorByRange.computeIfAbsent(range, k -> new ArrayList<>()).add(indexFile); } @@ -111,7 +121,7 @@ public Plan scan() { f -> { GlobalIndexMeta globalIndex = checkNotNull(f.globalIndexMeta()); - if (globalIndex.indexFieldId() == vectorColumn.id()) { + if (containsField(globalIndex, vectorColumn.id())) { return false; } return range.hasIntersection(globalIndex.rowRange()); @@ -122,4 +132,18 @@ public Plan scan() { return () -> splits; } + + private static boolean containsField(GlobalIndexMeta meta, int fieldId) { + if (meta.indexFieldId() == fieldId) { + return true; + } + if (meta.extraFieldIds() != null) { + for (int id : meta.extraFieldIds()) { + if (id == fieldId) { + return true; + } + } + } + return false; + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 5896503ce09d..5a0d852b12a7 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -29,7 +29,9 @@ import org.apache.paimon.flink.utils.BoundedOneInputOperator; import org.apache.paimon.flink.utils.JavaTypeInfo; import org.apache.paimon.flink.utils.StreamExecutionEnvironmentUtils; +import org.apache.paimon.globalindex.GlobalIndexMultiColumnWriter; import 
org.apache.paimon.globalindex.GlobalIndexSingletonWriter; +import org.apache.paimon.globalindex.GlobalIndexWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.io.DataFileMeta; @@ -103,7 +105,7 @@ public static void buildIndexAndExecute( buildIndexAndExecute( env, table, - indexColumn, + Collections.singletonList(indexColumn), indexType, partitionPredicate, userOptions, @@ -119,12 +121,31 @@ public static void buildIndexAndExecute( Options userOptions, long maxIndexedRowId) throws Exception { + buildIndexAndExecute( + env, + table, + Collections.singletonList(indexColumn), + indexType, + partitionPredicate, + userOptions, + maxIndexedRowId); + } + + public static void buildIndexAndExecute( + StreamExecutionEnvironment env, + FileStoreTable table, + List indexColumns, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions, + long maxIndexedRowId) + throws Exception { boolean hasIndexToBuild = buildIndex( env, () -> new GenericGlobalIndexBuilder(table), table, - indexColumn, + indexColumns, indexType, partitionPredicate, userOptions, @@ -149,13 +170,34 @@ public static boolean buildIndex( env, indexBuilderSupplier, table, - indexColumn, + Collections.singletonList(indexColumn), indexType, partitionPredicate, userOptions, NO_MAX_INDEXED_ROW_ID); } + public static boolean buildIndex( + StreamExecutionEnvironment env, + Supplier indexBuilderSupplier, + FileStoreTable table, + String indexColumn, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions, + long maxIndexedRowId) + throws Exception { + return buildIndex( + env, + indexBuilderSupplier, + table, + Collections.singletonList(indexColumn), + indexType, + partitionPredicate, + userOptions, + maxIndexedRowId); + } + /** * Builds a generic global index topology using a {@link GenericGlobalIndexBuilder} supplier. 
* @@ -166,7 +208,7 @@ public static boolean buildIndex( StreamExecutionEnvironment env, Supplier indexBuilderSupplier, FileStoreTable table, - String indexColumn, + List indexColumns, String indexType, PartitionPredicate partitionPredicate, Options userOptions, @@ -183,7 +225,7 @@ public static boolean buildIndex( return buildTopology( env, table, - indexColumn, + indexColumns, indexType, userOptions, entries, @@ -203,7 +245,7 @@ public static boolean buildIndex( private static boolean buildTopology( StreamExecutionEnvironment env, FileStoreTable table, - String indexColumn, + List indexColumns, String indexType, Options userOptions, List entries, @@ -212,24 +254,24 @@ private static boolean buildTopology( throws Exception { long totalRowCount = entries.stream().mapToLong(e -> e.file().rowCount()).sum(); LOG.info( - "Scanned {} files ({} rows) across {} partitions for {} index on column '{}'" + "Scanned {} files ({} rows) across {} partitions for {} index on columns '{}'" + (maxIndexedRowId >= 0 ? ", maxIndexedRowId={}." 
: "."), entries.size(), totalRowCount, entries.stream().map(ManifestEntry::partition).distinct().count(), indexType, - indexColumn, + indexColumns, maxIndexedRowId); long minNonIndexableRowId = - findMinNonIndexableRowId(table.schemaManager(), entries, indexColumn); + findMinNonIndexableRowId(table.schemaManager(), entries, indexColumns); entries = filterEntriesBefore(entries, minNonIndexableRowId); RowType rowType = table.rowType(); - DataField indexField = rowType.getField(indexColumn); - // Project indexColumn + _ROW_ID so we can read the actual row ID from data - List readColumns = new ArrayList<>(); - readColumns.add(indexColumn); + List indexFields = + indexColumns.stream().map(rowType::getField).collect(Collectors.toList()); + // Project indexColumns + _ROW_ID so we can read the actual row ID from data + List readColumns = new ArrayList<>(indexColumns); readColumns.add(SpecialFields.ROW_ID.name()); RowType projectedRowType = SpecialFields.rowTypeWithRowId(rowType).project(readColumns); @@ -277,7 +319,7 @@ private static boolean buildTopology( readBuilder, table, indexType, - indexField, + indexFields, projectedRowType, mergedOptions)) .setParallelism(parallelism); @@ -299,20 +341,22 @@ private static boolean buildTopology( } /** - * Find the minimum firstRowId among files whose schema does not contain the index column. Files - * at or beyond this rowId cannot be indexed because the column was added later via ALTER TABLE. + * Find the minimum firstRowId among files whose schema does not contain all index columns. + * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER + * TABLE. 
* - * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the column + * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns */ static long findMinNonIndexableRowId( - SchemaManager schemaManager, List entries, String indexColumn) { - Map schemaContainsColumn = new HashMap<>(); + SchemaManager schemaManager, List entries, List indexColumns) { + Map schemaContainsColumns = new HashMap<>(); long minRowId = Long.MAX_VALUE; for (ManifestEntry entry : entries) { long sid = entry.file().schemaId(); boolean contains = - schemaContainsColumn.computeIfAbsent( - sid, id -> schemaManager.schema(id).fieldNames().contains(indexColumn)); + schemaContainsColumns.computeIfAbsent( + sid, + id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); if (!contains && entry.file().firstRowId() != null) { minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); } @@ -548,25 +592,26 @@ private static class BuildIndexOperator private final ReadBuilder readBuilder; private final FileStoreTable table; private final String indexType; - private final DataField indexField; + private final List indexFields; private final RowType projectedRowType; private final Options mergedOptions; private transient TableRead tableRead; - private transient InternalRow.FieldGetter indexFieldGetter; + private transient InternalRow.FieldGetter[] indexFieldGetters; private transient int rowIdFieldIndex; + private transient boolean multiColumn; BuildIndexOperator( ReadBuilder readBuilder, FileStoreTable table, String indexType, - DataField indexField, + List indexFields, RowType projectedRowType, Options mergedOptions) { this.readBuilder = readBuilder; this.table = table; this.indexType = indexType; - this.indexField = indexField; + this.indexFields = indexFields; this.projectedRowType = projectedRowType; this.mergedOptions = mergedOptions; } @@ -575,10 +620,15 @@ private static class BuildIndexOperator public void open() throws 
Exception { super.open(); this.tableRead = readBuilder.newRead(); - this.indexFieldGetter = - InternalRow.createFieldGetter( - indexField.type(), projectedRowType.getFieldIndex(indexField.name())); + this.indexFieldGetters = new InternalRow.FieldGetter[indexFields.size()]; + for (int i = 0; i < indexFields.size(); i++) { + DataField field = indexFields.get(i); + indexFieldGetters[i] = + InternalRow.createFieldGetter( + field.type(), projectedRowType.getFieldIndex(field.name())); + } this.rowIdFieldIndex = projectedRowType.getFieldIndex(SpecialFields.ROW_ID.name()); + this.multiColumn = indexFields.size() > 1; } @Override @@ -595,9 +645,8 @@ public void processElement(StreamRecord element) throws Exception { task.split.dataFiles().size()); long startTime = System.currentTimeMillis(); - GlobalIndexSingletonWriter indexWriter = - (GlobalIndexSingletonWriter) - createIndexWriter(table, indexType, indexField, mergedOptions); + GlobalIndexWriter indexWriter = + createIndexWriter(table, indexType, indexFields, mergedOptions); try { long rowsSeen = 0; @@ -626,8 +675,20 @@ public void processElement(StreamRecord element) throws Exception { } // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { - Object fieldData = indexFieldGetter.getFieldOrNull(row); - indexWriter.write(fieldData); + if (multiColumn) { + ((GlobalIndexMultiColumnWriter) indexWriter).write(row); + } else { + Object fieldData = indexFieldGetters[0].getFieldOrNull(row); + if (fieldData == null) { + LOG.info( + "Null value at rowId={}, stopping shard [{}, {}].", + currentRowId, + task.shardRange.from, + task.shardRange.to); + break; + } + ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); + } rowsSeen++; } } @@ -664,7 +725,7 @@ public void processElement(StreamRecord element) throws Exception { table, partition, task.shardRange, - indexField, + indexFields, indexType, resultEntries); output.collect( @@ -688,7 +749,7 @@ private static CommitMessage flushIndex( 
FileStoreTable table, BinaryRow partition, Range rowRange, - DataField indexField, + List indexFields, String indexType, List resultEntries) throws IOException { @@ -698,14 +759,14 @@ private static CommitMessage flushIndex( table.store().pathFactory().globalIndexFileFactory(), table.coreOptions(), rowRange, - indexField.id(), + indexFields, indexType, resultEntries); return new CommitMessageImpl( partition, 0, null, indexIncrement(indexFileMetas), emptyIncrement()); } - private static void closeWriterQuietly(GlobalIndexSingletonWriter writer) { + private static void closeWriterQuietly(GlobalIndexWriter writer) { if (writer instanceof Closeable) { try { ((Closeable) writer).close(); diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java index 0de57077b295..fb1bd02f4408 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java @@ -475,7 +475,7 @@ void testAppendFilterOldFilesBeforeNewFiles() { GenericIndexTopoBuilder.filterEntriesBefore( entries, GenericIndexTopoBuilder.findMinNonIndexableRowId( - schemaManager, entries, "vec")); + schemaManager, entries, Collections.singletonList("vec"))); assertThat(result).hasSize(2); assertThat(result.get(0).file().nonNullFirstRowId()).isEqualTo(0L); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 1485d14fac1c..041ee9bf41b6 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ 
b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -20,7 +20,9 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; +import org.apache.paimon.globalindex.GlobalIndexMultiColumnWriter; import org.apache.paimon.globalindex.GlobalIndexSingletonWriter; +import org.apache.paimon.globalindex.GlobalIndexWriter; import org.apache.paimon.globalindex.ResultEntry; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.io.CompactIncrement; @@ -37,6 +39,7 @@ import java.io.IOException; import java.io.Serializable; +import java.util.Collections; import java.util.List; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.createIndexWriter; @@ -50,7 +53,7 @@ public class DefaultGlobalIndexBuilder implements Serializable { private final FileStoreTable table; private final BinaryRow partition; private final RowType readType; - private final DataField indexField; + private final List indexFields; private final String indexType; private final Range rowRange; private final Options options; @@ -63,10 +66,28 @@ public DefaultGlobalIndexBuilder( String indexType, Range rowRange, Options options) { + this( + table, + partition, + readType, + Collections.singletonList(indexField), + indexType, + rowRange, + options); + } + + public DefaultGlobalIndexBuilder( + FileStoreTable table, + BinaryRow partition, + RowType readType, + List indexFields, + String indexType, + Range rowRange, + Options options) { this.table = table; this.partition = partition; this.readType = readType; - this.indexField = indexField; + this.indexFields = indexFields; this.indexType = indexType; this.rowRange = rowRange; this.options = options; @@ -89,7 +110,7 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti table.store().pathFactory().globalIndexFileFactory(), table.coreOptions(), rowRange, - indexField.id(), + indexFields, indexType, resultEntries); DataIncrement 
dataIncrement = DataIncrement.indexIncrement(indexFileMetas); @@ -99,27 +120,38 @@ public CommitMessage build(CloseableIterator data) throws IOExcepti private List writePaimonRows( CloseableIterator rows, LongCounter rowCounter) throws IOException { - GlobalIndexSingletonWriter indexWriter = - (GlobalIndexSingletonWriter) - createIndexWriter(table, indexType, indexField, options); + GlobalIndexWriter indexWriter = createIndexWriter(table, indexType, indexFields, options); + boolean multiColumn = indexFields.size() > 1; try { - InternalRow.FieldGetter getter = - InternalRow.createFieldGetter( - indexField.type(), readType.getFieldIndex(indexField.name())); - rows.forEachRemaining( - row -> { - Object indexO = getter.getFieldOrNull(row); - indexWriter.write(indexO); - rowCounter.add(1); - }); + if (multiColumn) { + GlobalIndexMultiColumnWriter multiWriter = + (GlobalIndexMultiColumnWriter) indexWriter; + rows.forEachRemaining( + row -> { + multiWriter.write(row); + rowCounter.add(1); + }); + } else { + DataField indexField = indexFields.get(0); + GlobalIndexSingletonWriter singleWriter = (GlobalIndexSingletonWriter) indexWriter; + InternalRow.FieldGetter getter = + InternalRow.createFieldGetter( + indexField.type(), readType.getFieldIndex(indexField.name())); + rows.forEachRemaining( + row -> { + Object indexO = getter.getFieldOrNull(row); + singleWriter.write(indexO); + rowCounter.add(1); + }); + } return indexWriter.finish(); } finally { closeWriterQuietly(indexWriter); } } - private static void closeWriterQuietly(GlobalIndexSingletonWriter writer) { + private static void closeWriterQuietly(GlobalIndexWriter writer) { if (writer instanceof java.io.Closeable) { try { ((java.io.Closeable) writer).close(); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java index 
afd954c39a5d..437ad11737dc 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java @@ -77,6 +77,28 @@ public List buildIndex( DataField indexField, Options options) throws IOException { + return buildIndex( + spark, + relation, + partitionPredicate, + table, + indexType, + readType, + Collections.singletonList(indexField), + options); + } + + @Override + public List buildIndex( + SparkSession spark, + DataSourceV2Relation relation, + PartitionPredicate partitionPredicate, + FileStoreTable table, + String indexType, + RowType readType, + List indexFields, + Options options) + throws IOException { Options tableOptions = table.coreOptions().toConfiguration(); long rowsPerShard = tableOptions @@ -106,7 +128,7 @@ public List buildIndex( table, partition, readType, - indexField, + indexFields, indexType, indexedSplit.rowRanges().get(0), options); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java index 50c6ab34e153..aea421800410 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java @@ -46,4 +46,25 @@ List buildIndex( DataField indexField, Options options) throws IOException; + + default List buildIndex( + SparkSession spark, + DataSourceV2Relation relation, + PartitionPredicate partitionPredicate, + FileStoreTable table, + String indexType, + RowType readType, + List indexFields, + Options options) + throws IOException { + return buildIndex( + spark, + relation, + partitionPredicate, + table, + 
indexType, + readType, + indexFields.get(0), + options); + } } From 9091d672a10a3b5f13d0bec9bc99550982b0875f Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 25 May 2026 11:37:17 +0800 Subject: [PATCH 02/20] [globalindex] Support multi-column in CreateGlobalIndexProcedure Allow index_column parameter to accept comma-separated column names (e.g. "title,embedding") for both Flink and Spark procedures. Add List overload for GenericIndexTopoBuilder.buildIndexAndExecute. --- .../globalindex/GenericIndexTopoBuilder.java | 18 +++++++++++ .../procedure/CreateGlobalIndexProcedure.java | 27 ++++++++++------ .../procedure/CreateGlobalIndexProcedure.java | 32 ++++++++++++------- 3 files changed, 57 insertions(+), 20 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 5a0d852b12a7..df8e92f8d0cd 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -131,6 +131,24 @@ public static void buildIndexAndExecute( maxIndexedRowId); } + public static void buildIndexAndExecute( + StreamExecutionEnvironment env, + FileStoreTable table, + List indexColumns, + String indexType, + PartitionPredicate partitionPredicate, + Options userOptions) + throws Exception { + buildIndexAndExecute( + env, + table, + indexColumns, + indexType, + partitionPredicate, + userOptions, + NO_MAX_INDEXED_ROW_ID); + } + public static void buildIndexAndExecute( StreamExecutionEnvironment env, FileStoreTable table, diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java 
b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index 5f4855567047..e77a2f598df8 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -32,8 +32,10 @@ import org.apache.flink.table.annotation.ProcedureHint; import org.apache.flink.table.procedure.ProcedureContext; +import java.util.Arrays; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import static org.apache.paimon.utils.ParameterUtils.getPartitions; import static org.apache.paimon.utils.Preconditions.checkArgument; @@ -77,11 +79,18 @@ public String[] call( tableId); RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(indexColumn), - "Column '%s' does not exist in table '%s'.", - indexColumn, - tableId); + List indexColumns = + Arrays.stream(indexColumn.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableId); + } // Parse partition predicate PartitionPredicate partitionPredicate = parsePartitionPredicate(table, partitions); @@ -97,7 +106,7 @@ public String[] call( BTreeIndexTopoBuilder.buildIndexAndExecute( procedureContext.getExecutionEnvironment(), table, - indexColumn, + indexColumns.get(0), partitionPredicate, userOptions); return new String[] { @@ -107,7 +116,7 @@ public String[] call( GenericIndexTopoBuilder.buildIndexAndExecute( procedureContext.getExecutionEnvironment(), table, - indexColumn, + indexColumns, indexType, partitionPredicate, userOptions); @@ -115,8 +124,8 @@ public String[] call( } catch (Exception e) { throw new RuntimeException( String.format( - "Failed to create %s index for column '%s' on 
table '%s'.", - indexType, indexColumn, table.name()), + "Failed to create %s index for columns '%s' on table '%s'.", + indexType, indexColumns, table.name()), e); } return new String[] { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index e25464b173d7..0ddc4fcc526a 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -43,11 +43,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.Collections; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Locale; import java.util.UUID; +import java.util.stream.Collectors; import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.spark.sql.types.DataTypes.StringType; @@ -121,11 +122,18 @@ public InternalRow[] call(InternalRow args) { tableIdent); RowType rowType = table.rowType(); - checkArgument( - rowType.containsField(column), - "Column '%s' does not exist in table '%s'.", - column, - tableIdent); + List indexColumns = + Arrays.stream(column.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); + for (String col : indexColumns) { + checkArgument( + rowType.containsField(col), + "Column '%s' does not exist in table '%s'.", + col, + tableIdent); + } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( @@ -134,9 +142,11 @@ public InternalRow[] call(InternalRow args) { spark(), relation); - DataField indexField = rowType.getField(column); - RowType projectedRowType = - 
rowType.project(Collections.singletonList(column)); + List indexFields = + indexColumns.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + RowType projectedRowType = rowType.project(indexColumns); RowType readRowType = SpecialFields.rowTypeWithRowId(projectedRowType); HashMap parsedOptions = new HashMap<>(); @@ -154,7 +164,7 @@ public InternalRow[] call(InternalRow args) { table, indexType, readRowType, - indexField, + indexFields, userOptions); try (TableCommitImpl commit = @@ -170,7 +180,7 @@ public InternalRow[] call(InternalRow args) { } catch (Exception e) { throw new RuntimeException( String.format( - "Failed to create %s index for column '%s' on table '%s'.", + "Failed to create %s index for columns '%s' on table '%s'.", indexType, column, tableIdent), e); } From 77bbc77599cce1f42a8ece92a8084e75efb16edb Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 02:00:19 +0800 Subject: [PATCH 03/20] [globalindex] Fix multi-column index metadata storage and resolveFields validation --- .../globalindex/GlobalIndexBuilderUtils.java | 18 +++++++++++------- .../paimon/globalindex/GlobalIndexScanner.java | 9 +++++++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 34badf3ed566..3931a53a9ecb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -38,6 +38,8 @@ /** Utils for global index build. 
*/ public class GlobalIndexBuilderUtils { + public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; + public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -60,13 +62,15 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { - int indexFieldId = fields.get(0).id(); - int[] extraFieldIds = - fields.size() > 1 - ? fields.subList(1, fields.size()).stream() - .mapToInt(DataField::id) - .toArray() - : null; + int indexFieldId; + int[] extraFieldIds; + if (fields.size() > 1) { + indexFieldId = MULTI_COLUMN_INDEX_FIELD_ID; + extraFieldIds = fields.stream().mapToInt(DataField::id).toArray(); + } else { + indexFieldId = fields.get(0).id(); + extraFieldIds = null; + } return toIndexFileMetas( fileIO, indexPathFactory, diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 04e16eed2d87..cffcbb34646a 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -53,8 +53,10 @@ import java.util.stream.Collectors; import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; +import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. 
*/ @@ -98,6 +100,13 @@ public GlobalIndexScanner( Arrays.stream(meta.extraFieldIds()) .boxed() .collect(Collectors.toList()); + // Validate consistency: all files in the same group must have identical extraFieldIds + if (fieldToGroup.containsKey(fieldIds.get(0))) { + List existingGroup = fieldToGroup.get(fieldIds.get(0)); + checkArgument( + existingGroup.equals(fieldIds), + "Inconsistent extraFieldIds across index files."); + } multiColumnMetas .computeIfAbsent(fieldIds, k -> new HashMap<>()) .computeIfAbsent(indexType, k -> new HashMap<>()) From 92692a027a3103e7c85f48d4d20dc5517820030b Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 17:04:15 +0800 Subject: [PATCH 04/20] [globalindex] Fix GenericIndexTopoBuilder multi-column null value error --- .../GlobalIndexBuilderUtilsTest.java | 146 ++++++++++++++++++ .../globalindex/GenericIndexTopoBuilder.java | 15 ++ 2 files changed, 161 insertions(+) create mode 100644 paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java diff --git a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java new file mode 100644 index 000000000000..703c01c69633 --- /dev/null +++ b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.paimon.globalindex; + +import org.apache.paimon.CoreOptions; +import org.apache.paimon.fs.FileIO; +import org.apache.paimon.fs.Path; +import org.apache.paimon.fs.local.LocalFileIO; +import org.apache.paimon.index.IndexFileMeta; +import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.options.Options; +import org.apache.paimon.types.ArrayType; +import org.apache.paimon.types.DataField; +import org.apache.paimon.types.FloatType; +import org.apache.paimon.types.IntType; +import org.apache.paimon.types.VarCharType; +import org.apache.paimon.utils.Range; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; + +import static org.assertj.core.api.Assertions.assertThat; + +/** Tests for {@link GlobalIndexBuilderUtils}. 
*/ +class GlobalIndexBuilderUtilsTest { + + @TempDir java.nio.file.Path tempDir; + + private FileIO fileIO; + private IndexPathFactory indexPathFactory; + private CoreOptions coreOptions; + + @BeforeEach + void setUp() { + fileIO = new LocalFileIO(); + Path dir = new Path(tempDir.toString()); + indexPathFactory = + new IndexPathFactory() { + @Override + public Path toPath(String fileName) { + return new Path(dir, fileName); + } + + @Override + public Path newPath() { + return new Path(dir, UUID.randomUUID().toString()); + } + + @Override + public boolean isExternalPath() { + return false; + } + }; + coreOptions = new CoreOptions(new Options().toMap()); + } + + // Test: 2 columns (title + vec), indexFieldId=-1, all field ids stored in extraFieldIds + @Test + void testToIndexFileMetasMultiColumn() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + DataField vecField = new DataField(2, "vec", new ArrayType(new FloatType())); + List fields = Arrays.asList(titleField, vecField); + + List entries = createDummyResultEntries(); + Range range = new Range(0, 99); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2}); + assertThat(metas.get(0).globalIndexMeta().rowRangeStart()).isEqualTo(0); + assertThat(metas.get(0).globalIndexMeta().rowRangeEnd()).isEqualTo(99); + } + + // Test: single column, extraFieldIds should be null (backward compatible with single-column + // path) + @Test + void testToIndexFileMetasSingleColumn() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + List fields = Collections.singletonList(titleField); + + List entries = createDummyResultEntries(); + Range 
range = new Range(0, 49); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isNull(); + } + + // Test: 3 columns (title + vec + id), indexFieldId=-1, all field ids in extraFieldIds + @Test + void testToIndexFileMetasThreeColumns() throws IOException { + DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); + DataField vecField = new DataField(2, "vec", new ArrayType(new FloatType())); + DataField idField = new DataField(3, "id", new IntType()); + List fields = Arrays.asList(titleField, vecField, idField); + + List entries = createDummyResultEntries(); + Range range = new Range(0, 199); + + List metas = + GlobalIndexBuilderUtils.toIndexFileMetas( + fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); + + assertThat(metas).hasSize(1); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2, 3}); + } + + private List createDummyResultEntries() throws IOException { + String fileName = "test-index-" + UUID.randomUUID(); + Path filePath = indexPathFactory.toPath(fileName); + fileIO.newOutputStream(filePath, false).close(); + return Collections.singletonList(new ResultEntry(fileName, 100, null)); + } +} diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index df8e92f8d0cd..2ac57502cc02 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ 
b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -694,6 +694,21 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { + boolean hasNull = false; + for (InternalRow.FieldGetter getter : indexFieldGetters) { + if (getter.getFieldOrNull(row) == null) { + hasNull = true; + break; + } + } + if (hasNull) { + LOG.info( + "Null value in indexed columns at rowId={}, stopping shard [{}, {}].", + currentRowId, + task.shardRange.from, + task.shardRange.to); + break; + } ((GlobalIndexMultiColumnWriter) indexWriter).write(row); } else { Object fieldData = indexFieldGetters[0].getFieldOrNull(row); From 2229970508fdfdeab258a01463f5264ce6d80b6b Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 18:49:08 +0800 Subject: [PATCH 05/20] [globalindex] Extract findMinNonIndexableRowId and filterEntriesBefore into GlobalIndexBuilderUtils --- .../globalindex/GlobalIndexBuilderUtils.java | 44 +++++++++++++++++ .../globalindex/GenericIndexTopoBuilder.java | 49 +------------------ 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 3931a53a9ecb..8f256957b7da 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -24,7 +24,9 @@ import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; import org.apache.paimon.index.IndexPathFactory; +import org.apache.paimon.manifest.ManifestEntry; import org.apache.paimon.options.Options; +import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import 
org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; @@ -33,7 +35,9 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; /** Utils for global index build. */ public class GlobalIndexBuilderUtils { @@ -133,6 +137,46 @@ public static GlobalIndexWriter createIndexWriter( return globalIndexer.createWriter(createGlobalIndexFileReadWrite(table)); } + /** + * Find the minimum firstRowId among files whose schema does not contain all index columns. + * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER + * TABLE. + * + * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns + */ + public static long findMinNonIndexableRowId( + SchemaManager schemaManager, List entries, List indexColumns) { + Map schemaContainsColumns = new HashMap<>(); + long minRowId = Long.MAX_VALUE; + for (ManifestEntry entry : entries) { + long sid = entry.file().schemaId(); + boolean contains = + schemaContainsColumns.computeIfAbsent( + sid, + id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); + if (!contains && entry.file().firstRowId() != null) { + minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); + } + } + return minRowId; + } + + /** Keep only entries whose firstRowId is strictly less than the given boundary. 
*/ + public static List filterEntriesBefore( + List entries, long boundaryRowId) { + if (boundaryRowId == Long.MAX_VALUE) { + return entries; + } + List result = new ArrayList<>(); + for (ManifestEntry entry : entries) { + if (entry.file().firstRowId() != null + && entry.file().nonNullFirstRowId() < boundaryRowId) { + result.add(entry); + } + } + return result; + } + private static GlobalIndexFileReadWrite createGlobalIndexFileReadWrite(FileStoreTable table) { IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); return new GlobalIndexFileReadWrite(table.fileIO(), indexPathFactory); diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 2ac57502cc02..035aacdce3f8 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -40,7 +40,6 @@ import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.reader.RecordReader; -import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.SpecialFields; import org.apache.paimon.table.sink.BatchWriteBuilder; @@ -67,7 +66,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; @@ -76,6 +74,8 @@ import java.util.stream.Collectors; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.createIndexWriter; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.filterEntriesBefore; +import static 
org.apache.paimon.globalindex.GlobalIndexBuilderUtils.findMinNonIndexableRowId; import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.toIndexFileMetas; import static org.apache.paimon.io.CompactIncrement.emptyIncrement; import static org.apache.paimon.io.DataIncrement.deleteIndexIncrement; @@ -358,51 +358,6 @@ private static boolean buildTopology( return true; } - /** - * Find the minimum firstRowId among files whose schema does not contain all index columns. - * Files at or beyond this rowId cannot be indexed because the column was added later via ALTER - * TABLE. - * - * @return the boundary rowId, or {@link Long#MAX_VALUE} if all files contain the columns - */ - static long findMinNonIndexableRowId( - SchemaManager schemaManager, List entries, List indexColumns) { - Map schemaContainsColumns = new HashMap<>(); - long minRowId = Long.MAX_VALUE; - for (ManifestEntry entry : entries) { - long sid = entry.file().schemaId(); - boolean contains = - schemaContainsColumns.computeIfAbsent( - sid, - id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); - if (!contains && entry.file().firstRowId() != null) { - minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); - } - } - return minRowId; - } - - /** Keep only entries whose firstRowId is strictly less than the given boundary. */ - static List filterEntriesBefore( - List entries, long boundaryRowId) { - if (boundaryRowId == Long.MAX_VALUE) { - return entries; - } - List result = new ArrayList<>(); - for (ManifestEntry entry : entries) { - if (entry.file().firstRowId() != null - && entry.file().nonNullFirstRowId() < boundaryRowId) { - result.add(entry); - } - } - LOG.info( - "Filtered {} files at or beyond rowId {}, {} files remain.", - entries.size() - result.size(), - boundaryRowId, - result.size()); - return result; - } - /** * Compute shard tasks for a full build (no rows to skip). 
* From 5582952da73cc5e29aa608170cc719d07adc81f0 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 26 May 2026 22:45:58 +0800 Subject: [PATCH 06/20] [globalindex] Fix test to reference GlobalIndexBuilderUtils after method extraction --- .../flink/globalindex/GenericIndexTopoBuilderTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java index fb1bd02f4408..c69b59ad6e3c 100644 --- a/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java +++ b/paimon-flink/paimon-flink-common/src/test/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilderTest.java @@ -23,6 +23,7 @@ import org.apache.paimon.data.BinaryRowWriter; import org.apache.paimon.data.BinaryString; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.io.PojoDataFileMeta; import org.apache.paimon.manifest.FileKind; import org.apache.paimon.manifest.ManifestEntry; @@ -472,9 +473,9 @@ void testAppendFilterOldFilesBeforeNewFiles() { entries.add(createEntryWithSchemaId(BinaryRow.EMPTY_ROW, 200L, 100, 0L)); List result = - GenericIndexTopoBuilder.filterEntriesBefore( + GlobalIndexBuilderUtils.filterEntriesBefore( entries, - GenericIndexTopoBuilder.findMinNonIndexableRowId( + GlobalIndexBuilderUtils.findMinNonIndexableRowId( schemaManager, entries, Collections.singletonList("vec"))); assertThat(result).hasSize(2); From 7e6d5b0c33001479fe7348d26d4cfe77711874d0 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:05:04 +0800 Subject: [PATCH 07/20] [globalindex] Fix multi-column writer projection, add BTree validation, and restore observability logs --- .../globalindex/GlobalIndexBuilderUtils.java | 31 
++++++++++++++++++- .../globalindex/GenericIndexTopoBuilder.java | 12 ++++++- .../procedure/CreateGlobalIndexProcedure.java | 6 ++++ .../DefaultGlobalIndexBuilder.java | 8 ++++- .../procedure/CreateGlobalIndexProcedure.java | 6 ++++ 5 files changed, 60 insertions(+), 3 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 8f256957b7da..497d50ece6e9 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -31,6 +31,9 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.utils.Range; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import javax.annotation.Nullable; import java.io.IOException; @@ -42,6 +45,8 @@ /** Utils for global index build. */ public class GlobalIndexBuilderUtils { + private static final Logger LOG = LoggerFactory.getLogger(GlobalIndexBuilderUtils.class); + public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; public static List toIndexFileMetas( @@ -148,6 +153,7 @@ public static long findMinNonIndexableRowId( SchemaManager schemaManager, List entries, List indexColumns) { Map schemaContainsColumns = new HashMap<>(); long minRowId = Long.MAX_VALUE; + long minSchemaId = -1; for (ManifestEntry entry : entries) { long sid = entry.file().schemaId(); boolean contains = @@ -155,8 +161,26 @@ public static long findMinNonIndexableRowId( sid, id -> schemaManager.schema(id).fieldNames().containsAll(indexColumns)); if (!contains && entry.file().firstRowId() != null) { - minRowId = Math.min(minRowId, entry.file().nonNullFirstRowId()); + long rowId = entry.file().nonNullFirstRowId(); + if (rowId < minRowId) { + minRowId = rowId; + minSchemaId = sid; + } + } + } + if (minRowId != Long.MAX_VALUE) { + List schemaFields = 
schemaManager.schema(minSchemaId).fieldNames(); + List missingColumns = new ArrayList<>(); + for (String col : indexColumns) { + if (!schemaFields.contains(col)) { + missingColumns.add(col); + } } + LOG.info( + "Found non-indexable files: schemaId={} missing columns {}, boundaryRowId={}.", + minSchemaId, + missingColumns, + minRowId); } return minRowId; } @@ -174,6 +198,11 @@ public static List filterEntriesBefore( result.add(entry); } } + LOG.info( + "Filtered {} files to {} indexable files (boundaryRowId={}).", + entries.size(), + result.size(), + boundaryRowId); return result; } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 035aacdce3f8..99a551a9e4d9 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -51,6 +51,7 @@ import org.apache.paimon.types.DataField; import org.apache.paimon.types.RowType; import org.apache.paimon.utils.CloseableIterator; +import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; import org.apache.flink.streaming.api.datastream.DataStream; @@ -573,6 +574,7 @@ private static class BuildIndexOperator private transient InternalRow.FieldGetter[] indexFieldGetters; private transient int rowIdFieldIndex; private transient boolean multiColumn; + private transient ProjectedRow writerProjection; BuildIndexOperator( ReadBuilder readBuilder, @@ -602,6 +604,13 @@ public void open() throws Exception { } this.rowIdFieldIndex = projectedRowType.getFieldIndex(SpecialFields.ROW_ID.name()); this.multiColumn = indexFields.size() > 1; + if (multiColumn) { + int[] projection = new int[indexFields.size()]; + for (int i = 0; i < indexFields.size(); 
i++) { + projection[i] = projectedRowType.getFieldIndex(indexFields.get(i).name()); + } + this.writerProjection = ProjectedRow.from(projection); + } } @Override @@ -664,7 +673,8 @@ public void processElement(StreamRecord element) throws Exception { task.shardRange.to); break; } - ((GlobalIndexMultiColumnWriter) indexWriter).write(row); + ((GlobalIndexMultiColumnWriter) indexWriter) + .write(writerProjection.replaceRow(row)); } else { Object fieldData = indexFieldGetters[0].getFieldOrNull(row); if (fieldData == null) { diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index e77a2f598df8..2584c5419a25 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -101,6 +101,12 @@ public String[] call( // Build global index based on index type indexType = indexType.toLowerCase().trim(); + if ("btree".equals(indexType)) { + checkArgument( + indexColumns.size() == 1, + "BTree index only supports single column, got: %s", + indexColumns); + } try { if ("btree".equals(indexType)) { BTreeIndexTopoBuilder.buildIndexAndExecute( diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 041ee9bf41b6..48386d7ab8d3 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -35,6 +35,7 @@ import org.apache.paimon.types.RowType; import 
org.apache.paimon.utils.CloseableIterator; import org.apache.paimon.utils.LongCounter; +import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; import java.io.IOException; @@ -127,9 +128,14 @@ private List writePaimonRows( if (multiColumn) { GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; + int[] projection = new int[indexFields.size()]; + for (int i = 0; i < indexFields.size(); i++) { + projection[i] = readType.getFieldIndex(indexFields.get(i).name()); + } + ProjectedRow projectedRow = ProjectedRow.from(projection); rows.forEachRemaining( row -> { - multiWriter.write(row); + multiWriter.write(projectedRow.replaceRow(row)); rowCounter.add(1); }); } else { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 0ddc4fcc526a..eac65d9e4baa 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -134,6 +134,12 @@ public InternalRow[] call(InternalRow args) { col, tableIdent); } + if ("btree".equalsIgnoreCase(indexType)) { + checkArgument( + indexColumns.size() == 1, + "BTree index only supports single column, got: %s", + indexColumns); + } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( From 230c97d3fb275c03ebc4cf42aa09bf7131dec2d8 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:17:54 +0800 Subject: [PATCH 08/20] [globalindex] Fix MERGE INTO crash when table has multi-column global index (indexFieldId=-1) --- .../dataevolution/MergeIntoUpdateChecker.java | 37 ++++++++++++++++--- 
.../MergeIntoPaimonDataEvolutionTable.scala | 23 ++++++++++-- .../MergeIntoPaimonDataEvolutionTable.scala | 23 ++++++++++-- 3 files changed, 69 insertions(+), 14 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index 8b1122382aae..aed46f0078e8 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -39,6 +39,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -46,6 +48,8 @@ import java.util.Set; import java.util.stream.Collectors; +import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + /** * The checker for merge into update result. It will check each committable to see if some * global-indexed columns are updated. 
It will take some actions according to {@link @@ -100,10 +104,12 @@ private void checkUpdatedColumns() { GlobalIndexMeta globalIndexMeta = entry.indexFile().globalIndexMeta(); if (globalIndexMeta != null) { - String fieldName = - rowType.getField(globalIndexMeta.indexFieldId()) - .name(); - return updatedColumns.contains(fieldName) + Collection indexedNames = + getIndexedFieldNames(globalIndexMeta, rowType); + boolean overlaps = + indexedNames.stream() + .anyMatch(updatedColumns::contains); + return overlaps && affectedPartitions.contains(entry.partition()); } return false; @@ -116,8 +122,8 @@ private void checkUpdatedColumns() { case THROW_ERROR: Set conflictedColumns = affectedEntries.stream() - .map(file -> file.indexFile().globalIndexMeta().indexFieldId()) - .map(id -> rowType.getField(id).name()) + .map(file -> file.indexFile().globalIndexMeta()) + .flatMap(meta -> getIndexedFieldNames(meta, rowType).stream()) .collect(Collectors.toSet()); throw new RuntimeException( @@ -159,4 +165,23 @@ private void checkUpdatedColumns() { } } } + + private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { + int fieldId = meta.indexFieldId(); + if (fieldId == MULTI_COLUMN_INDEX_FIELD_ID) { + List names = new ArrayList<>(); + for (int id : meta.extraFieldIds()) { + names.add(rowType.getField(id).name()); + } + return names; + } + List names = new ArrayList<>(); + names.add(rowType.getField(fieldId).name()); + if (meta.extraFieldIds() != null) { + for (int id : meta.extraFieldIds()) { + names.add(rowType.getField(id).name()); + } + } + return names; + } } diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index 492d64bbf5bf..9c90a1e8445c 100644 --- 
a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,6 +21,8 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID +import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry import org.apache.paimon.spark.SparkTable @@ -502,15 +504,29 @@ case class MergeIntoPaimonDataEvolutionTable( return updateCommit } + def getIndexedFieldNames( + meta: GlobalIndexMeta, + rt: org.apache.paimon.types.RowType): Seq[String] = { + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { + meta.extraFieldIds().map(id => rt.getField(id).name()).toSeq + } else { + val names = ArrayBuffer(rt.getField(meta.indexFieldId()).name()) + if (meta.extraFieldIds() != null) { + meta.extraFieldIds().foreach(id => names += rt.getField(id).name()) + } + names.toSeq + } + } + val filter: org.apache.paimon.utils.Filter[IndexManifestEntry] = (entry: IndexManifestEntry) => { val globalIndexMeta = entry.indexFile().globalIndexMeta() if (globalIndexMeta == null) { false } else { - val fieldName = rowType.getField(globalIndexMeta.indexFieldId()).name() + val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) affectedParts.contains(entry.partition()) && updateColumns.exists( - _.name.equals(fieldName)) + col => indexedNames.contains(col.name)) } } @@ -527,8 +543,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - 
.map(_.indexFile().globalIndexMeta().indexFieldId()) - .map(id => rowType.getField(id).name()) + .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index 96f8c0c5cc9f..15e03a74dbc0 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,6 +21,8 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID +import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry import org.apache.paimon.spark.SparkTable @@ -511,9 +513,9 @@ case class MergeIntoPaimonDataEvolutionTable( if (globalIndexMeta == null) { false } else { - val fieldName = rowType.getField(globalIndexMeta.indexFieldId()).name() + val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) affectedParts.contains(entry.partition()) && updateColumns.exists( - _.name.equals(fieldName)) + col => indexedNames.contains(col.name)) } } @@ -530,8 +532,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - 
.map(_.indexFile().globalIndexMeta().indexFieldId()) - .map(id => rowType.getField(id).name()) + .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. @@ -555,6 +556,20 @@ case class MergeIntoPaimonDataEvolutionTable( } } + private def getIndexedFieldNames( + meta: GlobalIndexMeta, + rowType: org.apache.paimon.types.RowType): Seq[String] = { + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { + meta.extraFieldIds().map(id => rowType.getField(id).name()).toSeq + } else { + val names = ArrayBuffer(rowType.getField(meta.indexFieldId()).name()) + if (meta.extraFieldIds() != null) { + meta.extraFieldIds().foreach(id => names += rowType.getField(id).name()) + } + names.toSeq + } + } + private def findRelatedFirstRowIds( dataset: Dataset[Row], sparkSession: SparkSession, From 8a3bd942ee45a055c21a9ec878f29cf6a2e1d492 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 11:50:00 +0800 Subject: [PATCH 09/20] [globalindex] Fix FullText/Vector read path mismatch and reject multi-column for unsupported index types --- .../globalindex/GlobalIndexerFactory.java | 6 +++++ .../paimon/table/source/FullTextReadImpl.java | 24 +++++++++++++++---- .../paimon/table/source/VectorReadImpl.java | 24 +++++++++++++++---- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index e2497a6f82e3..cef643fa463f 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -32,6 +32,12 @@ public interface GlobalIndexerFactory { GlobalIndexer create(DataField dataField, Options options); default GlobalIndexer create(List fields, Options 
options) { + if (fields.size() > 1) { + throw new UnsupportedOperationException( + String.format( + "Index type '%s' does not support multi-column index, got columns: %s", + identifier(), fields)); + } return create(fields.get(0), options); } } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index 37b5e4553713..f5429e4f72c1 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -19,6 +19,7 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -34,6 +35,7 @@ import org.apache.paimon.predicate.FullTextSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import java.util.ArrayList; @@ -67,10 +69,24 @@ public GlobalIndexResult read(List splits) { return GlobalIndexResult.createEmpty(); } - String indexType = splits.get(0).fullTextIndexFiles().get(0).indexType(); - GlobalIndexer globalIndexer = - GlobalIndexerFactoryUtils.load(indexType) - .create(textColumn, table.coreOptions().toConfiguration()); + IndexFileMeta firstFile = splits.get(0).fullTextIndexFiles().get(0); + String indexType = firstFile.indexType(); + GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); + GlobalIndexer globalIndexer; + if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + RowType rowType = table.rowType(); + List fields = new ArrayList<>(); + for (int id : firstMeta.extraFieldIds()) { + 
fields.add(rowType.getField(id)); + } + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(fields, table.coreOptions().toConfiguration()); + } else { + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(textColumn, table.coreOptions().toConfiguration()); + } IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index a3402c3f1d66..410437bda2a1 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -19,6 +19,7 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -36,6 +37,7 @@ import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; +import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.RoaringNavigableMap64; @@ -84,10 +86,24 @@ public GlobalIndexResult read(List splits) { RoaringNavigableMap64 preFilter = preFilter(splits).orElse(null); - String indexType = splits.get(0).vectorIndexFiles().get(0).indexType(); - GlobalIndexer globalIndexer = - GlobalIndexerFactoryUtils.load(indexType) - .create(vectorColumn, table.coreOptions().toConfiguration()); + IndexFileMeta firstFile = splits.get(0).vectorIndexFiles().get(0); + String indexType = firstFile.indexType(); + GlobalIndexMeta firstMeta = 
checkNotNull(firstFile.globalIndexMeta()); + GlobalIndexer globalIndexer; + if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + RowType rowType = table.rowType(); + List fields = new ArrayList<>(); + for (int id : firstMeta.extraFieldIds()) { + fields.add(rowType.getField(id)); + } + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(fields, table.coreOptions().toConfiguration()); + } else { + globalIndexer = + GlobalIndexerFactoryUtils.load(indexType) + .create(vectorColumn, table.coreOptions().toConfiguration()); + } IndexPathFactory indexPathFactory = table.store().pathFactory().globalIndexFileFactory(); int parallelism = table.coreOptions().toConfiguration().get(GLOBAL_INDEX_THREAD_NUM); From df64ef16f88652e53c24be1432d645262350dd5a Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 13:59:05 +0800 Subject: [PATCH 10/20] [globalindex] Add input validation, Spark schema filtering, null check, and multi-column guard --- .../procedure/CreateGlobalIndexProcedure.java | 1 + .../DefaultGlobalIndexBuilder.java | 35 +++++++++++++++---- .../DefaultGlobalIndexTopoBuilder.java | 9 +++++ .../GlobalIndexTopologyBuilder.java | 6 ++++ .../procedure/CreateGlobalIndexProcedure.java | 1 + 5 files changed, 46 insertions(+), 6 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index 2584c5419a25..27d10fecf640 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -84,6 +84,7 @@ public String[] call( .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least 
one column required."); for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 48386d7ab8d3..a64045633c6b 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -38,6 +38,9 @@ import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.IOException; import java.io.Serializable; import java.util.Collections; @@ -49,6 +52,7 @@ /** Default global index builder. */ public class DefaultGlobalIndexBuilder implements Serializable { + private static final Logger LOG = LoggerFactory.getLogger(DefaultGlobalIndexBuilder.class); private static final long serialVersionUID = 1L; private final FileStoreTable table; @@ -129,15 +133,34 @@ private List writePaimonRows( GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; int[] projection = new int[indexFields.size()]; + InternalRow.FieldGetter[] getters = new InternalRow.FieldGetter[indexFields.size()]; for (int i = 0; i < indexFields.size(); i++) { - projection[i] = readType.getFieldIndex(indexFields.get(i).name()); + DataField field = indexFields.get(i); + projection[i] = readType.getFieldIndex(field.name()); + getters[i] = + InternalRow.createFieldGetter( + field.type(), readType.getFieldIndex(field.name())); } ProjectedRow projectedRow = ProjectedRow.from(projection); - rows.forEachRemaining( - row -> { - multiWriter.write(projectedRow.replaceRow(row)); - rowCounter.add(1); - }); + while (rows.hasNext()) { + InternalRow row = rows.next(); + 
boolean hasNull = false; + for (InternalRow.FieldGetter getter : getters) { + if (getter.getFieldOrNull(row) == null) { + hasNull = true; + break; + } + } + if (hasNull) { + LOG.info( + "Null value in indexed columns, stopping shard [{}, {}].", + rowRange.from, + rowRange.to); + break; + } + multiWriter.write(projectedRow.replaceRow(row)); + rowCounter.add(1); + } } else { DataField indexField = indexFields.get(0); GlobalIndexSingletonWriter singleWriter = (GlobalIndexSingletonWriter) indexWriter; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java index 437ad11737dc..ea2cda4a8b85 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexTopoBuilder.java @@ -21,12 +21,14 @@ import org.apache.paimon.data.BinaryRow; import org.apache.paimon.data.InternalRow; import org.apache.paimon.fs.Path; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.IndexedSplit; import org.apache.paimon.io.DataFileMeta; import org.apache.paimon.manifest.ManifestEntry; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.reader.RecordReader; +import org.apache.paimon.schema.SchemaManager; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.table.sink.CommitMessage; import org.apache.paimon.table.sink.CommitMessageSerializer; @@ -110,6 +112,13 @@ public List buildIndex( List entries = table.store().newScan().withPartitionFilter(partitionPredicate).plan().files(); + List indexColumns = + indexFields.stream().map(DataField::name).collect(Collectors.toList()); + SchemaManager 
schemaManager = new SchemaManager(table.fileIO(), table.location()); + long boundaryRowId = + GlobalIndexBuilderUtils.findMinNonIndexableRowId( + schemaManager, entries, indexColumns); + entries = GlobalIndexBuilderUtils.filterEntriesBefore(entries, boundaryRowId); // generate splits for each partition && shard Map> splits = split(table, entries, rowsPerShard); diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java index aea421800410..3d751f4585ac 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/GlobalIndexTopologyBuilder.java @@ -57,6 +57,12 @@ default List buildIndex( List indexFields, Options options) throws IOException { + if (indexFields.size() > 1) { + throw new UnsupportedOperationException( + String.format( + "Topology builder '%s' does not support multi-column index, got columns: %s", + identifier(), indexFields)); + } return buildIndex( spark, relation, diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index eac65d9e4baa..584b0e895ce2 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -127,6 +127,7 @@ public InternalRow[] call(InternalRow args) { .map(String::trim) .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); + checkArgument(!indexColumns.isEmpty(), "At least one column required."); for (String col : 
indexColumns) { checkArgument( rowType.containsField(col), From 6e79d861804b84a95b855caa82ff9c10a31113c5 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 27 May 2026 14:54:39 +0800 Subject: [PATCH 11/20] [globalindex] Reject duplicate index columns and document why column count is unlimited --- .../paimon/globalindex/GlobalIndexScanner.java | 17 ++++++----------- .../procedure/CreateGlobalIndexProcedure.java | 10 ++++++++++ .../procedure/CreateGlobalIndexProcedure.java | 11 +++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index cffcbb34646a..68d3c76823e8 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -20,7 +20,6 @@ import org.apache.paimon.fs.FileIO; import org.apache.paimon.fs.Path; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.io.GlobalIndexFileReader; import org.apache.paimon.index.GlobalIndexMeta; import org.apache.paimon.index.IndexFileMeta; @@ -92,15 +91,14 @@ public GlobalIndexScanner( String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - if (meta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID + if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID && meta.extraFieldIds() != null) { // Multi-column index: all participating fields share the same IndexFileMeta, // so looking up from any fieldId returns identical index files. 
List fieldIds = - Arrays.stream(meta.extraFieldIds()) - .boxed() - .collect(Collectors.toList()); - // Validate consistency: all files in the same group must have identical extraFieldIds + Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); + // Validate consistency: all files in the same group must have identical + // extraFieldIds if (fieldToGroup.containsKey(fieldIds.get(0))) { List existingGroup = fieldToGroup.get(fieldIds.get(0)); checkArgument( @@ -132,11 +130,8 @@ public GlobalIndexScanner( if (group != null) { // Multi-column: resolve full field list List fields = - group.stream() - .map(rowType::getField) - .collect(Collectors.toList()); - return createReaders( - indexFileReader, multiColumnMetas.get(group), fields); + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, multiColumnMetas.get(group), fields); } else { // Single-column return createReaders( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index 27d10fecf640..f9f54918ea4a 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -33,6 +33,7 @@ import org.apache.flink.table.procedure.ProcedureContext; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -85,6 +86,15 @@ public String[] call( .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); + checkArgument( + indexColumns.size() == new HashSet<>(indexColumns).size(), + "Duplicate index columns are not allowed: %s", + indexColumns); + 
// No hard cap on the number of index columns: unlike row-store B-tree indexes + // (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing columns into a + // single key, the global index is built on per-type index frameworks. Whether + // multiple columns are supported, and any practical limit, is decided by each + // index type (single-column types reject multi-column via UnsupportedOperationException). for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 584b0e895ce2..18b6c807b231 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -45,6 +45,7 @@ import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Locale; import java.util.UUID; @@ -128,6 +129,16 @@ public InternalRow[] call(InternalRow args) { .filter(s -> !s.isEmpty()) .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); + checkArgument( + indexColumns.size() == new HashSet<>(indexColumns).size(), + "Duplicate index columns are not allowed: %s", + indexColumns); + // No hard cap on the number of index columns: unlike row-store B-tree + // indexes (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing + // columns into a single key, the global index is built on per-type index + // frameworks. Whether multiple columns are supported, and any practical + // limit, is decided by each index type (single-column types reject + // multi-column via UnsupportedOperationException). 
for (String col : indexColumns) { checkArgument( rowType.containsField(col), From 62e6ac5b862950d5d2d4de0890edae4e56c36658 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 29 May 2026 19:34:48 +0800 Subject: [PATCH 12/20] [globalindex] Address PR review: isMultiColumn helper, overlap detection, and display fix - Add GlobalIndexMeta.isMultiColumn() helper to replace scattered sentinel checks - Fix IndexManifestFileHandler overlap detection for multi-column indexes - Fix TableIndexesTable showing null for multi-column index field names - Replace all MULTI_COLUMN_INDEX_FIELD_ID == checks with isMultiColumn() --- .../globalindex/GlobalIndexScanner.java | 4 +-- .../apache/paimon/index/GlobalIndexMeta.java | 5 +++ .../manifest/IndexManifestFileHandler.java | 31 ++++++++++++++----- .../paimon/table/source/FullTextReadImpl.java | 3 +- .../paimon/table/source/VectorReadImpl.java | 3 +- .../table/system/TableIndexesTable.java | 17 ++++++++-- .../dataevolution/MergeIntoUpdateChecker.java | 4 +-- 7 files changed, 47 insertions(+), 20 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 68d3c76823e8..d31175666c71 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -52,7 +52,6 @@ import java.util.stream.Collectors; import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; -import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; import static org.apache.paimon.utils.Preconditions.checkArgument; @@ -91,8 +90,7 @@ public GlobalIndexScanner( String indexType = indexFile.indexType(); Range range = new 
Range(meta.rowRangeStart(), meta.rowRangeEnd()); - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID - && meta.extraFieldIds() != null) { + if (meta.isMultiColumn() && meta.extraFieldIds() != null) { // Multi-column index: all participating fields share the same IndexFileMeta, // so looking up from any fieldId returns identical index files. List fieldIds = diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index c468bbffb3aa..4bdb17c53817 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -18,6 +18,7 @@ package org.apache.paimon.index; +import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DataTypes; @@ -78,6 +79,10 @@ public int indexFieldId() { return indexFieldId; } + public boolean isMultiColumn() { + return indexFieldId == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + } + @Nullable public int[] extraFieldIds() { return extraFieldIds; diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 3621483197f7..87628290810c 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -239,16 +240,32 @@ private void validateRetainedIndexFiles( for (IndexManifestEntry added : addedIndexFiles) { GlobalIndexMeta addedMeta = added.indexFile().globalIndexMeta(); - if (addedMeta == null - 
|| retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { + if (addedMeta == null) { continue; } + // Single-column: skip if different fieldId or no range overlap + if (!retainedMeta.isMultiColumn()) { + if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; + } + } else { + // Multi-column: skip if different column group or no range overlap + if (!Arrays.equals(retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; + } + } + throw new IllegalStateException( String.format( "Trying to add global index file %s of type %s for index field %s" diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index f5429e4f72c1..e905da67a4e9 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -19,7 +19,6 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -73,7 +72,7 @@ public GlobalIndexResult read(List splits) { String indexType = firstFile.indexType(); GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; - if (firstMeta.indexFieldId() == 
GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + if (firstMeta.isMultiColumn()) { RowType rowType = table.rowType(); List fields = new ArrayList<>(); for (int id : firstMeta.extraFieldIds()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 410437bda2a1..9621a82e37ce 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -19,7 +19,6 @@ package org.apache.paimon.table.source; import org.apache.paimon.fs.FileIO; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.globalindex.GlobalIndexIOMeta; import org.apache.paimon.globalindex.GlobalIndexReadThreadPool; import org.apache.paimon.globalindex.GlobalIndexReader; @@ -90,7 +89,7 @@ public GlobalIndexResult read(List splits) { String indexType = firstFile.indexType(); GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; - if (firstMeta.indexFieldId() == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID) { + if (firstMeta.isMultiColumn()) { RowType rowType = table.rowType(); List fields = new ArrayList<>(); for (int id : firstMeta.extraFieldIds()) { diff --git a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java index 320257ce1057..f5e693d4a79d 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java @@ -234,9 +234,20 @@ private InternalRow toRow( GlobalIndexMeta globalMeta = indexManifestEntry.indexFile().globalIndexMeta(); String indexFieldName = null; if (globalMeta != null) { - try { - indexFieldName = 
logicalRowType.getField(globalMeta.indexFieldId()).name(); - } catch (RuntimeException ignored) { + if (globalMeta.isMultiColumn() && globalMeta.extraFieldIds() != null) { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < globalMeta.extraFieldIds().length; i++) { + if (i > 0) { + sb.append(","); + } + sb.append(logicalRowType.getField(globalMeta.extraFieldIds()[i]).name()); + } + indexFieldName = sb.toString(); + } else { + try { + indexFieldName = logicalRowType.getField(globalMeta.indexFieldId()).name(); + } catch (RuntimeException ignored) { + } } } return GenericRow.of( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index aed46f0078e8..b66e5d008307 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -48,8 +48,6 @@ import java.util.Set; import java.util.stream.Collectors; -import static org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; - /** * The checker for merge into update result. It will check each committable to see if some * global-indexed columns are updated. 
It will take some actions according to {@link @@ -168,7 +166,7 @@ private void checkUpdatedColumns() { private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { int fieldId = meta.indexFieldId(); - if (fieldId == MULTI_COLUMN_INDEX_FIELD_ID) { + if (meta.isMultiColumn()) { List names = new ArrayList<>(); for (int id : meta.extraFieldIds()) { names.add(rowType.getField(id).name()); From db9d30cc62c31edc82157e761428e85cbd752d3d Mon Sep 17 00:00:00 2001 From: CrownChu Date: Fri, 29 May 2026 20:04:44 +0800 Subject: [PATCH 13/20] [globalindex] Extract getIndexedFieldNames to GlobalIndexMeta and fix error message - Add GlobalIndexMeta.getIndexedFieldNames(RowType) to eliminate copy-pasted helper - Replace local getIndexedFieldNames in MergeIntoUpdateChecker (Flink) - Replace local getIndexedFieldNames in MergeIntoPaimonDataEvolutionTable (Spark common & 4.0) - Fix Spark CreateGlobalIndexProcedure error message to use indexColumns instead of column --- .../apache/paimon/index/GlobalIndexMeta.java | 19 +++++++++++++ .../dataevolution/MergeIntoUpdateChecker.java | 27 +++---------------- .../MergeIntoPaimonDataEvolutionTable.scala | 19 ++----------- .../procedure/CreateGlobalIndexProcedure.java | 2 +- .../MergeIntoPaimonDataEvolutionTable.scala | 19 ++----------- 5 files changed, 27 insertions(+), 59 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index 4bdb17c53817..60aba56d21ae 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -28,7 +28,9 @@ import javax.annotation.Nullable; +import java.util.ArrayList; import java.util.Arrays; +import java.util.List; /** Schema for global index. 
*/ public class GlobalIndexMeta { @@ -92,4 +94,21 @@ public int[] extraFieldIds() { public byte[] indexMeta() { return indexMeta; } + + public List getIndexedFieldNames(RowType rowType) { + List names = new ArrayList<>(); + if (isMultiColumn()) { + for (int id : extraFieldIds) { + names.add(rowType.getField(id).name()); + } + } else { + names.add(rowType.getField(indexFieldId).name()); + if (extraFieldIds != null) { + for (int id : extraFieldIds) { + names.add(rowType.getField(id).name()); + } + } + } + return names; + } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java index b66e5d008307..bdd0c0d49194 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/dataevolution/MergeIntoUpdateChecker.java @@ -39,8 +39,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -102,8 +100,8 @@ private void checkUpdatedColumns() { GlobalIndexMeta globalIndexMeta = entry.indexFile().globalIndexMeta(); if (globalIndexMeta != null) { - Collection indexedNames = - getIndexedFieldNames(globalIndexMeta, rowType); + List indexedNames = + globalIndexMeta.getIndexedFieldNames(rowType); boolean overlaps = indexedNames.stream() .anyMatch(updatedColumns::contains); @@ -121,7 +119,7 @@ private void checkUpdatedColumns() { Set conflictedColumns = affectedEntries.stream() .map(file -> file.indexFile().globalIndexMeta()) - .flatMap(meta -> getIndexedFieldNames(meta, rowType).stream()) + .flatMap(meta -> meta.getIndexedFieldNames(rowType).stream()) .collect(Collectors.toSet()); throw new RuntimeException( @@ -163,23 +161,4 
@@ private void checkUpdatedColumns() { } } } - - private static Collection getIndexedFieldNames(GlobalIndexMeta meta, RowType rowType) { - int fieldId = meta.indexFieldId(); - if (meta.isMultiColumn()) { - List names = new ArrayList<>(); - for (int id : meta.extraFieldIds()) { - names.add(rowType.getField(id).name()); - } - return names; - } - List names = new ArrayList<>(); - names.add(rowType.getField(fieldId).name()); - if (meta.extraFieldIds() != null) { - for (int id : meta.extraFieldIds()) { - names.add(rowType.getField(id).name()); - } - } - return names; - } } diff --git a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index 9c90a1e8445c..e72efe6efcaa 100644 --- a/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-4.0/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,7 +21,6 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry @@ -504,27 +503,13 @@ case class MergeIntoPaimonDataEvolutionTable( return updateCommit } - def getIndexedFieldNames( - meta: GlobalIndexMeta, - rt: org.apache.paimon.types.RowType): Seq[String] = { - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { - meta.extraFieldIds().map(id => rt.getField(id).name()).toSeq - } else { - val names = ArrayBuffer(rt.getField(meta.indexFieldId()).name()) - 
if (meta.extraFieldIds() != null) { - meta.extraFieldIds().foreach(id => names += rt.getField(id).name()) - } - names.toSeq - } - } - val filter: org.apache.paimon.utils.Filter[IndexManifestEntry] = (entry: IndexManifestEntry) => { val globalIndexMeta = entry.indexFile().globalIndexMeta() if (globalIndexMeta == null) { false } else { - val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) + val indexedNames = globalIndexMeta.getIndexedFieldNames(rowType).asScala affectedParts.contains(entry.partition()) && updateColumns.exists( col => indexedNames.contains(col.name)) } @@ -543,7 +528,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) + .flatMap(e => e.indexFile().globalIndexMeta().getIndexedFieldNames(rowType).asScala) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. 
diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index 18b6c807b231..a68758338899 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -199,7 +199,7 @@ public InternalRow[] call(InternalRow args) { throw new RuntimeException( String.format( "Failed to create %s index for columns '%s' on table '%s'.", - indexType, column, tableIdent), + indexType, indexColumns, tableIdent), e); } }); diff --git a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala index 15e03a74dbc0..8c84eafd1a5a 100644 --- a/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala +++ b/paimon-spark/paimon-spark-common/src/main/scala/org/apache/paimon/spark/commands/MergeIntoPaimonDataEvolutionTable.scala @@ -21,7 +21,6 @@ package org.apache.paimon.spark.commands import org.apache.paimon.CoreOptions.GlobalIndexColumnUpdateAction import org.apache.paimon.data.BinaryRow import org.apache.paimon.format.blob.BlobFileFormat.isBlobFile -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID import org.apache.paimon.index.GlobalIndexMeta import org.apache.paimon.io.{CompactIncrement, DataIncrement} import org.apache.paimon.manifest.IndexManifestEntry @@ -513,7 +512,7 @@ case class MergeIntoPaimonDataEvolutionTable( if (globalIndexMeta == null) { false } else { - val indexedNames = getIndexedFieldNames(globalIndexMeta, rowType) + val 
indexedNames = globalIndexMeta.getIndexedFieldNames(rowType).asScala affectedParts.contains(entry.partition()) && updateColumns.exists( col => indexedNames.contains(col.name)) } @@ -532,7 +531,7 @@ case class MergeIntoPaimonDataEvolutionTable( case GlobalIndexColumnUpdateAction.THROW_ERROR => val updatedColNames = updateColumns.map(_.name) val conflicted = affectedIndexEntries - .flatMap(e => getIndexedFieldNames(e.indexFile().globalIndexMeta(), rowType)) + .flatMap(e => e.indexFile().globalIndexMeta().getIndexedFieldNames(rowType).asScala) .toSet throw new RuntimeException( s"""MergeInto: update columns contain globally indexed columns, not supported now. @@ -556,20 +555,6 @@ case class MergeIntoPaimonDataEvolutionTable( } } - private def getIndexedFieldNames( - meta: GlobalIndexMeta, - rowType: org.apache.paimon.types.RowType): Seq[String] = { - if (meta.indexFieldId() == MULTI_COLUMN_INDEX_FIELD_ID) { - meta.extraFieldIds().map(id => rowType.getField(id).name()).toSeq - } else { - val names = ArrayBuffer(rowType.getField(meta.indexFieldId()).name()) - if (meta.extraFieldIds() != null) { - meta.extraFieldIds().foreach(id => names += rowType.getField(id).name()) - } - names.toSeq - } - } - private def findRelatedFirstRowIds( dataset: Dataset[Row], sparkSession: SparkSession, From 27548b1db34841311657eabfcc107f11df8ef35e Mon Sep 17 00:00:00 2001 From: CrownChu Date: Mon, 1 Jun 2026 20:05:47 +0800 Subject: [PATCH 14/20] [globalindex] Fix compilation error: move indexColumns out of try block indexColumns was declared inside the try block but referenced in the catch block's error message, which is out of scope. Hoist the parsing before the try so the catch can access it. 
--- .../spark/procedure/CreateGlobalIndexProcedure.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index a68758338899..a3173ca8ac1f 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -111,6 +111,11 @@ public InternalRow[] call(InternalRow args) { return modifySparkTable( tableIdent, sparkTable -> { + List indexColumns = + Arrays.stream(column.split(",")) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .collect(Collectors.toList()); try { org.apache.paimon.table.Table t = sparkTable.getTable(); checkArgument( @@ -123,11 +128,6 @@ public InternalRow[] call(InternalRow args) { tableIdent); RowType rowType = table.rowType(); - List indexColumns = - Arrays.stream(column.split(",")) - .map(String::trim) - .filter(s -> !s.isEmpty()) - .collect(Collectors.toList()); checkArgument(!indexColumns.isEmpty(), "At least one column required."); checkArgument( indexColumns.size() == new HashSet<>(indexColumns).size(), From df2d15b474154151060ff3509614354648514424 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 2 Jun 2026 10:52:35 +0800 Subject: [PATCH 15/20] [globalindex] Keep building through null values instead of ending the shard Breaking out of the shard loop on the first null indexed value dropped all later rows in the shard from the index and broke row-id alignment. Pass every row through the writer instead: a null field advances the logical row id without indexing a value, so later non-null rows are still indexed. 
- Flink single-column: restore null pass-through (was a regression) - Flink/Spark multi-column: pass the projected row through; each index type decides how to handle null fields --- .../globalindex/GenericIndexTopoBuilder.java | 29 ++++--------------- .../DefaultGlobalIndexBuilder.java | 26 +++-------------- 2 files changed, 10 insertions(+), 45 deletions(-) diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 99a551a9e4d9..4bba6fcc830b 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -658,33 +658,16 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { - boolean hasNull = false; - for (InternalRow.FieldGetter getter : indexFieldGetters) { - if (getter.getFieldOrNull(row) == null) { - hasNull = true; - break; - } - } - if (hasNull) { - LOG.info( - "Null value in indexed columns at rowId={}, stopping shard [{}, {}].", - currentRowId, - task.shardRange.from, - task.shardRange.to); - break; - } + // Pass the row through, including null fields; each index type + // decides how to handle nulls. A null field advances the logical + // row id without indexing a value, so it must not end the shard: + // later non-null rows still need to be indexed and row-id alignment + // must be preserved. ((GlobalIndexMultiColumnWriter) indexWriter) .write(writerProjection.replaceRow(row)); } else { + // A null value advances the logical row id without indexing. 
Object fieldData = indexFieldGetters[0].getFieldOrNull(row); - if (fieldData == null) { - LOG.info( - "Null value at rowId={}, stopping shard [{}, {}].", - currentRowId, - task.shardRange.from, - task.shardRange.to); - break; - } ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); } rowsSeen++; diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index a64045633c6b..713965b01a32 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -38,9 +38,6 @@ import org.apache.paimon.utils.ProjectedRow; import org.apache.paimon.utils.Range; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.io.IOException; import java.io.Serializable; import java.util.Collections; @@ -52,7 +49,6 @@ /** Default global index builder. 
*/ public class DefaultGlobalIndexBuilder implements Serializable { - private static final Logger LOG = LoggerFactory.getLogger(DefaultGlobalIndexBuilder.class); private static final long serialVersionUID = 1L; private final FileStoreTable table; @@ -133,31 +129,17 @@ private List writePaimonRows( GlobalIndexMultiColumnWriter multiWriter = (GlobalIndexMultiColumnWriter) indexWriter; int[] projection = new int[indexFields.size()]; - InternalRow.FieldGetter[] getters = new InternalRow.FieldGetter[indexFields.size()]; for (int i = 0; i < indexFields.size(); i++) { DataField field = indexFields.get(i); projection[i] = readType.getFieldIndex(field.name()); - getters[i] = - InternalRow.createFieldGetter( - field.type(), readType.getFieldIndex(field.name())); } ProjectedRow projectedRow = ProjectedRow.from(projection); while (rows.hasNext()) { InternalRow row = rows.next(); - boolean hasNull = false; - for (InternalRow.FieldGetter getter : getters) { - if (getter.getFieldOrNull(row) == null) { - hasNull = true; - break; - } - } - if (hasNull) { - LOG.info( - "Null value in indexed columns, stopping shard [{}, {}].", - rowRange.from, - rowRange.to); - break; - } + // Pass the row through, including null fields; each index type decides how to + // handle nulls. A null field advances the logical row id without indexing a + // value, so it must not end the shard: later non-null rows still need to be + // indexed and row-id alignment must be preserved. multiWriter.write(projectedRow.replaceRow(row)); rowCounter.add(1); } From a27c630283577fdbf220444de7a23929722085f6 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Tue, 2 Jun 2026 10:52:42 +0800 Subject: [PATCH 16/20] [globalindex] Let a field participate in multiple multi-column index groups The scanner mapped each field id to a single multi-column group, so a field shared by several multi-column indexes (e.g. (a,b) and (a,c)) threw "Inconsistent extraFieldIds" or silently dropped readers. Model fieldId -> list of groups instead. 
For evaluation, every index covering a single field returns the same matching row ids, so pick one index rather than running them all: prefer the single-column index, otherwise fall back to one multi-column group. --- .../globalindex/GlobalIndexScanner.java | 52 +++++++++++-------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index d31175666c71..adffa88152eb 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -54,7 +54,6 @@ import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; -import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. */ @@ -82,8 +81,9 @@ public GlobalIndexScanner( // Multi-column indexes: fieldIds -> indexType -> range -> files Map, Map>>> multiColumnMetas = new HashMap<>(); - // Reverse lookup: fieldId -> its multi-column group - Map> fieldToGroup = new HashMap<>(); + // Reverse lookup: fieldId -> all multi-column groups it participates in. A field can + // belong to several multi-column indexes (e.g. (a,b) and (a,c)) at the same time. 
+ Map>> fieldToGroups = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); @@ -91,25 +91,22 @@ public GlobalIndexScanner( Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); if (meta.isMultiColumn() && meta.extraFieldIds() != null) { - // Multi-column index: all participating fields share the same IndexFileMeta, - // so looking up from any fieldId returns identical index files. + // Multi-column index: all participating fields share the same IndexFileMeta. + // Multiple index files belonging to the same group are aggregated under the same + // multiColumnMetas key, and each participating field records this group. List fieldIds = Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); - // Validate consistency: all files in the same group must have identical - // extraFieldIds - if (fieldToGroup.containsKey(fieldIds.get(0))) { - List existingGroup = fieldToGroup.get(fieldIds.get(0)); - checkArgument( - existingGroup.equals(fieldIds), - "Inconsistent extraFieldIds across index files."); - } multiColumnMetas .computeIfAbsent(fieldIds, k -> new HashMap<>()) .computeIfAbsent(indexType, k -> new HashMap<>()) .computeIfAbsent(range, k -> new ArrayList<>()) .add(indexFile); for (int id : fieldIds) { - fieldToGroup.put(id, fieldIds); + List> groups = + fieldToGroups.computeIfAbsent(id, k -> new ArrayList<>()); + if (!groups.contains(fieldIds)) { + groups.add(fieldIds); + } } } else { // Single-column index @@ -124,19 +121,28 @@ public GlobalIndexScanner( IntFunction> readersFunction = fId -> { - List group = fieldToGroup.get(fId); - if (group != null) { - // Multi-column: resolve full field list - List fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, multiColumnMetas.get(group), fields); - } else { - // Single-column + // A filter on a single field can be served by any index covering that field, + 
// and every such index returns the same matching row ids. So pick ONE index + // instead of running them all: prefer the single-column index (purpose-built + // for this field and always able to serve the predicate); otherwise fall back + // to one of the multi-column groups this field participates in. + Map>> singleColumn = indexMetas.get(fId); + if (singleColumn != null) { return createReaders( indexFileReader, - indexMetas.get(fId), + singleColumn, Collections.singletonList(rowType.getField(fId))); } + List> groups = fieldToGroups.get(fId); + if (groups != null && !groups.isEmpty()) { + // No single-column index for this field: pick one of the multi-column + // groups it belongs to to accelerate the single-column filter. + List group = groups.get(0); + List fields = + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, multiColumnMetas.get(group), fields); + } + return Collections.emptyList(); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } From 6a0c49693a2a9821eb52a8e05e3ccf843a3bdfee Mon Sep 17 00:00:00 2001 From: CrownChu Date: Wed, 3 Jun 2026 02:14:34 +0800 Subject: [PATCH 17/20] [globalindex] Make indexFieldId the primary column for multi-column indexes Previously a multi-column index stored indexFieldId=-1 and put all field ids in extraFieldIds, treating columns as parallel. Switch to a primary-column model: indexFieldId is always the first (primary) column and extraFieldIds holds the remaining columns. A primary column can own at most one index. 
- GlobalIndexMeta: isMultiColumn() based on extraFieldIds; add getIndexedFieldIds() and getIndexedFields(); unify getIndexedFieldNames() - GlobalIndexBuilderUtils: drop MULTI_COLUMN_INDEX_FIELD_ID; first column becomes the primary, rest become extraFieldIds - GlobalIndexScanner: key indexes by primary field id; reject conflicting indexes that share a primary with different columns - IndexManifestFileHandler: reject added index files sharing a primary with an existing one over an overlapping row range - FullText/VectorReadImpl: resolve the full column list via getIndexedFields() - TableIndexesTable: show all indexed column names; log when names cannot resolve --- .../globalindex/GlobalIndexBuilderUtils.java | 26 +++--- .../globalindex/GlobalIndexScanner.java | 82 ++++++------------- .../apache/paimon/index/GlobalIndexMeta.java | 41 +++++++--- .../manifest/IndexManifestFileHandler.java | 29 ++----- .../paimon/table/source/FullTextReadImpl.java | 7 +- .../paimon/table/source/VectorReadImpl.java | 7 +- .../table/system/TableIndexesTable.java | 25 +++--- .../GlobalIndexBuilderUtilsTest.java | 13 +-- 8 files changed, 95 insertions(+), 135 deletions(-) diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java index 497d50ece6e9..62b13833b393 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtils.java @@ -47,8 +47,6 @@ public class GlobalIndexBuilderUtils { private static final Logger LOG = LoggerFactory.getLogger(GlobalIndexBuilderUtils.class); - public static final int MULTI_COLUMN_INDEX_FIELD_ID = -1; - public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -62,6 +60,12 @@ public static List toIndexFileMetas( fileIO, indexPathFactory, options, range, indexFieldId, null, indexType, 
entries); } + /** + * Builds the index file metas. The first column in {@code fields} is treated as the primary + * index column (e.g. the first column in {@code CREATE ... INDEX ON (a, b, c)}) and is stored + * as {@code indexFieldId}; the remaining columns go into {@code extraFieldIds}. Callers must + * pass {@code fields} in the intended column order. + */ public static List toIndexFileMetas( FileIO fileIO, IndexPathFactory indexPathFactory, @@ -71,15 +75,15 @@ public static List toIndexFileMetas( String indexType, List entries) throws IOException { - int indexFieldId; - int[] extraFieldIds; - if (fields.size() > 1) { - indexFieldId = MULTI_COLUMN_INDEX_FIELD_ID; - extraFieldIds = fields.stream().mapToInt(DataField::id).toArray(); - } else { - indexFieldId = fields.get(0).id(); - extraFieldIds = null; - } + // The first column is the primary index column and is stored as indexFieldId; the + // remaining columns (if any) go into extraFieldIds. + int indexFieldId = fields.get(0).id(); + int[] extraFieldIds = + fields.size() > 1 + ? 
fields.subList(1, fields.size()).stream() + .mapToInt(DataField::id) + .toArray() + : null; return toIndexFileMetas( fileIO, indexPathFactory, diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index adffa88152eb..960a56d3b97f 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -37,7 +37,6 @@ import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; @@ -54,6 +53,7 @@ import static org.apache.paimon.CoreOptions.GLOBAL_INDEX_THREAD_NUM; import static org.apache.paimon.predicate.PredicateVisitor.collectFieldNames; import static org.apache.paimon.table.source.snapshot.TimeTravelUtil.tryTravelOrLatest; +import static org.apache.paimon.utils.Preconditions.checkArgument; import static org.apache.paimon.utils.Preconditions.checkNotNull; /** Scanner for shard-based global indexes. */ @@ -75,74 +75,42 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); - - // Single-column indexes: fieldId -> indexType -> range -> files Map>>> indexMetas = new HashMap<>(); - // Multi-column indexes: fieldIds -> indexType -> range -> files - Map, Map>>> multiColumnMetas = - new HashMap<>(); - // Reverse lookup: fieldId -> all multi-column groups it participates in. A field can - // belong to several multi-column indexes (e.g. (a,b) and (a,c)) at the same time. 
- Map>> fieldToGroups = new HashMap<>(); - + Map> fieldIdToIndexFields = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - - if (meta.isMultiColumn() && meta.extraFieldIds() != null) { - // Multi-column index: all participating fields share the same IndexFileMeta. - // Multiple index files belonging to the same group are aggregated under the same - // multiColumnMetas key, and each participating field records this group. - List fieldIds = - Arrays.stream(meta.extraFieldIds()).boxed().collect(Collectors.toList()); - multiColumnMetas - .computeIfAbsent(fieldIds, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); - for (int id : fieldIds) { - List> groups = - fieldToGroups.computeIfAbsent(id, k -> new ArrayList<>()); - if (!groups.contains(fieldIds)) { - groups.add(fieldIds); - } - } + int fieldId = meta.indexFieldId(); + List indexFields = meta.getIndexedFieldIds(); + List existing = fieldIdToIndexFields.get(fieldId); + if (existing == null) { + fieldIdToIndexFields.put(fieldId, indexFields); } else { - // Single-column index - int fieldId = meta.indexFieldId(); - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); + checkArgument( + existing.equals(indexFields), + "Primary field %s owns multiple indexes with different columns %s and %s; " + + "a primary column can own at most one index.", + fieldId, + existing, + indexFields); } + indexMetas + .computeIfAbsent(fieldId, k -> new HashMap<>()) + .computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); } IntFunction> readersFunction = fId -> { - // A 
filter on a single field can be served by any index covering that field, - // and every such index returns the same matching row ids. So pick ONE index - // instead of running them all: prefer the single-column index (purpose-built - // for this field and always able to serve the predicate); otherwise fall back - // to one of the multi-column groups this field participates in. - Map<String, Map<Range, List<IndexFileMeta>>> singleColumn = indexMetas.get(fId); - if (singleColumn != null) { - return createReaders( - indexFileReader, - singleColumn, - Collections.singletonList(rowType.getField(fId))); - } - List<List<Integer>> groups = fieldToGroups.get(fId); - if (groups != null && !groups.isEmpty()) { - // No single-column index for this field: pick one of the multi-column - // groups it belongs to to accelerate the single-column filter. - List<Integer> group = groups.get(0); - List<DataField> fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, multiColumnMetas.get(group), fields); + List<Integer> group = fieldIdToIndexFields.get(fId); + if (group == null) { + return Collections.emptyList(); } - return Collections.emptyList(); + List<DataField> fields = + group.stream().map(rowType::getField).collect(Collectors.toList()); + return createReaders(indexFileReader, indexMetas.get(fId), fields); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } diff --git a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java index 60aba56d21ae..a987e994f9ea 100644 --- a/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java +++ b/paimon-core/src/main/java/org/apache/paimon/index/GlobalIndexMeta.java @@ -18,7 +18,6 @@ package org.apache.paimon.index; -import org.apache.paimon.globalindex.GlobalIndexBuilderUtils; import org.apache.paimon.types.BigIntType; import org.apache.paimon.types.DataField; import org.apache.paimon.types.DataTypes; @@ -81,8 +80,13 @@ public int
indexFieldId() { return indexFieldId; } + /** + * Whether this index covers more than one column. {@link #indexFieldId} is always the primary + * column; {@link #extraFieldIds} holds the remaining columns and is null/empty for a + * single-column index. + */ public boolean isMultiColumn() { - return indexFieldId == GlobalIndexBuilderUtils.MULTI_COLUMN_INDEX_FIELD_ID; + return extraFieldIds != null && extraFieldIds.length > 0; } @Nullable @@ -95,20 +99,31 @@ public byte[] indexMeta() { return indexMeta; } - public List<String> getIndexedFieldNames(RowType rowType) { - List<String> names = new ArrayList<>(); - if (isMultiColumn()) { + /** All indexed field ids in order: the primary {@link #indexFieldId} followed by the rest. */ + public List<Integer> getIndexedFieldIds() { + List<Integer> ids = new ArrayList<>(); + ids.add(indexFieldId); + if (extraFieldIds != null) { for (int id : extraFieldIds) { - names.add(rowType.getField(id).name()); - } - } else { - names.add(rowType.getField(indexFieldId).name()); - if (extraFieldIds != null) { - for (int id : extraFieldIds) { - names.add(rowType.getField(id).name()); - } + ids.add(id); } } + return ids; + } + + public List<DataField> getIndexedFields(RowType rowType) { + List<DataField> fields = new ArrayList<>(); + for (int id : getIndexedFieldIds()) { + fields.add(rowType.getField(id)); + } + return fields; + } + + public List<String> getIndexedFieldNames(RowType rowType) { + List<String> names = new ArrayList<>(); + for (int id : getIndexedFieldIds()) { + names.add(rowType.getField(id).name()); + } return names; } } diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 87628290810c..3dbd01d3a645 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,7 +28,6 @@ import javax.annotation.Nullable; import java.util.ArrayList;
-import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -243,27 +242,13 @@ private void validateRetainedIndexFiles( if (addedMeta == null) { continue; } - - // Single-column: skip if different fieldId or no range overlap - if (!retainedMeta.isMultiColumn()) { - if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { - continue; - } - } else { - // Multi-column: skip if different column group or no range overlap - if (!Arrays.equals(retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { - continue; - } + if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() + || !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd())) { + continue; } throw new IllegalStateException( diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java index e905da67a4e9..7831ebb67a92 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextReadImpl.java @@ -34,7 +34,6 @@ import org.apache.paimon.predicate.FullTextSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; -import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import java.util.ArrayList; @@ -73,11 +72,7 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - RowType rowType = table.rowType(); - List 
fields = new ArrayList<>(); - for (int id : firstMeta.extraFieldIds()) { - fields.add(rowType.getField(id)); - } + List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create(fields, table.coreOptions().toConfiguration()); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java index 9621a82e37ce..e3210e95c144 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorReadImpl.java @@ -36,7 +36,6 @@ import org.apache.paimon.predicate.VectorSearch; import org.apache.paimon.table.FileStoreTable; import org.apache.paimon.types.DataField; -import org.apache.paimon.types.RowType; import org.apache.paimon.utils.IOUtils; import org.apache.paimon.utils.RoaringNavigableMap64; @@ -90,11 +89,7 @@ public GlobalIndexResult read(List splits) { GlobalIndexMeta firstMeta = checkNotNull(firstFile.globalIndexMeta()); GlobalIndexer globalIndexer; if (firstMeta.isMultiColumn()) { - RowType rowType = table.rowType(); - List fields = new ArrayList<>(); - for (int id : firstMeta.extraFieldIds()) { - fields.add(rowType.getField(id)); - } + List fields = firstMeta.getIndexedFields(table.rowType()); globalIndexer = GlobalIndexerFactoryUtils.load(indexType) .create(fields, table.coreOptions().toConfiguration()); diff --git a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java index f5e693d4a79d..9ad88e977b3d 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/system/TableIndexesTable.java @@ -234,20 +234,17 @@ private InternalRow toRow( GlobalIndexMeta globalMeta = indexManifestEntry.indexFile().globalIndexMeta(); 
String indexFieldName = null; if (globalMeta != null) { - if (globalMeta.isMultiColumn() && globalMeta.extraFieldIds() != null) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < globalMeta.extraFieldIds().length; i++) { - if (i > 0) { - sb.append(","); - } - sb.append(logicalRowType.getField(globalMeta.extraFieldIds()[i]).name()); - } - indexFieldName = sb.toString(); - } else { - try { - indexFieldName = logicalRowType.getField(globalMeta.indexFieldId()).name(); - } catch (RuntimeException ignored) { - } + try { + indexFieldName = + String.join(",", globalMeta.getIndexedFieldNames(logicalRowType)); + } catch (RuntimeException e) { + // Indexed columns may no longer exist in the current schema (e.g. dropped via + // ALTER TABLE); leave the name empty instead of failing the listing. + LOG.debug( + "Failed to resolve indexed field names for index file {} (primary field {}).", + indexManifestEntry.indexFile().fileName(), + globalMeta.indexFieldId(), + e); } } return GenericRow.of( diff --git a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java index 703c01c69633..67852ae925ff 100644 --- a/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java +++ b/paimon-core/src/test/java/org/apache/paimon/globalindex/GlobalIndexBuilderUtilsTest.java @@ -77,7 +77,7 @@ public boolean isExternalPath() { coreOptions = new CoreOptions(new Options().toMap()); } - // Test: 2 columns (title + vec), indexFieldId=-1, all field ids stored in extraFieldIds + // Test: 2 columns (title + vec), primary column title is indexFieldId, rest in extraFieldIds @Test void testToIndexFileMetasMultiColumn() throws IOException { DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); @@ -92,8 +92,8 @@ void testToIndexFileMetasMultiColumn() throws IOException { fileIO, indexPathFactory, coreOptions, 
range, fields, "test-type", entries); assertThat(metas).hasSize(1); - assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); - assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2}); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {2}); assertThat(metas.get(0).globalIndexMeta().rowRangeStart()).isEqualTo(0); assertThat(metas.get(0).globalIndexMeta().rowRangeEnd()).isEqualTo(99); } @@ -117,7 +117,8 @@ void testToIndexFileMetasSingleColumn() throws IOException { assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isNull(); } - // Test: 3 columns (title + vec + id), indexFieldId=-1, all field ids in extraFieldIds + // Test: 3 columns (title + vec + id), primary column title is indexFieldId, rest in + // extraFieldIds @Test void testToIndexFileMetasThreeColumns() throws IOException { DataField titleField = new DataField(1, "title", new VarCharType(Integer.MAX_VALUE)); @@ -133,8 +134,8 @@ void testToIndexFileMetasThreeColumns() throws IOException { fileIO, indexPathFactory, coreOptions, range, fields, "test-type", entries); assertThat(metas).hasSize(1); - assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(-1); - assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {1, 2, 3}); + assertThat(metas.get(0).globalIndexMeta().indexFieldId()).isEqualTo(1); + assertThat(metas.get(0).globalIndexMeta().extraFieldIds()).isEqualTo(new int[] {2, 3}); } private List createDummyResultEntries() throws IOException { From cf6f843c6446898c55cd0cbd1ca7383bdd109ea1 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Thu, 4 Jun 2026 01:25:08 +0800 Subject: [PATCH 18/20] [globalindex] Refine scanner routing, multi-column writer rowId, and overlap checks - GlobalIndexScanner: split single-/multi-column lookups (IndexMetaFileGroup), single-column index takes priority, fall back to the first 
multi-column index that has the field as an extra; reject a primary owning multiple indexes - GlobalIndexMultiColumnWriter.write now takes the shard-relative row id; the builders pass projected index columns plus that id - DefaultGlobalIndexBuilder (Spark): multi-column skips rows outside the shard range so the relative row id stays valid for boundary-spanning files - IndexManifestFileHandler: same-primary indexes with different columns always conflict, same columns only conflict on overlapping ranges - FullText/VectorScanImpl: match indexes by their primary column --- .../GlobalIndexMultiColumnWriter.java | 12 ++-- .../globalindex/GlobalIndexScanner.java | 69 +++++++++++++------ .../manifest/IndexManifestFileHandler.java | 19 ++--- .../paimon/table/source/FullTextScanImpl.java | 12 +--- .../paimon/table/source/VectorScanImpl.java | 30 ++------ .../globalindex/GenericIndexTopoBuilder.java | 9 +-- .../procedure/CreateGlobalIndexProcedure.java | 5 -- .../DefaultGlobalIndexBuilder.java | 12 ++-- 8 files changed, 82 insertions(+), 86 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java index a6ded78d33fd..58a847b64ca8 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexMultiColumnWriter.java @@ -26,9 +26,13 @@ public interface GlobalIndexMultiColumnWriter extends GlobalIndexWriter { /** - * Write a projected row containing all indexed columns for one record. The row layout matches - * the fields order passed to {@link GlobalIndexerFactory#create(java.util.List, - * org.apache.paimon.options.Options)}. + * Write one record's indexed columns at the given relative row id. 
+ * + * @param rowId the record's row id relative to the current shard (0 to rowCnt - 1); a null row + * still advances the row id without indexing a value + * @param row a projected row containing only the indexed columns, whose layout matches the + * fields order passed to {@link GlobalIndexerFactory#create(java.util.List, + * org.apache.paimon.options.Options)} */ - void write(@Nullable InternalRow row); + void write(long rowId, @Nullable InternalRow row); } diff --git a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java index 960a56d3b97f..f7264b1eb4dc 100644 --- a/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java +++ b/paimon-core/src/main/java/org/apache/paimon/globalindex/GlobalIndexScanner.java @@ -75,46 +75,73 @@ public GlobalIndexScanner( GlobalIndexReadThreadPool.getExecutorService(options.get(GLOBAL_INDEX_THREAD_NUM)); this.indexPathFactory = indexPathFactory; GlobalIndexFileReader indexFileReader = meta -> fileIO.newInputStream(meta.filePath()); - Map<Integer, Map<String, Map<Range, List<IndexFileMeta>>>> indexMetas = new HashMap<>(); - Map<Integer, List<Integer>> fieldIdToIndexFields = new HashMap<>(); + Map<Integer, IndexMetaFileGroup> indexMetas = new HashMap<>(); + Map<Integer, List<IndexMetaFileGroup>> extraIndexMetas = new HashMap<>(); for (IndexFileMeta indexFile : indexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); String indexType = indexFile.indexType(); Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); - int fieldId = meta.indexFieldId(); - List<Integer> indexFields = meta.getIndexedFieldIds(); - List<Integer> existing = fieldIdToIndexFields.get(fieldId); - if (existing == null) { - fieldIdToIndexFields.put(fieldId, indexFields); + int indexFieldId = meta.indexFieldId(); + List<Integer> fieldIds = meta.getIndexedFieldIds(); + IndexMetaFileGroup group = indexMetas.get(indexFieldId); + if (group == null) { + group = new IndexMetaFileGroup(indexFieldId, fieldIds); + indexMetas.put(indexFieldId, group); + if
(meta.extraFieldIds() != null) { + for (int extra : meta.extraFieldIds()) { + extraIndexMetas.computeIfAbsent(extra, k -> new ArrayList<>()).add(group); + } + } } else { checkArgument( - existing.equals(indexFields), + group.fieldIds.equals(fieldIds), "Primary field %s owns multiple indexes with different columns %s and %s; " + "a primary column can own at most one index.", - fieldId, - existing, - indexFields); + indexFieldId, + group.fieldIds, + fieldIds); } - indexMetas - .computeIfAbsent(fieldId, k -> new HashMap<>()) - .computeIfAbsent(indexType, k -> new HashMap<>()) - .computeIfAbsent(range, k -> new ArrayList<>()) - .add(indexFile); + group.addFile(indexType, range, indexFile); } IntFunction> readersFunction = fId -> { - List group = fieldIdToIndexFields.get(fId); + IndexMetaFileGroup group = indexMetas.get(fId); if (group == null) { - return Collections.emptyList(); + List extraGroups = extraIndexMetas.get(fId); + if (extraGroups == null || extraGroups.isEmpty()) { + return Collections.emptyList(); + } + group = extraGroups.get(0); } List fields = - group.stream().map(rowType::getField).collect(Collectors.toList()); - return createReaders(indexFileReader, indexMetas.get(fId), fields); + group.fieldIds.stream() + .map(rowType::getField) + .collect(Collectors.toList()); + return createReaders(indexFileReader, group.metas, fields); }; this.globalIndexEvaluator = new GlobalIndexEvaluator(rowType, readersFunction); } + /** All index files of one global index (single- or multi-column), grouped for reading. 
*/ + private static class IndexMetaFileGroup { + + private final int indexFieldId; + private final List<Integer> fieldIds; + private final Map<String, Map<Range, List<IndexFileMeta>>> metas = new HashMap<>(); + + IndexMetaFileGroup(int indexFieldId, List<Integer> fieldIds) { + this.indexFieldId = indexFieldId; + this.fieldIds = fieldIds; + } + + void addFile(String indexType, Range range, IndexFileMeta indexFile) { + metas.computeIfAbsent(indexType, k -> new HashMap<>()) + .computeIfAbsent(range, k -> new ArrayList<>()) + .add(indexFile); + } + } + public static Optional create( FileStoreTable table, Collection indexFiles) { if (indexFiles.isEmpty()) { @@ -145,6 +172,8 @@ public static Optional create( if (globalIndex == null) { return false; } + // Collect indexes whose primary column is filtered, and also multi-column + // indexes that have a filtered column as an extra (used as a fallback). if (filterFieldIds.contains(globalIndex.indexFieldId())) { return true; } diff --git a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java index 3dbd01d3a645..f99278085550 100644 --- a/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java +++ b/paimon-core/src/main/java/org/apache/paimon/manifest/IndexManifestFileHandler.java @@ -28,6 +28,7 @@ import javax.annotation.Nullable; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -239,15 +240,15 @@ private void validateRetainedIndexFiles( for (IndexManifestEntry added : addedIndexFiles) { GlobalIndexMeta addedMeta = added.indexFile().globalIndexMeta(); - if (addedMeta == null) { - continue; - } - if (retainedMeta.indexFieldId() != addedMeta.indexFieldId() - || !Range.intersect( - retainedMeta.rowRangeStart(), - retainedMeta.rowRangeEnd(), - addedMeta.rowRangeStart(), - addedMeta.rowRangeEnd())) { - continue; - } + if (addedMeta == null + || retainedMeta.indexFieldId() !=
addedMeta.indexFieldId() + || (Arrays.equals( + retainedMeta.extraFieldIds(), addedMeta.extraFieldIds()) + && !Range.intersect( + retainedMeta.rowRangeStart(), + retainedMeta.rowRangeEnd(), + addedMeta.rowRangeStart(), + addedMeta.rowRangeEnd()))) { continue; } diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java index 6230b31336a3..cc77d9121ad5 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/FullTextScanImpl.java @@ -61,17 +61,7 @@ public Plan scan() { if (globalIndex == null) { return false; } - if (textColumn.id() == globalIndex.indexFieldId()) { - return true; - } - if (globalIndex.extraFieldIds() != null) { - for (int id : globalIndex.extraFieldIds()) { - if (textColumn.id() == id) { - return true; - } - } - } - return false; + return textColumn.id() == globalIndex.indexFieldId(); }; List allIndexFiles = diff --git a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java index 1ff3f82852f6..5098cc959129 100644 --- a/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java +++ b/paimon-core/src/main/java/org/apache/paimon/table/source/VectorScanImpl.java @@ -82,17 +82,7 @@ public Plan scan() { return false; } int fieldId = globalIndex.indexFieldId(); - if (vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId)) { - return true; - } - if (globalIndex.extraFieldIds() != null) { - for (int id : globalIndex.extraFieldIds()) { - if (vectorColumn.id() == id || filterFieldIds.contains(id)) { - return true; - } - } - } - return false; + return vectorColumn.id() == fieldId || filterFieldIds.contains(fieldId); }; List allIndexFiles = @@ -104,7 +94,7 @@ public Plan scan() { Map> vectorByRange = new HashMap<>(); for 
(IndexFileMeta indexFile : allIndexFiles) { GlobalIndexMeta meta = checkNotNull(indexFile.globalIndexMeta()); - if (containsField(meta, vectorColumn.id())) { + if (isPrimaryColumn(meta, vectorColumn.id())) { Range range = new Range(meta.rowRangeStart(), meta.rowRangeEnd()); vectorByRange.computeIfAbsent(range, k -> new ArrayList<>()).add(indexFile); } @@ -121,7 +111,7 @@ public Plan scan() { f -> { GlobalIndexMeta globalIndex = checkNotNull(f.globalIndexMeta()); - if (containsField(globalIndex, vectorColumn.id())) { + if (isPrimaryColumn(globalIndex, vectorColumn.id())) { return false; } return range.hasIntersection(globalIndex.rowRange()); @@ -133,17 +123,7 @@ public Plan scan() { return () -> splits; } - private static boolean containsField(GlobalIndexMeta meta, int fieldId) { - if (meta.indexFieldId() == fieldId) { - return true; - } - if (meta.extraFieldIds() != null) { - for (int id : meta.extraFieldIds()) { - if (id == fieldId) { - return true; - } - } - } - return false; + private static boolean isPrimaryColumn(GlobalIndexMeta meta, int fieldId) { + return meta.indexFieldId() == fieldId; } } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java index 4bba6fcc830b..c94bc2deda65 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/globalindex/GenericIndexTopoBuilder.java @@ -658,15 +658,10 @@ public void processElement(StreamRecord element) throws Exception { // Only write rows within this shard's range if (currentRowId >= task.shardRange.from) { if (multiColumn) { - // Pass the row through, including null fields; each index type - // decides how to handle nulls. 
A null field advances the logical - // row id without indexing a value, so it must not end the shard: - // later non-null rows still need to be indexed and row-id alignment - // must be preserved. + long rowId = currentRowId - task.shardRange.from; ((GlobalIndexMultiColumnWriter) indexWriter) - .write(writerProjection.replaceRow(row)); + .write(rowId, writerProjection.replaceRow(row)); } else { - // A null value advances the logical row id without indexing. Object fieldData = indexFieldGetters[0].getFieldOrNull(row); ((GlobalIndexSingletonWriter) indexWriter).write(fieldData); } diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index f9f54918ea4a..c9cd4c5e39ee 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -90,11 +90,6 @@ public String[] call( indexColumns.size() == new HashSet<>(indexColumns).size(), "Duplicate index columns are not allowed: %s", indexColumns); - // No hard cap on the number of index columns: unlike row-store B-tree indexes - // (e.g. MySQL 16, PostgreSQL 32) whose limit comes from composing columns into a - // single key, the global index is built on per-type index frameworks. Whether - // multiple columns are supported, and any practical limit, is decided by each - // index type (single-column types reject multi-column via UnsupportedOperationException). 
for (String col : indexColumns) { checkArgument( rowType.containsField(col), diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java index 713965b01a32..bccf4899652c 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/globalindex/DefaultGlobalIndexBuilder.java @@ -29,6 +29,7 @@ import org.apache.paimon.io.DataIncrement; import org.apache.paimon.options.Options; import org.apache.paimon.table.FileStoreTable; +import org.apache.paimon.table.SpecialFields; import org.apache.paimon.table.sink.CommitMessage; import org.apache.paimon.table.sink.CommitMessageImpl; import org.apache.paimon.types.DataField; @@ -134,13 +135,14 @@ private List writePaimonRows( projection[i] = readType.getFieldIndex(field.name()); } ProjectedRow projectedRow = ProjectedRow.from(projection); + int rowIdIndex = readType.getFieldIndex(SpecialFields.ROW_ID.name()); while (rows.hasNext()) { InternalRow row = rows.next(); - // Pass the row through, including null fields; each index type decides how to - // handle nulls. A null field advances the logical row id without indexing a - // value, so it must not end the shard: later non-null rows still need to be - // indexed and row-id alignment must be preserved. 
- multiWriter.write(projectedRow.replaceRow(row)); + long absRowId = row.getLong(rowIdIndex); + if (absRowId < rowRange.from || absRowId > rowRange.to) { + continue; + } + multiWriter.write(absRowId - rowRange.from, projectedRow.replaceRow(row)); rowCounter.add(1); } } else { From fde5010575dfac53b3f5ebd40cedf28f1c0e37cb Mon Sep 17 00:00:00 2001 From: CrownChu Date: Thu, 4 Jun 2026 16:09:58 +0800 Subject: [PATCH 19/20] [globalindex] Reject unsupported multi-column index types at creation time Add GlobalIndexerFactory.supportsMultiColumn() (default false). CreateGlobalIndexProcedure (Spark and Flink) now checks it up front and fails fast with a clear message when a multi-column index is requested for a type whose factory does not support it, instead of failing later in the build job when create(List) throws. --- .../globalindex/GlobalIndexerFactory.java | 8 ++++++++ .../procedure/CreateGlobalIndexProcedure.java | 10 +++++++--- .../procedure/CreateGlobalIndexProcedure.java | 18 ++++++++++++------ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java index cef643fa463f..b028ba4470cb 100644 --- a/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java +++ b/paimon-common/src/main/java/org/apache/paimon/globalindex/GlobalIndexerFactory.java @@ -31,6 +31,14 @@ public interface GlobalIndexerFactory { GlobalIndexer create(DataField dataField, Options options); + /** + * Whether this index type supports multi-column indexes. A factory that returns {@code true} + * must override {@link #create(List, Options)} to handle more than one column. 
+ */ + default boolean supportsMultiColumn() { + return false; + } + default GlobalIndexer create(List<DataField> fields, Options options) { if (fields.size() > 1) { throw new UnsupportedOperationException( diff --git a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java index c9cd4c5e39ee..dc3eccb513bf 100644 --- a/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-flink/paimon-flink-common/src/main/java/org/apache/paimon/flink/procedure/CreateGlobalIndexProcedure.java @@ -20,6 +20,7 @@ import org.apache.paimon.flink.btree.BTreeIndexTopoBuilder; import org.apache.paimon.flink.globalindex.GenericIndexTopoBuilder; +import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.predicate.Predicate; @@ -107,10 +108,13 @@ public String[] call( // Build global index based on index type indexType = indexType.toLowerCase().trim(); - if ("btree".equals(indexType)) { + if (indexColumns.size() > 1) { + // Whether multi-column is supported is decided by each index type's factory; fail fast + // up front instead of failing later in the build job.
checkArgument( - indexColumns.size() == 1, - "BTree index only supports single column, got: %s", + GlobalIndexerFactoryUtils.load(indexType).supportsMultiColumn(), + "Index type '%s' does not support multi-column index, got columns: %s", + indexType, indexColumns); } try { diff --git a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java index a3173ca8ac1f..9bdb5c254290 100644 --- a/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java +++ b/paimon-spark/paimon-spark-common/src/main/java/org/apache/paimon/spark/procedure/CreateGlobalIndexProcedure.java @@ -18,6 +18,7 @@ package org.apache.paimon.spark.procedure; +import org.apache.paimon.globalindex.GlobalIndexerFactoryUtils; import org.apache.paimon.options.Options; import org.apache.paimon.partition.PartitionPredicate; import org.apache.paimon.spark.globalindex.GlobalIndexTopologyBuilder; @@ -146,12 +147,6 @@ public InternalRow[] call(InternalRow args) { col, tableIdent); } - if ("btree".equalsIgnoreCase(indexType)) { - checkArgument( - indexColumns.size() == 1, - "BTree index only supports single column, got: %s", - indexColumns); - } DataSourceV2Relation relation = createRelation(tableIdent, sparkTable); PartitionPredicate partitionPredicate = SparkProcedureUtils.convertToPartitionPredicate( @@ -171,6 +166,17 @@ public InternalRow[] call(InternalRow args) { ProcedureUtils.putAllOptions(parsedOptions, optionString); Options userOptions = Options.fromMap(parsedOptions); + if (indexColumns.size() > 1) { + // Whether multi-column is supported is decided by each index type's + // factory; fail fast up front instead of failing later in the build + // job. 
+ checkArgument( + GlobalIndexerFactoryUtils.load(indexType).supportsMultiColumn(), + "Index type '%s' does not support multi-column index, got columns: %s", + indexType, + indexColumns); + } + GlobalIndexTopologyBuilder topoBuilder = GlobalIndexTopologyBuilderUtils.createTopoBuilder(indexType); From dbaea6a6cb099db22ab8246626f4e3dc1bb6a410 Mon Sep 17 00:00:00 2001 From: CrownChu Date: Thu, 4 Jun 2026 19:40:50 +0800 Subject: [PATCH 20/20] trigger CI re-run