From 942cce4da72ab2cb53b5c7d4eb0670f2d0f6e191 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Mon, 1 Jun 2026 12:59:21 +0800 Subject: [PATCH 01/10] [fix](be) Bound ANN build memory and train IVF indexes once --- be/src/common/config.cpp | 17 +- be/src/common/config.h | 6 +- be/src/storage/index/ann/ann_index_writer.cpp | 357 +++++++-- be/src/storage/index/ann/ann_index_writer.h | 38 +- be/src/storage/index/ann/faiss_ann_index.cpp | 3 +- .../index/ann/ann_index_writer_test.cpp | 703 +++++++++--------- .../ann_index_build_chunk_bytes.out | 8 + .../ann_index_p0/ivf_on_disk_index_test.out | 4 + .../data/ann_index_p0/ivf_pq_recall.out | 9 + .../ann_index_build_chunk_bytes.groovy | 56 ++ .../ivf_on_disk_index_test.groovy | 16 +- .../suites/ann_index_p0/ivf_pq_recall.groovy | 87 +++ 12 files changed, 852 insertions(+), 452 deletions(-) create mode 100644 regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out create mode 100644 regression-test/data/ann_index_p0/ivf_pq_recall.out create mode 100644 regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy create mode 100644 regression-test/suites/ann_index_p0/ivf_pq_recall.groovy diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 7b6db1636234aa..cc4a7c86254717 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1757,12 +1757,27 @@ DEFINE_String(ann_index_ivf_list_cache_limit, "70%"); // Stale sweep time for ANN index IVF list cache in seconds. 3600s is 1 hour. DEFINE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec, "3600"); -// Chunk size for ANN/vector index building per training/adding batch +// Target row count upper bound for ANN/vector index build add batch and training sample. +// The effective build chunk also respects ann_index_build_chunk_bytes. // 1M By default. DEFINE_mInt64(ann_index_build_chunk_size, "1000000"); DEFINE_Validator(ann_index_build_chunk_size, [](const int64_t config) -> bool { return config > 0; }); +// Target byte bound for ANN/vector index build add batch and memory buffer before flush. +// If index-required minimum training rows cannot fit in this bound, skip ANN build for the segment. +// 128MB By default. +DEFINE_mInt64(ann_index_build_chunk_bytes, "134217728"); +DEFINE_Validator(ann_index_build_chunk_bytes, + [](const int64_t config) -> bool { return config > 0; }); + +// Maximum row count for ANN/vector index training sample. +// The effective sample keeps at least the index-required minimum training rows. +// 1M By default. +DEFINE_mInt64(ann_index_build_max_train_rows, "1000000"); +DEFINE_Validator(ann_index_build_max_train_rows, + [](const int64_t config) -> bool { return config > 0; }); + DEFINE_mBool(enable_wal_tde, "false"); DEFINE_mBool(print_stack_when_cache_miss, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 427282a4452bc4..3153b3ef9639a7 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1809,8 +1809,12 @@ DECLARE_mInt32(max_segment_partial_column_cache_size); DECLARE_String(ann_index_ivf_list_cache_limit); // Stale sweep time for ANN index IVF list cache in seconds. DECLARE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec); -// Chunk size for ANN/vector index building per training/adding batch +// Target row count upper bound for ANN/vector index build add batch and training sample. DECLARE_mInt64(ann_index_build_chunk_size); +// Target byte bound for ANN/vector index build add batch and memory buffer before flush. +DECLARE_mInt64(ann_index_build_chunk_bytes); +// Maximum row count for ANN/vector index training sample. +DECLARE_mInt64(ann_index_build_max_train_rows); DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 28d348cc319a48..d441f3a8b37f3f 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -17,13 +17,23 @@ #include "storage/index/ann/ann_index_writer.h" +#include + +#include #include +#include #include #include #include "common/cast_set.h" +#include "io/fs/file_reader.h" +#include "io/fs/file_writer.h" +#include "io/fs/local_file_system.h" +#include "runtime/exec_env.h" #include "storage/index/ann/faiss_ann_index.h" #include "storage/index/inverted/inverted_index_fs_directory.h" +#include "util/slice.h" +#include "util/uid_util.h" namespace doris::segment_v2 { static std::string get_or_default(const std::map& properties, @@ -39,7 +49,9 @@ AnnIndexColumnWriter::AnnIndexColumnWriter(IndexFileWriter* index_file_writer, const TabletIndex* index_meta) : _index_file_writer(index_file_writer), _index_meta(index_meta) {} -AnnIndexColumnWriter::~AnnIndexColumnWriter() {} +AnnIndexColumnWriter::~AnnIndexColumnWriter() { + _delete_spool_file(); +} Status AnnIndexColumnWriter::init() { Result> compound_dir = _index_file_writer->open(_index_meta); @@ -70,6 +82,8 @@ Status AnnIndexColumnWriter::init() { faiss_index->build(build_parameter); _vector_index = faiss_index; + _training_sample_seen_rows = 0; + _training_sample_rng.seed(0); LOG_INFO( "Create a new faiss index, index_type {} dim {} metric_type {} max_degree {}, " @@ -77,9 +91,6 @@ Status AnnIndexColumnWriter::init() { index_type, build_parameter.dim, metric_type, build_parameter.max_degree, build_parameter.ef_construction, quantizer); - size_t block_size = AnnIndexColumnWriter::chunk_size() * build_parameter.dim; - _float_array.reserve(block_size); - return Status::OK(); } @@ -87,7 +98,46 @@ Status AnnIndexColumnWriter::add_values(const std::string fn, const void* values return Status::OK(); } -void AnnIndexColumnWriter::close_on_error() {} +void AnnIndexColumnWriter::close_on_error() { + _delete_spool_file(); + _release_buffered_vectors(); + _training_sample.clear(); + _read_buffer.clear(); + _skip_build = true; +} + +size_t AnnIndexColumnWriter::_chunk_rows_by_bytes(size_t dim) const { + DCHECK(dim > 0); + static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); + DORIS_CHECK(dim <= static_cast(std::numeric_limits::max() / FLOAT_BYTES)); + const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; + return cast_set(std::max(1, AnnIndexColumnWriter::chunk_bytes() / vector_bytes)); +} + +size_t AnnIndexColumnWriter::_add_chunk_rows(size_t dim) const { + return cast_set( + std::max(1, std::min(AnnIndexColumnWriter::chunk_size(), + cast_set(_chunk_rows_by_bytes(dim))))); +} + +bool AnnIndexColumnWriter::_train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const { + DCHECK(dim > 0); + DCHECK(min_train_rows >= 0); + if (min_train_rows == 0) { + return false; + } + static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); + DORIS_CHECK(dim <= static_cast(std::numeric_limits::max() / FLOAT_BYTES)); + const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; + return min_train_rows > AnnIndexColumnWriter::chunk_bytes() / vector_bytes; +} + +size_t AnnIndexColumnWriter::_training_sample_rows_limit(Int64 min_train_rows, size_t dim) const { + DCHECK(min_train_rows > 0); + return cast_set(std::max( + min_train_rows, std::min(config::ann_index_build_max_train_rows, + cast_set(_add_chunk_rows(dim))))); +} Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, const uint8_t* offsets_ptr, @@ -109,26 +159,36 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val const float* p = reinterpret_cast(value_ptr); - const size_t full_elements = AnnIndexColumnWriter::chunk_size() * dim; - size_t remaining_elements = num_rows * dim; - size_t src_offset = 0; - while (remaining_elements > 0) { - size_t available_space = full_elements - _float_array.size(); - size_t elements_to_add = std::min(remaining_elements, available_space); - - _float_array.insert(_float_array.end(), p + src_offset, p + src_offset + elements_to_add); - src_offset += elements_to_add; - remaining_elements -= elements_to_add; - - if (_float_array.size() == full_elements) { - RETURN_IF_ERROR( - _vector_index->train(AnnIndexColumnWriter::chunk_size(), _float_array.data())); - RETURN_IF_ERROR( - _vector_index->add(AnnIndexColumnWriter::chunk_size(), _float_array.data())); - _float_array.clear(); - _need_save_index = true; + if (_skip_build) { + return Status::OK(); + } + + const Int64 min_train_rows = _vector_index->get_min_train_rows(); + if (min_train_rows == 0) { + RETURN_IF_ERROR(_add_vectors_in_chunks(p, num_rows)); + } else { + if (_train_rows_exceed_chunk_bytes(dim, min_train_rows)) { + static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); + const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; + const Int64 required_bytes = + min_train_rows > std::numeric_limits::max() / vector_bytes + ? std::numeric_limits::max() + : min_train_rows * vector_bytes; + LOG(WARNING) << "Skip ANN index build because minimum training rows exceed build chunk " + "byte limit, dim=" + << dim << ", min_train_rows=" << min_train_rows + << ", required_bytes=" << required_bytes + << ", ann_index_build_chunk_bytes=" << AnnIndexColumnWriter::chunk_bytes(); + _skip_build = true; + _release_buffered_vectors(); + _training_sample.clear(); + _read_buffer.clear(); + _delete_spool_file(); + return Status::OK(); } + RETURN_IF_ERROR(_append_vectors_need_train(p, num_rows, min_train_rows)); } + _total_rows += cast_set(num_rows); return Status::OK(); } @@ -146,54 +206,223 @@ int64_t AnnIndexColumnWriter::size() const { } Status AnnIndexColumnWriter::finish() { - Int64 min_train_rows = _vector_index->get_min_train_rows(); + if (_skip_build || _total_rows == 0) { + LOG_INFO("No data to train/add for ANN index. Skipping index building."); + Status st = _index_file_writer->delete_index(_index_meta); + _delete_spool_file(); + return st; + } + + const Int64 min_train_rows = _vector_index->get_min_train_rows(); + Status st = + min_train_rows == 0 ? _vector_index->save(_dir.get()) : _train_and_add(min_train_rows); + _delete_spool_file(); + return st; +} + +Status AnnIndexColumnWriter::_add_vectors_in_chunks(const float* vectors, size_t num_rows) { + DCHECK(vectors != nullptr); + DCHECK(num_rows > 0); + + const size_t dim = _vector_index->get_dimension(); + const size_t chunk_rows = _add_chunk_rows(dim); + size_t row_offset = 0; + while (row_offset < num_rows) { + const size_t rows_to_add = std::min(chunk_rows, num_rows - row_offset); + RETURN_IF_ERROR( + _vector_index->add(cast_set(rows_to_add), vectors + row_offset * dim)); + row_offset += rows_to_add; + } + return Status::OK(); +} + +Status AnnIndexColumnWriter::_append_vectors_need_train(const float* vectors, size_t num_rows, + Int64 min_train_rows) { + DCHECK(vectors != nullptr); + DCHECK(num_rows > 0); + + const size_t dim = _vector_index->get_dimension(); + const size_t sample_rows_limit = _training_sample_rows_limit(min_train_rows, dim); + if (!_spool_file_path.empty()) { + _sample_training_vectors(vectors, num_rows, dim, sample_rows_limit); + return _append_to_spool_file(vectors, num_rows * dim); + } + + const size_t buffered_rows = _buffered_vectors.size() / dim; + const size_t buffered_rows_limit = sample_rows_limit; + if (buffered_rows <= buffered_rows_limit && num_rows <= buffered_rows_limit - buffered_rows) { + _buffered_vectors.insert(_buffered_vectors.end(), vectors, vectors + num_rows * dim); + return Status::OK(); + } + + RETURN_IF_ERROR(_spill_buffered_vectors(dim, sample_rows_limit)); + _sample_training_vectors(vectors, num_rows, dim, sample_rows_limit); + return _append_to_spool_file(vectors, num_rows * dim); +} + +void AnnIndexColumnWriter::_sample_training_vectors(const float* vectors, size_t num_rows, + size_t dim, size_t sample_rows_limit) { + DCHECK(vectors != nullptr); + DCHECK(num_rows > 0); + DCHECK(dim > 0); + DCHECK(sample_rows_limit > 0); + DCHECK(_training_sample.size() % dim == 0); + + for (size_t row = 0; row < num_rows; ++row) { + const float* vector = vectors + row * dim; + ++_training_sample_seen_rows; + const size_t sample_rows = _training_sample.size() / dim; + if (sample_rows < sample_rows_limit) { + _training_sample.insert(_training_sample.end(), vector, vector + dim); + continue; + } - // Check if we have enough rows to train the index - // train/add the remaining data - if (_float_array.empty()) { - if (_need_save_index) { - return _vector_index->save(_dir.get()); + std::uniform_int_distribution distribution(0, _training_sample_seen_rows - 1); + const uint64_t selected = distribution(_training_sample_rng); + if (selected < sample_rows_limit) { + float* dst = _training_sample.data() + cast_set(selected) * dim; + std::copy(vector, vector + dim, dst); + } + } +} + +Status AnnIndexColumnWriter::_spill_buffered_vectors(size_t dim, size_t sample_rows_limit) { + DCHECK(dim > 0); + DCHECK(_training_sample.empty()); + DCHECK_EQ(_training_sample_seen_rows, 0); + if (!_buffered_vectors.empty()) { + DCHECK(_buffered_vectors.size() % dim == 0); + } + RETURN_IF_ERROR(_ensure_spool_file()); + if (!_buffered_vectors.empty()) { + RETURN_IF_ERROR(_append_to_spool_file(_buffered_vectors.data(), _buffered_vectors.size())); + if (_buffered_vectors.size() / dim <= sample_rows_limit) { + _training_sample.swap(_buffered_vectors); + _training_sample_seen_rows = _training_sample.size() / dim; } else { - // No data was added at all. This can happen if the segment has 0 rows - // or all rows were filtered out. We need to delete the directory entry - // to avoid writing an empty/invalid index file. - LOG_INFO("No data to train/add for ANN index. Skipping index building."); - return _index_file_writer->delete_index(_index_meta); + _sample_training_vectors(_buffered_vectors.data(), _buffered_vectors.size() / dim, dim, + sample_rows_limit); } + } + _release_buffered_vectors(); + return Status::OK(); +} + +Status AnnIndexColumnWriter::_ensure_spool_file() { + if (_spool_file_writer != nullptr) { + return Status::OK(); + } + DORIS_CHECK(ExecEnv::GetInstance()->get_tmp_file_dirs() != nullptr); + _spool_file_path = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir() / + fmt::format("ann_index_build_{}.spool", UniqueId::gen_uid().to_string()); + io::FileWriterOptions opts; + opts.sync_file_data = false; + return io::global_local_filesystem()->create_file(_spool_file_path, &_spool_file_writer, &opts); +} + +Status AnnIndexColumnWriter::_append_to_spool_file(const float* vectors, size_t num_elements) { + const size_t bytes = num_elements * sizeof(float); + return _spool_file_writer->append(Slice(reinterpret_cast(vectors), bytes)); +} + +Status AnnIndexColumnWriter::_flush_spool_writer() { + if (_spool_file_writer == nullptr) { + return Status::OK(); + } + RETURN_IF_ERROR(_spool_file_writer->close()); + _spool_file_writer.reset(); + return Status::OK(); +} + +Status AnnIndexColumnWriter::_train_and_add(Int64 min_train_rows) { + if (_total_rows < min_train_rows) { + LOG_INFO( + "Total data size {} is less than minimum {} rows required for ANN index training. " + "Skipping index building for this segment.", + _total_rows, min_train_rows); + RETURN_IF_ERROR(_flush_spool_writer()); + _release_buffered_vectors(); + return _index_file_writer->delete_index(_index_meta); + } + + const size_t dim = _vector_index->get_dimension(); + if (_spool_file_path.empty()) { + DCHECK(_buffered_vectors.size() % dim == 0); + const Int64 train_rows = cast_set(_buffered_vectors.size() / dim); + DORIS_CHECK(train_rows >= min_train_rows); + RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); + RETURN_IF_ERROR(_add_vectors_in_chunks(_buffered_vectors.data(), train_rows)); + _release_buffered_vectors(); } else { - DCHECK(_float_array.size() % _vector_index->get_dimension() == 0); + DCHECK(_training_sample.size() % dim == 0); + const Int64 train_rows = cast_set(_training_sample.size() / dim); + DORIS_CHECK(train_rows >= min_train_rows); + RETURN_IF_ERROR(_vector_index->train(train_rows, _training_sample.data())); + { + PODArray empty_training_sample; + _training_sample.swap(empty_training_sample); + } + RETURN_IF_ERROR(_flush_spool_writer()); + RETURN_IF_ERROR(_add_spooled_vectors()); + } + return _vector_index->save(_dir.get()); +} - Int64 num_rows = _float_array.size() / _vector_index->get_dimension(); +Status AnnIndexColumnWriter::_add_spooled_vectors() { + DCHECK(!_spool_file_path.empty()); + io::FileReaderSPtr reader; + RETURN_IF_ERROR(io::global_local_filesystem()->open_file(_spool_file_path, &reader)); - if (num_rows >= min_train_rows) { - RETURN_IF_ERROR(_vector_index->train(num_rows, _float_array.data())); - RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data())); - _float_array.clear(); - return _vector_index->save(_dir.get()); - } else { - // It happens to have not enough data to train. - // If we have data to add before, we still need to save the index. - if (_need_save_index) { - // For IVF indexes, adding remaining vectors without training is acceptable - // because the quantizer was already trained on previous batches. These vectors - // are simply added to the nearest clusters without retraining. - RETURN_IF_ERROR(_vector_index->add(num_rows, _float_array.data())); - _float_array.clear(); - return _vector_index->save(_dir.get()); - } else { - // Not enough data to train and no data added before. - // Means this is a very small segment, we can skip the index building. - // We need to delete the directory entry from index_file_writer to avoid - // writing an empty/invalid index file which causes "IndexInput read past EOF" error. - LOG_INFO( - "Remaining data size {} is less than minimum {} rows required for ANN " - "index " - "training. Skipping index building for this segment.", - num_rows, min_train_rows); - _float_array.clear(); - return _index_file_writer->delete_index(_index_meta); - } + const size_t dim = _vector_index->get_dimension(); + const size_t chunk_elements = _add_chunk_rows(dim) * dim; + _read_buffer.resize(chunk_elements); + const size_t buffer_bytes = chunk_elements * sizeof(float); + size_t offset = 0; + while (offset < reader->size()) { + const size_t bytes_to_read = std::min(buffer_bytes, reader->size() - offset); + DCHECK(bytes_to_read % sizeof(float) == 0); + size_t bytes_read = 0; + RETURN_IF_ERROR(reader->read_at( + offset, Slice(reinterpret_cast(_read_buffer.data()), bytes_to_read), + &bytes_read)); + if (bytes_read != bytes_to_read) { + return Status::IOError( + "Failed to read ANN index build spool file {}, expect {} bytes, " + "got {} bytes", + _spool_file_path.native(), bytes_to_read, bytes_read); + } + DCHECK((bytes_read / sizeof(float)) % dim == 0); + RETURN_IF_ERROR(_vector_index->add(cast_set(bytes_read / sizeof(float) / dim), + _read_buffer.data())); + offset += bytes_read; + } + RETURN_IF_ERROR(reader->close()); + PODArray empty_read_buffer; + _read_buffer.swap(empty_read_buffer); + return Status::OK(); +} + +void AnnIndexColumnWriter::_release_buffered_vectors() { + PODArray empty_buffered_vectors; + _buffered_vectors.swap(empty_buffered_vectors); +} + +void AnnIndexColumnWriter::_delete_spool_file() { + if (_spool_file_writer != nullptr) { + Status st = _spool_file_writer->close(); + if (!st.ok()) { + LOG(WARNING) << "Failed to close ANN index build spool file " + << _spool_file_path.native() << ": " << st; + } + _spool_file_writer.reset(); + } + if (!_spool_file_path.empty()) { + Status st = io::global_local_filesystem()->delete_file(_spool_file_path); + if (!st.ok()) { + LOG(WARNING) << "Failed to delete ANN index build spool file " + << _spool_file_path.native() << ": " << st; } + _spool_file_path.clear(); } } } // namespace doris::segment_v2 diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 7b7e63f8574439..1d8b95ab65556e 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -24,11 +24,14 @@ #include #include +#include #include #include #include "common/config.h" #include "core/pod_array.h" +#include "io/fs/file_reader_writer_fwd.h" +#include "io/fs/path.h" #include "storage/index/ann/ann_index.h" #include "storage/index/index_file_writer.h" #include "storage/index/index_writer.h" @@ -45,6 +48,7 @@ class AnnIndexColumnWriter : public IndexColumnWriter { return config::ann_index_build_chunk_size; #endif } + static inline int64_t chunk_bytes() { return config::ann_index_build_chunk_bytes; } static constexpr const char* INDEX_TYPE = "index_type"; static constexpr const char* METRIC_TYPE = "metric_type"; static constexpr const char* DIM = "dim"; @@ -71,16 +75,42 @@ class AnnIndexColumnWriter : public IndexColumnWriter { Status finish() override; private: + size_t _chunk_rows_by_bytes(size_t dim) const; + size_t _add_chunk_rows(size_t dim) const; + bool _train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const; + size_t _training_sample_rows_limit(Int64 min_train_rows, size_t dim) const; + Status _add_vectors_in_chunks(const float* vectors, size_t num_rows); + Status _append_vectors_need_train(const float* vectors, size_t num_rows, Int64 min_train_rows); + void _sample_training_vectors(const float* vectors, size_t num_rows, size_t dim, + size_t sample_rows_limit); + Status _spill_buffered_vectors(size_t dim, size_t sample_rows_limit); + Status _ensure_spool_file(); + Status _append_to_spool_file(const float* vectors, size_t num_elements); + Status _flush_spool_writer(); + Status _train_and_add(Int64 min_train_rows); + Status _add_spooled_vectors(); + void _release_buffered_vectors(); + void _delete_spool_file(); + +#ifdef BE_TEST + friend class TestAnnIndexColumnWriter; +#endif + // VectorIndex shoule be managed by some cache. // VectorIndex should be weak shared by AnnIndexWriter and VectorIndexReader // This should be a weak_ptr std::shared_ptr _vector_index; - // _float_array is used to buffer the float data before training/adding to vector index - // if we dont do this, the performance(recall) will be very poor when adding small number of vectors one by one - PODArray _float_array; + PODArray _buffered_vectors; + PODArray _training_sample; + PODArray _read_buffer; + uint64_t _training_sample_seen_rows = 0; + std::mt19937_64 _training_sample_rng {0}; + io::Path _spool_file_path; + io::FileWriterPtr _spool_file_writer; + int64_t _total_rows = 0; IndexFileWriter* _index_file_writer; const TabletIndex* _index_meta; std::shared_ptr _dir; - bool _need_save_index = false; + bool _skip_build = false; }; } // namespace doris::segment_v2 diff --git a/be/src/storage/index/ann/faiss_ann_index.cpp b/be/src/storage/index/ann/faiss_ann_index.cpp index f933f3c683f940..68b06db2b9061a 100644 --- a/be/src/storage/index/ann/faiss_ann_index.cpp +++ b/be/src/storage/index/ann/faiss_ann_index.cpp @@ -501,7 +501,8 @@ Int64 FaissVectorIndex::get_min_train_rows() const { // For IVF indexes, the minimum number of training points should be at least // equal to the number of clusters (nlist). FAISS requires this for k-means clustering. Int64 ivf_min = 0; - if (_params.index_type == FaissBuildParameter::IndexType::IVF) { + if (_params.index_type == FaissBuildParameter::IndexType::IVF || + _params.index_type == FaissBuildParameter::IndexType::IVF_ON_DISK) { ivf_min = _params.ivf_nlist; } diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp b/be/test/storage/index/ann/ann_index_writer_test.cpp index bb30f9e19794af..9846b3093d34bc 100644 --- a/be/test/storage/index/ann/ann_index_writer_test.cpp +++ b/be/test/storage/index/ann/ann_index_writer_test.cpp @@ -26,10 +26,13 @@ #include #include +#include "common/config.h" +#include "storage/index/ann/faiss_ann_index.h" #include "storage/index/ann/vector_search_utils.h" #include "storage/index/index_file_writer.h" #include "storage/index/inverted/inverted_index_fs_directory.h" #include "storage/tablet/tablet_schema.h" +#include "util/defer_op.h" using namespace doris::vector_search_utils; @@ -60,7 +63,17 @@ class TestAnnIndexColumnWriter : public AnnIndexColumnWriter { : AnnIndexColumnWriter(index_file_writer, index_meta) {} void set_vector_index(std::shared_ptr index) { _vector_index = index; } - void set_need_save_index(bool value) { _need_save_index = value; } + size_t buffered_vector_capacity() const { return _buffered_vectors.capacity(); } + size_t read_buffer_capacity() const { return _read_buffer.capacity(); } + size_t buffered_vector_rows(size_t dim) const { return _buffered_vectors.size() / dim; } + bool has_spool_file() const { return !_spool_file_path.empty(); } + size_t add_chunk_rows(size_t dim) const { return _add_chunk_rows(dim); } + size_t training_sample_rows_limit(Int64 min_train_rows, size_t dim) const { + return _training_sample_rows_limit(min_train_rows, dim); + } + bool train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const { + return _train_rows_exceed_chunk_bytes(dim, min_train_rows); + } }; class AnnIndexWriterTest : public ::testing::Test { @@ -165,6 +178,19 @@ TEST_F(AnnIndexWriterTest, TestInitWithDifferentProperties) { } } +TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateBuildChunk) { + auto writer = std::make_unique(_index_file_writer.get(), + _tablet_index.get()); + + auto fs_dir = std::make_shared(); + fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); + EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); + + ASSERT_TRUE(writer->init().ok()); + EXPECT_EQ(writer->buffered_vector_capacity(), 0); + EXPECT_EQ(writer->read_buffer_capacity(), 0); +} + TEST_F(AnnIndexWriterTest, TestAddArrayValuesSuccess) { auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -415,7 +441,7 @@ TEST_F(AnnIndexWriterTest, TestInvalidMetricType) { EXPECT_THROW(writer->init(), doris::Exception); } -TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSize) { +TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsDirectly) { auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -427,28 +453,24 @@ TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSize) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, train(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(6, testing::_)) + .Times(2) + .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - // CHUNK_SIZE = 10 const size_t dim = 4; - - { + for (int batch = 0; batch < 2; ++batch) { const size_t num_rows = 6; - std::vector vectors = { - 1.0f, 2.0f, 3.0f, 4.0f, // Row 0 - 5.0f, 6.0f, 7.0f, 8.0f, // Row 1 - 9.0f, 10.0f, 11.0f, 12.0f, // Row 2 - 13.0f, 14.0f, 15.0f, 16.0f, // Row 3 - 17.0f, 18.0f, 19.0f, 20.0f, // Row 4 - 21.0f, 22.0f, 23.0f, 24.0f // Row 5 - }; - std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(batch * vectors.size() + i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); + } Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), @@ -456,25 +478,73 @@ TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSize) { EXPECT_TRUE(status.ok()); } + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + + Status status = writer->finish(); + EXPECT_TRUE(status.ok()); +} + +TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsBuildChunkBytes) { + const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; + config::ann_index_build_chunk_bytes = 32; + doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; + + auto mock_index = std::make_shared(); + auto writer = std::make_unique(_index_file_writer.get(), + _tablet_index.get()); + + auto fs_dir = std::make_shared(); + fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); + EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); + + ASSERT_TRUE(writer->init().ok()); + writer->set_vector_index(mock_index); + + const size_t dim = 4; + EXPECT_EQ(writer->add_chunk_rows(dim), 2); + + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); { - const size_t num_rows = 6; - std::vector vectors = { - 25.0f, 26.0f, 27.0f, 28.0f, // Row 6 - 29.0f, 30.0f, 31.0f, 32.0f, // Row 7 - 33.0f, 34.0f, 35.0f, 36.0f, // Row 8 - 37.0f, 38.0f, 39.0f, 40.0f, // Row 9 - 41.0f, 42.0f, 43.0f, 44.0f, // Row 10 - 45.0f, 46.0f, 47.0f, 48.0f // Row 11 - }; - std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; + testing::InSequence sequence; + EXPECT_CALL(*mock_index, add(2, testing::_)) + .Times(2) + .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(1, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + } - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + const size_t num_rows = 5; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - Status status = writer->finish(); + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + + status = writer->finish(); EXPECT_TRUE(status.ok()); } @@ -566,7 +636,7 @@ TEST_F(AnnIndexWriterTest, TestAddArrayValuesIVF) { EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSizeIVF) { +TEST_F(AnnIndexWriterTest, TestSmallTrainRequiredIndexUsesMemoryBuffer) { auto mock_index = std::make_shared(); auto properties = _properties; properties["index_type"] = "ivf"; @@ -587,62 +657,53 @@ TEST_F(AnnIndexWriterTest, TestAddMoreThanChunkSizeIVF) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, train(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - // CHUNK_SIZE = 10 const size_t dim = 4; - - { - const size_t num_rows = 6; - std::vector vectors = { - 1.0f, 2.0f, 3.0f, 4.0f, // Row 0 - 5.0f, 6.0f, 7.0f, 8.0f, // Row 1 - 9.0f, 10.0f, 11.0f, 12.0f, // Row 2 - 13.0f, 14.0f, 15.0f, 16.0f, // Row 3 - 17.0f, 18.0f, 19.0f, 20.0f, // Row 4 - 21.0f, 22.0f, 23.0f, 24.0f // Row 5 - }; - std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; - - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + const size_t num_rows = 4; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - { - const size_t num_rows = 6; - std::vector vectors = { - 25.0f, 26.0f, 27.0f, 28.0f, // Row 6 - 29.0f, 30.0f, 31.0f, 32.0f, // Row 7 - 33.0f, 34.0f, 35.0f, 36.0f, // Row 8 - 37.0f, 38.0f, 39.0f, 40.0f, // Row 9 - 41.0f, 42.0f, 43.0f, 44.0f, // Row 10 - 45.0f, 46.0f, 47.0f, 48.0f // Row 11 - }; - std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); + EXPECT_TRUE(status.ok()); + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, train(4, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(4, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); } - Status status = writer->finish(); + status = writer->finish(); EXPECT_TRUE(status.ok()); + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) { +TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsOnceAndAddsAllRows) { auto mock_index = std::make_shared(); auto properties = _properties; properties["index_type"] = "ivf"; - properties["nlist"] = "5"; // Set nlist to 5 + properties["nlist"] = "2"; properties["quantizer"] = "flat"; auto tablet_index = std::make_unique(); @@ -659,30 +720,25 @@ TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - // CHUNK_SIZE = 10, nlist = 5 - // Add 12 rows: first 10 will be trained/added in one batch, remaining 2 < 5 - // Since we have trained data before (_need_save_index = true), we should add the remaining 2 rows and save - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); + // CHUNK_SIZE = 10 const size_t dim = 4; - // Add 12 rows total { - const size_t num_rows = 10; - std::vector vectors(10 * 4); - for (size_t i = 0; i < 10 * 4; ++i) { - vectors[i] = static_cast(i); - } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * 4); - } + const size_t num_rows = 6; + std::vector vectors = { + 1.0f, 2.0f, 3.0f, 4.0f, // Row 0 + 5.0f, 6.0f, 7.0f, 8.0f, // Row 1 + 9.0f, 10.0f, 11.0f, 12.0f, // Row 2 + 13.0f, 14.0f, 15.0f, 16.0f, // Row 3 + 17.0f, 18.0f, 19.0f, 20.0f, // Row 4 + 21.0f, 22.0f, 23.0f, 24.0f // Row 5 + }; + std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), @@ -690,14 +746,17 @@ TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) { EXPECT_TRUE(status.ok()); } - // Add 2 more rows { - const size_t num_rows = 2; + const size_t num_rows = 6; std::vector vectors = { - 40.0f, 41.0f, 42.0f, 43.0f, // Row 10 - 44.0f, 45.0f, 46.0f, 47.0f // Row 11 + 25.0f, 26.0f, 27.0f, 28.0f, // Row 6 + 29.0f, 30.0f, 31.0f, 32.0f, // Row 7 + 33.0f, 34.0f, 35.0f, 36.0f, // Row 8 + 37.0f, 38.0f, 39.0f, 40.0f, // Row 9 + 41.0f, 42.0f, 43.0f, 44.0f, // Row 10 + 45.0f, 46.0f, 47.0f, 48.0f // Row 11 }; - std::vector offsets = {0, 4, 8}; + std::vector offsets = {0, 4, 8, 12, 16, 20, 24}; Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), @@ -705,23 +764,37 @@ TEST_F(AnnIndexWriterTest, TestSkipTrainWhenRemainderLessThanNlist) { EXPECT_TRUE(status.ok()); } + EXPECT_TRUE(writer->has_spool_file()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, train(10, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(10, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(2, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + } + Status status = writer->finish(); EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestLargeDataVolumeWithRemainderSkip) { - auto mock_index = std::make_shared(); - auto properties = _properties; - properties["index_type"] = "ivf"; - properties["nlist"] = "3"; // Set nlist to 3 - properties["quantizer"] = "flat"; - - auto tablet_index = std::make_unique(); - tablet_index->_properties = properties; - tablet_index->_index_id = 1; +TEST_F(AnnIndexWriterTest, TestTrainingSampleUsesReservoirAndMaxRows) { + const int64_t old_max_train_rows = config::ann_index_build_max_train_rows; + config::ann_index_build_max_train_rows = 6; + doris::Defer restore_config { + [&] { config::ann_index_build_max_train_rows = old_max_train_rows; }}; + auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), - tablet_index.get()); + _tablet_index.get()); auto fs_dir = std::make_shared(); fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); @@ -730,72 +803,72 @@ TEST_F(AnnIndexWriterTest, TestLargeDataVolumeWithRemainderSkip) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - // CHUNK_SIZE = 10, nlist = 3 - // Add 23 rows: 2 full chunks of 10, remaining 3 == nlist, so train remaining - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(3)); - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, train(3, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(3, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); const size_t dim = 4; - - // Add 3 batches: 10 + 10 + 3 = 23 rows - for (int batch = 0; batch < 2; ++batch) { - const size_t num_rows = 10; - std::vector vectors(10 * 4); - for (size_t i = 0; i < 10 * 4; ++i) { - vectors[i] = static_cast(batch * 40 + i); + const size_t num_rows = 20; + std::vector vectors(num_rows * dim); + for (size_t row = 0; row < num_rows; ++row) { + for (size_t col = 0; col < dim; ++col) { + vectors[row * dim + col] = static_cast(row); } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * 4); - } - - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - // Add remaining 3 rows - { - const size_t num_rows = 3; - std::vector vectors = { - 80.0f, 81.0f, 82.0f, 83.0f, // Row 20 - 84.0f, 85.0f, 86.0f, 87.0f, // Row 21 - 88.0f, 89.0f, 90.0f, 91.0f // Row 22 - }; - std::vector offsets = {0, 4, 8, 12}; + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(writer->has_spool_file()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, train(6, testing::_)) + .Times(1) + .WillOnce(testing::Invoke([&](Int64 n, const float* vec) { + EXPECT_EQ(n, 6); + bool has_row_after_initial_sample = false; + bool is_prefix_sample = true; + for (size_t row = 0; row < static_cast(n); ++row) { + const auto row_id = static_cast(vec[row * dim]); + EXPECT_LT(row_id, num_rows); + if (row_id >= static_cast(n)) { + has_row_after_initial_sample = true; + } + if (row_id != row) { + is_prefix_sample = false; + } + } + EXPECT_TRUE(has_row_after_initial_sample); + EXPECT_FALSE(is_prefix_sample); + return Status::OK(); + })); + EXPECT_CALL(*mock_index, add(10, testing::_)) + .Times(2) + .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); } - Status status = writer->finish(); + status = writer->finish(); EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestLargeDataVolumeSkipRemainder) { - auto mock_index = std::make_shared(); - auto properties = _properties; - properties["index_type"] = "ivf"; - properties["nlist"] = "4"; // Set nlist to 4 - properties["quantizer"] = "flat"; - - auto tablet_index = std::make_unique(); - tablet_index->_properties = properties; - tablet_index->_index_id = 1; +TEST_F(AnnIndexWriterTest, TestBuildChunkBytesCapsAddRows) { + const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; + config::ann_index_build_chunk_bytes = 32; + doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; + auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), - tablet_index.get()); + _tablet_index.get()); auto fs_dir = std::make_shared(); fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); @@ -804,71 +877,51 @@ TEST_F(AnnIndexWriterTest, TestLargeDataVolumeSkipRemainder) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - // CHUNK_SIZE = 10, nlist = 4 - // Add 22 rows: 2 full chunks of 10, remaining 2 < 4 - // Since we have trained data before (_need_save_index = true), we should add the remaining 2 rows and save - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(4)); - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - const size_t dim = 4; + EXPECT_EQ(writer->add_chunk_rows(dim), 2); + EXPECT_EQ(writer->training_sample_rows_limit(1, dim), 2); - // Add 2 batches of 10 rows - for (int batch = 0; batch < 2; ++batch) { - const size_t num_rows = 10; - std::vector vectors(10 * 4); - for (size_t i = 0; i < 10 * 4; ++i) { - vectors[i] = static_cast(batch * 40 + i); - } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * 4); - } + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + const size_t num_rows = 6; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - // Add remaining 2 rows - { - const size_t num_rows = 2; - std::vector vectors = { - 80.0f, 81.0f, 82.0f, 83.0f, // Row 20 - 84.0f, 85.0f, 86.0f, 87.0f // Row 21 - }; - std::vector offsets = {0, 4, 8}; + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); + EXPECT_TRUE(status.ok()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, train(2, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(2, testing::_)) + .Times(3) + .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); } - Status status = writer->finish(); + status = writer->finish(); EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanNlist) { +TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanMinTrainRows) { auto mock_index = std::make_shared(); - auto properties = _properties; - properties["index_type"] = "ivf"; - properties["nlist"] = "5"; // Set nlist to 5 - properties["quantizer"] = "flat"; - - auto tablet_index = std::make_unique(); - tablet_index->_properties = properties; - tablet_index->_index_id = 1; - auto writer = std::make_unique(_index_file_writer.get(), - tablet_index.get()); + _tablet_index.get()); auto fs_dir = std::make_shared(); fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); @@ -876,94 +929,45 @@ TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanNlist) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - writer->set_need_save_index(false); // No previous training, so should skip entirely - // Add only 3 rows, which is less than nlist (5) - // Since no data was trained before (_need_save_index = false), we should skip index building entirely - // No train, add, or save should be called EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, save(testing::_)).Times(0); const size_t dim = 4; - - // Add 3 rows - { - const size_t num_rows = 3; - std::vector vectors = { - 1.0f, 2.0f, 3.0f, 4.0f, // Row 0 - 5.0f, 6.0f, 7.0f, 8.0f, // Row 1 - 9.0f, 10.0f, 11.0f, 12.0f // Row 2 - }; - std::vector offsets = {0, 4, 8, 12}; - - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + const size_t num_rows = 3; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - Status status = writer->finish(); + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); -} - -TEST_F(AnnIndexWriterTest, TestPQMinTrainRows) { - // Test writer behavior under a large mocked min_train_rows threshold. - - auto mock_index = std::make_shared(); - auto writer = std::make_unique(_index_file_writer.get(), - _tablet_index.get()); - - auto fs_dir = std::make_shared(); - fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); - EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); - - ASSERT_TRUE(writer->init().ok()); - writer->set_vector_index(mock_index); - - // Set up expectations: mock a very large min_train_rows threshold. - // Since we only provide 1000 vectors, which is less than 131072, training will happen in batches - // but finish() will skip saving since remaining data is insufficient - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(131072)); - // 1000 vectors will be processed in 100 batches of 10 vectors each - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(100) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(100) - .WillRepeatedly(testing::Return(Status::OK())); - // Since we have trained data in batches, the index will be saved even though total data is insufficient - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - - const size_t dim = 4; - - // Add only 1000 rows, which is less than the required 131072 - { - const size_t num_rows = 1000; - std::vector vectors(num_rows * dim); - for (size_t i = 0; i < num_rows * dim; ++i) { - vectors[i] = static_cast(i % 100); - } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * dim); - } - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); - } + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - // Finish should skip index building due to insufficient training data - Status status = writer->finish(); + status = writer->finish(); EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestSQMinTrainRows) { - // Test that SQ quantizer requires sufficient training data - // SQ requires at least nlist * 2 = 10 * 2 = 20 training vectors +TEST_F(AnnIndexWriterTest, TestSkipBuildWhenMinTrainRowsExceedsChunkBytes) { + const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; + config::ann_index_build_chunk_bytes = 32; + doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), @@ -976,94 +980,49 @@ TEST_F(AnnIndexWriterTest, TestSQMinTrainRows) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - // Set up expectations: SQ should require at least 20 training vectors - // Since we only provide 15 vectors, training will happen in batches but finish() will skip saving - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(20)); - // 15 vectors will be processed in 1 batch of 10 vectors and remaining 5 vectors - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(5, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - // Since we have trained data, the index will be saved even though total data is insufficient - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - const size_t dim = 4; + EXPECT_TRUE(writer->train_rows_exceed_chunk_bytes(dim, 5)); - // Add only 15 rows, which is less than the required 20 - { - const size_t num_rows = 15; - std::vector vectors(num_rows * dim); - for (size_t i = 0; i < num_rows * dim; ++i) { - vectors[i] = static_cast(i % 100); - } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * dim); - } + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); + const size_t num_rows = 6; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); } - // Finish should skip index building due to insufficient training data - Status status = writer->finish(); + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); -} - -TEST_F(AnnIndexWriterTest, TestPQWithSufficientData) { - // Test that PQ works when sufficient training data is provided - - auto mock_index = std::make_shared(); - auto writer = std::make_unique(_index_file_writer.get(), - _tablet_index.get()); - - auto fs_dir = std::make_shared(); - fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); - EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); - - ASSERT_TRUE(writer->init().ok()); - writer->set_vector_index(mock_index); - - // Mock min_train_rows to 131072 and provide exactly that amount. - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(131072)); - // Since we provide exactly 131072 vectors, they will be trained and added in chunks - // Each chunk is 10 vectors, so we expect 13107 train calls and 13107 add calls for full chunks - EXPECT_CALL(*mock_index, train(10, testing::_)) - .Times(13107) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(13107) - .WillRepeatedly(testing::Return(Status::OK())); - // The remaining 2 vectors will be added without training since min_train_rows > 2 - EXPECT_CALL(*mock_index, add(2, testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - const size_t dim = 4; + EXPECT_FALSE(writer->has_spool_file()); + EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - // Add exactly 131072 rows - { - const size_t num_rows = 131072; - std::vector vectors(num_rows * dim); - for (size_t i = 0; i < num_rows * dim; ++i) { - vectors[i] = static_cast(i % 100); - } - std::vector offsets; - for (size_t i = 0; i <= num_rows; ++i) { - offsets.push_back(i * dim); - } + status = writer->finish(); + EXPECT_TRUE(status.ok()); +} - Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), - num_rows); - EXPECT_TRUE(status.ok()); - } +TEST_F(AnnIndexWriterTest, TestIVFOnDiskMinTrainRows) { + FaissVectorIndex index; + FaissBuildParameter params; + params.index_type = FaissBuildParameter::IndexType::IVF_ON_DISK; + params.quantizer = FaissBuildParameter::Quantizer::FLAT; + params.dim = 4; + params.ivf_nlist = 7; - // Finish should successfully build the index - Status status = writer->finish(); - EXPECT_TRUE(status.ok()); + index.build(params); + EXPECT_EQ(index.get_min_train_rows(), 7); } } // namespace doris::segment_v2 diff --git a/regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out b/regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out new file mode 100644 index 00000000000000..20381ff6259a85 --- /dev/null +++ b/regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out @@ -0,0 +1,8 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !row_count -- +8 + +-- !nearest -- +1 +2 +3 diff --git a/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out b/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out index bcd94f4ac52341..239c104321c9a5 100644 --- a/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out +++ b/regression-test/data/ann_index_p0/ivf_on_disk_index_test.out @@ -11,6 +11,10 @@ 1 2 +-- !sql_l2_insufficient_train_rows -- +1 +2 + -- !sql -- 1 [1, 2, 3] 2 [0.5, 2.1, 2.9] diff --git a/regression-test/data/ann_index_p0/ivf_pq_recall.out b/regression-test/data/ann_index_p0/ivf_pq_recall.out new file mode 100644 index 00000000000000..14aab16eedc5fa --- /dev/null +++ b/regression-test/data/ann_index_p0/ivf_pq_recall.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !row_count -- +800 + +-- !first_cluster_recall -- +20 + +-- !second_cluster_recall -- +20 diff --git a/regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy b/regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy new file mode 100644 index 00000000000000..cf0cce386421b9 --- /dev/null +++ b/regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("ann_index_build_chunk_bytes", "nonConcurrent") { + sql "set enable_common_expr_pushdown=true;" + + setBeConfigTemporary([ + ann_index_build_chunk_size: 1000000, + ann_index_build_chunk_bytes: 256 + ]) { + sql "drop table if exists ann_index_build_chunk_bytes" + sql """ + create table ann_index_build_chunk_bytes ( + id int not null, + embedding array not null, + index idx_embedding (`embedding`) using ann properties( + "index_type" = "hnsw", + "metric_type" = "l2_distance", + "dim" = "3072" + ) + ) engine=olap + duplicate key(id) + distributed by hash(id) buckets 1 + properties("replication_num" = "1"); + """ + + def rows = [] + for (int i = 1; i <= 8; i++) { + rows.add("(${i}, array_with_constant(3072, cast(${i}.0 as float)))") + } + sql "insert into ann_index_build_chunk_bytes values ${rows.join(', ')};" + sql "sync" + + qt_row_count "select count(*) from ann_index_build_chunk_bytes;" + qt_nearest """ + select id + from ann_index_build_chunk_bytes + order by l2_distance_approximate(embedding, array_with_constant(3072, cast(1.0 as float))) + limit 3; + """ + } +} diff --git a/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy b/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy index a9eed51d7a4125..63fab34d072d3a 100644 --- a/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy +++ b/regression-test/suites/ann_index_p0/ivf_on_disk_index_test.groovy @@ -68,7 +68,7 @@ suite ("ivf_on_disk_index_test") { exception """nlist of ann index must be specified for ivf/ivf_on_disk type""" } - // ========== Error: not enough training points ========== + // Not enough training points: should not throw exception anymore, just skip index building. sql """ CREATE TABLE tbl_ivf_on_disk_l2 ( id INT NOT NULL, @@ -84,14 +84,12 @@ suite ("ivf_on_disk_index_test") { DISTRIBUTED BY HASH(id) BUCKETS 1 PROPERTIES ("replication_num" = "1"); """ - test { - sql """ - INSERT INTO tbl_ivf_on_disk_l2 VALUES - (1, [1.0, 2.0, 3.0]), - (2, [0.5, 2.1, 2.9]); - """ - exception """exception occurred during training""" - } + sql """ + INSERT INTO tbl_ivf_on_disk_l2 VALUES + (1, [1.0, 2.0, 3.0]), + (2, [0.5, 2.1, 2.9]); + """ + qt_sql_l2_insufficient_train_rows "select id from tbl_ivf_on_disk_l2 order by l2_distance_approximate(embedding, [1.0,2.0,3.0]) limit 2;" // ========== IVF_ON_DISK with inner product ========== sql "drop table if exists tbl_ivf_on_disk_ip" diff --git a/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy new file mode 100644 index 00000000000000..a5cddfeb881e69 --- /dev/null +++ b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("ivf_pq_recall", "nonConcurrent") { + sql "set enable_common_expr_pushdown=true;" + sql "set enable_ann_index_result_cache=false;" + sql "set ivf_nprobe=8;" + + setBeConfigTemporary([ann_index_build_chunk_size: 400]) { + sql "drop table if exists ivf_pq_recall" + sql """ + create table ivf_pq_recall ( + id int not null, + embedding array not null, + index idx_embedding (`embedding`) using ann properties( + "index_type" = "ivf", + "metric_type" = "l2_distance", + "nlist" = "8", + "dim" = "4", + "quantizer" = "pq", + "pq_m" = "2", + "pq_nbits" = "2" + ) + ) engine=olap + duplicate key(id) + distributed by hash(id) buckets 1 + properties( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ); + """ + + def formatFloat = { double value -> + String.format(java.util.Locale.ROOT, "%.3f", value) + } + def vector = { double x -> + "[${formatFloat(x)}, ${formatFloat(x * 2)}, ${formatFloat(x * 3)}, ${formatFloat(x * 4)}]" + } + def rows = [] + for (int i = 1; i <= 400; i++) { + double x = (i - 1) / 1000.0 + rows.add("(${i}, ${vector(x)})") + } + for (int i = 401; i <= 800; i++) { + double x = 1000.0 + (i - 401) / 1000.0 + rows.add("(${i}, ${vector(x)})") + } + sql "insert into ivf_pq_recall values ${rows.join(',')};" + sql "sync" + + qt_row_count "select count(*) from ivf_pq_recall;" + + qt_first_cluster_recall """ + select count(*) from ( + select id + from ivf_pq_recall + order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]) + limit 20 + ) t + where id between 1 and 400; + """ + + qt_second_cluster_recall """ + select count(*) from ( + select id + from ivf_pq_recall + order by l2_distance_approximate(embedding, [1000.0, 2000.0, 3000.0, 4000.0]) + limit 20 + ) t + where id between 401 and 800; + """ + } +} From 318691490a7f5762abc36ab89304b5e5a40b70e8 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Wed, 3 Jun 2026 19:58:30 +0800 Subject: [PATCH 02/10] [fix](ann-index) Fix ivf recall zero and oom. --- be/src/common/config.cpp | 12 +- be/src/common/config.h | 6 +- be/src/storage/index/ann/ann_index_writer.cpp | 238 +----------------- be/src/storage/index/ann/ann_index_writer.h | 21 +- .../index/ann/ann_index_writer_test.cpp | 67 ++--- .../ivf_pq_full_buffer_train_recall.groovy | 70 ++++++ 6 files changed, 110 insertions(+), 304 deletions(-) create mode 100644 regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index cc4a7c86254717..aee6de23764c72 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1757,27 +1757,19 @@ DEFINE_String(ann_index_ivf_list_cache_limit, "70%"); // Stale sweep time for ANN index IVF list cache in seconds. 3600s is 1 hour. DEFINE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec, "3600"); -// Target row count upper bound for ANN/vector index build add batch and training sample. +// Target row count upper bound for ANN/vector index build add batch. // The effective build chunk also respects ann_index_build_chunk_bytes. // 1M By default. DEFINE_mInt64(ann_index_build_chunk_size, "1000000"); DEFINE_Validator(ann_index_build_chunk_size, [](const int64_t config) -> bool { return config > 0; }); -// Target byte bound for ANN/vector index build add batch and memory buffer before flush. -// If index-required minimum training rows cannot fit in this bound, skip ANN build for the segment. +// Target byte bound for ANN/vector index build add batch. // 128MB By default. DEFINE_mInt64(ann_index_build_chunk_bytes, "134217728"); DEFINE_Validator(ann_index_build_chunk_bytes, [](const int64_t config) -> bool { return config > 0; }); -// Maximum row count for ANN/vector index training sample. -// The effective sample keeps at least the index-required minimum training rows. -// 1M By default. -DEFINE_mInt64(ann_index_build_max_train_rows, "1000000"); -DEFINE_Validator(ann_index_build_max_train_rows, - [](const int64_t config) -> bool { return config > 0; }); - DEFINE_mBool(enable_wal_tde, "false"); DEFINE_mBool(print_stack_when_cache_miss, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 3153b3ef9639a7..4678c5b6400f38 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1809,12 +1809,10 @@ DECLARE_mInt32(max_segment_partial_column_cache_size); DECLARE_String(ann_index_ivf_list_cache_limit); // Stale sweep time for ANN index IVF list cache in seconds. DECLARE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec); -// Target row count upper bound for ANN/vector index build add batch and training sample. +// Target row count upper bound for ANN/vector index build add batch. DECLARE_mInt64(ann_index_build_chunk_size); -// Target byte bound for ANN/vector index build add batch and memory buffer before flush. +// Target byte bound for ANN/vector index build add batch. DECLARE_mInt64(ann_index_build_chunk_bytes); -// Maximum row count for ANN/vector index training sample. -DECLARE_mInt64(ann_index_build_max_train_rows); DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index d441f3a8b37f3f..1d2b2ab8008f55 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -17,8 +17,6 @@ #include "storage/index/ann/ann_index_writer.h" -#include - #include #include #include @@ -26,14 +24,8 @@ #include #include "common/cast_set.h" -#include "io/fs/file_reader.h" -#include "io/fs/file_writer.h" -#include "io/fs/local_file_system.h" -#include "runtime/exec_env.h" #include "storage/index/ann/faiss_ann_index.h" #include "storage/index/inverted/inverted_index_fs_directory.h" -#include "util/slice.h" -#include "util/uid_util.h" namespace doris::segment_v2 { static std::string get_or_default(const std::map& properties, @@ -49,9 +41,7 @@ AnnIndexColumnWriter::AnnIndexColumnWriter(IndexFileWriter* index_file_writer, const TabletIndex* index_meta) : _index_file_writer(index_file_writer), _index_meta(index_meta) {} -AnnIndexColumnWriter::~AnnIndexColumnWriter() { - _delete_spool_file(); -} +AnnIndexColumnWriter::~AnnIndexColumnWriter() = default; Status AnnIndexColumnWriter::init() { Result> compound_dir = _index_file_writer->open(_index_meta); @@ -82,8 +72,6 @@ Status AnnIndexColumnWriter::init() { faiss_index->build(build_parameter); _vector_index = faiss_index; - _training_sample_seen_rows = 0; - _training_sample_rng.seed(0); LOG_INFO( "Create a new faiss index, index_type {} dim {} metric_type {} max_degree {}, " @@ -99,10 +87,7 @@ Status AnnIndexColumnWriter::add_values(const std::string fn, const void* values } void AnnIndexColumnWriter::close_on_error() { - _delete_spool_file(); _release_buffered_vectors(); - _training_sample.clear(); - _read_buffer.clear(); _skip_build = true; } @@ -120,25 +105,6 @@ size_t AnnIndexColumnWriter::_add_chunk_rows(size_t dim) const { cast_set(_chunk_rows_by_bytes(dim))))); } -bool AnnIndexColumnWriter::_train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const { - DCHECK(dim > 0); - DCHECK(min_train_rows >= 0); - if (min_train_rows == 0) { - return false; - } - static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); - DORIS_CHECK(dim <= static_cast(std::numeric_limits::max() / FLOAT_BYTES)); - const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; - return min_train_rows > AnnIndexColumnWriter::chunk_bytes() / vector_bytes; -} - -size_t AnnIndexColumnWriter::_training_sample_rows_limit(Int64 min_train_rows, size_t dim) const { - DCHECK(min_train_rows > 0); - return cast_set(std::max( - min_train_rows, std::min(config::ann_index_build_max_train_rows, - cast_set(_add_chunk_rows(dim))))); -} - Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, const uint8_t* offsets_ptr, size_t num_rows) { @@ -167,26 +133,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val if (min_train_rows == 0) { RETURN_IF_ERROR(_add_vectors_in_chunks(p, num_rows)); } else { - if (_train_rows_exceed_chunk_bytes(dim, min_train_rows)) { - static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); - const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; - const Int64 required_bytes = - min_train_rows > std::numeric_limits::max() / vector_bytes - ? std::numeric_limits::max() - : min_train_rows * vector_bytes; - LOG(WARNING) << "Skip ANN index build because minimum training rows exceed build chunk " - "byte limit, dim=" - << dim << ", min_train_rows=" << min_train_rows - << ", required_bytes=" << required_bytes - << ", ann_index_build_chunk_bytes=" << AnnIndexColumnWriter::chunk_bytes(); - _skip_build = true; - _release_buffered_vectors(); - _training_sample.clear(); - _read_buffer.clear(); - _delete_spool_file(); - return Status::OK(); - } - RETURN_IF_ERROR(_append_vectors_need_train(p, num_rows, min_train_rows)); + RETURN_IF_ERROR(_append_vectors_need_train(p, num_rows)); } _total_rows += cast_set(num_rows); @@ -208,16 +155,11 @@ int64_t AnnIndexColumnWriter::size() const { Status AnnIndexColumnWriter::finish() { if (_skip_build || _total_rows == 0) { LOG_INFO("No data to train/add for ANN index. Skipping index building."); - Status st = _index_file_writer->delete_index(_index_meta); - _delete_spool_file(); - return st; + return _index_file_writer->delete_index(_index_meta); } const Int64 min_train_rows = _vector_index->get_min_train_rows(); - Status st = - min_train_rows == 0 ? _vector_index->save(_dir.get()) : _train_and_add(min_train_rows); - _delete_spool_file(); - return st; + return min_train_rows == 0 ? _vector_index->save(_dir.get()) : _train_and_add(min_train_rows); } Status AnnIndexColumnWriter::_add_vectors_in_chunks(const float* vectors, size_t num_rows) { @@ -236,101 +178,12 @@ Status AnnIndexColumnWriter::_add_vectors_in_chunks(const float* vectors, size_t return Status::OK(); } -Status AnnIndexColumnWriter::_append_vectors_need_train(const float* vectors, size_t num_rows, - Int64 min_train_rows) { +Status AnnIndexColumnWriter::_append_vectors_need_train(const float* vectors, size_t num_rows) { DCHECK(vectors != nullptr); DCHECK(num_rows > 0); const size_t dim = _vector_index->get_dimension(); - const size_t sample_rows_limit = _training_sample_rows_limit(min_train_rows, dim); - if (!_spool_file_path.empty()) { - _sample_training_vectors(vectors, num_rows, dim, sample_rows_limit); - return _append_to_spool_file(vectors, num_rows * dim); - } - - const size_t buffered_rows = _buffered_vectors.size() / dim; - const size_t buffered_rows_limit = sample_rows_limit; - if (buffered_rows <= buffered_rows_limit && num_rows <= buffered_rows_limit - buffered_rows) { - _buffered_vectors.insert(_buffered_vectors.end(), vectors, vectors + num_rows * dim); - return Status::OK(); - } - - RETURN_IF_ERROR(_spill_buffered_vectors(dim, sample_rows_limit)); - _sample_training_vectors(vectors, num_rows, dim, sample_rows_limit); - return _append_to_spool_file(vectors, num_rows * dim); -} - -void AnnIndexColumnWriter::_sample_training_vectors(const float* vectors, size_t num_rows, - size_t dim, size_t sample_rows_limit) { - DCHECK(vectors != nullptr); - DCHECK(num_rows > 0); - DCHECK(dim > 0); - DCHECK(sample_rows_limit > 0); - DCHECK(_training_sample.size() % dim == 0); - - for (size_t row = 0; row < num_rows; ++row) { - const float* vector = vectors + row * dim; - ++_training_sample_seen_rows; - const size_t sample_rows = _training_sample.size() / dim; - if (sample_rows < sample_rows_limit) { - _training_sample.insert(_training_sample.end(), vector, vector + dim); - continue; - } - - std::uniform_int_distribution distribution(0, _training_sample_seen_rows - 1); - const uint64_t selected = distribution(_training_sample_rng); - if (selected < sample_rows_limit) { - float* dst = _training_sample.data() + cast_set(selected) * dim; - std::copy(vector, vector + dim, dst); - } - } -} - -Status AnnIndexColumnWriter::_spill_buffered_vectors(size_t dim, size_t sample_rows_limit) { - DCHECK(dim > 0); - DCHECK(_training_sample.empty()); - DCHECK_EQ(_training_sample_seen_rows, 0); - if (!_buffered_vectors.empty()) { - DCHECK(_buffered_vectors.size() % dim == 0); - } - RETURN_IF_ERROR(_ensure_spool_file()); - if (!_buffered_vectors.empty()) { - RETURN_IF_ERROR(_append_to_spool_file(_buffered_vectors.data(), _buffered_vectors.size())); - if (_buffered_vectors.size() / dim <= sample_rows_limit) { - _training_sample.swap(_buffered_vectors); - _training_sample_seen_rows = _training_sample.size() / dim; - } else { - _sample_training_vectors(_buffered_vectors.data(), _buffered_vectors.size() / dim, dim, - sample_rows_limit); - } - } - _release_buffered_vectors(); - return Status::OK(); -} - -Status AnnIndexColumnWriter::_ensure_spool_file() { - if (_spool_file_writer != nullptr) { - return Status::OK(); - } - DORIS_CHECK(ExecEnv::GetInstance()->get_tmp_file_dirs() != nullptr); - _spool_file_path = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir() / - fmt::format("ann_index_build_{}.spool", UniqueId::gen_uid().to_string()); - io::FileWriterOptions opts; - opts.sync_file_data = false; - return io::global_local_filesystem()->create_file(_spool_file_path, &_spool_file_writer, &opts); -} - -Status AnnIndexColumnWriter::_append_to_spool_file(const float* vectors, size_t num_elements) { - const size_t bytes = num_elements * sizeof(float); - return _spool_file_writer->append(Slice(reinterpret_cast(vectors), bytes)); -} - -Status AnnIndexColumnWriter::_flush_spool_writer() { - if (_spool_file_writer == nullptr) { - return Status::OK(); - } - RETURN_IF_ERROR(_spool_file_writer->close()); - _spool_file_writer.reset(); + _buffered_vectors.insert(_buffered_vectors.end(), vectors, vectors + num_rows * dim); return Status::OK(); } @@ -340,89 +193,22 @@ Status AnnIndexColumnWriter::_train_and_add(Int64 min_train_rows) { "Total data size {} is less than minimum {} rows required for ANN index training. " "Skipping index building for this segment.", _total_rows, min_train_rows); - RETURN_IF_ERROR(_flush_spool_writer()); _release_buffered_vectors(); return _index_file_writer->delete_index(_index_meta); } const size_t dim = _vector_index->get_dimension(); - if (_spool_file_path.empty()) { - DCHECK(_buffered_vectors.size() % dim == 0); - const Int64 train_rows = cast_set(_buffered_vectors.size() / dim); - DORIS_CHECK(train_rows >= min_train_rows); - RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); - RETURN_IF_ERROR(_add_vectors_in_chunks(_buffered_vectors.data(), train_rows)); - _release_buffered_vectors(); - } else { - DCHECK(_training_sample.size() % dim == 0); - const Int64 train_rows = cast_set(_training_sample.size() / dim); - DORIS_CHECK(train_rows >= min_train_rows); - RETURN_IF_ERROR(_vector_index->train(train_rows, _training_sample.data())); - { - PODArray empty_training_sample; - _training_sample.swap(empty_training_sample); - } - RETURN_IF_ERROR(_flush_spool_writer()); - RETURN_IF_ERROR(_add_spooled_vectors()); - } + DCHECK(_buffered_vectors.size() % dim == 0); + const Int64 train_rows = cast_set(_buffered_vectors.size() / dim); + DORIS_CHECK(train_rows >= min_train_rows); + RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); + RETURN_IF_ERROR(_add_vectors_in_chunks(_buffered_vectors.data(), train_rows)); + _release_buffered_vectors(); return _vector_index->save(_dir.get()); } -Status AnnIndexColumnWriter::_add_spooled_vectors() { - DCHECK(!_spool_file_path.empty()); - io::FileReaderSPtr reader; - RETURN_IF_ERROR(io::global_local_filesystem()->open_file(_spool_file_path, &reader)); - - const size_t dim = _vector_index->get_dimension(); - const size_t chunk_elements = _add_chunk_rows(dim) * dim; - _read_buffer.resize(chunk_elements); - const size_t buffer_bytes = chunk_elements * sizeof(float); - size_t offset = 0; - while (offset < reader->size()) { - const size_t bytes_to_read = std::min(buffer_bytes, reader->size() - offset); - DCHECK(bytes_to_read % sizeof(float) == 0); - size_t bytes_read = 0; - RETURN_IF_ERROR(reader->read_at( - offset, Slice(reinterpret_cast(_read_buffer.data()), bytes_to_read), - &bytes_read)); - if (bytes_read != bytes_to_read) { - return Status::IOError( - "Failed to read ANN index build spool file {}, expect {} bytes, " - "got {} bytes", - _spool_file_path.native(), bytes_to_read, bytes_read); - } - DCHECK((bytes_read / sizeof(float)) % dim == 0); - RETURN_IF_ERROR(_vector_index->add(cast_set(bytes_read / sizeof(float) / dim), - _read_buffer.data())); - offset += bytes_read; - } - RETURN_IF_ERROR(reader->close()); - PODArray empty_read_buffer; - _read_buffer.swap(empty_read_buffer); - return Status::OK(); -} - void AnnIndexColumnWriter::_release_buffered_vectors() { PODArray empty_buffered_vectors; _buffered_vectors.swap(empty_buffered_vectors); } - -void AnnIndexColumnWriter::_delete_spool_file() { - if (_spool_file_writer != nullptr) { - Status st = _spool_file_writer->close(); - if (!st.ok()) { - LOG(WARNING) << "Failed to close ANN index build spool file " - << _spool_file_path.native() << ": " << st; - } - _spool_file_writer.reset(); - } - if (!_spool_file_path.empty()) { - Status st = io::global_local_filesystem()->delete_file(_spool_file_path); - if (!st.ok()) { - LOG(WARNING) << "Failed to delete ANN index build spool file " - << _spool_file_path.native() << ": " << st; - } - _spool_file_path.clear(); - } -} } // namespace doris::segment_v2 diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 1d8b95ab65556e..72682d6f8cee56 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -24,14 +24,11 @@ #include #include -#include #include #include #include "common/config.h" #include "core/pod_array.h" -#include "io/fs/file_reader_writer_fwd.h" -#include "io/fs/path.h" #include "storage/index/ann/ann_index.h" #include "storage/index/index_file_writer.h" #include "storage/index/index_writer.h" @@ -77,20 +74,10 @@ class AnnIndexColumnWriter : public IndexColumnWriter { private: size_t _chunk_rows_by_bytes(size_t dim) const; size_t _add_chunk_rows(size_t dim) const; - bool _train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const; - size_t _training_sample_rows_limit(Int64 min_train_rows, size_t dim) const; Status _add_vectors_in_chunks(const float* vectors, size_t num_rows); - Status _append_vectors_need_train(const float* vectors, size_t num_rows, Int64 min_train_rows); - void _sample_training_vectors(const float* vectors, size_t num_rows, size_t dim, - size_t sample_rows_limit); - Status _spill_buffered_vectors(size_t dim, size_t sample_rows_limit); - Status _ensure_spool_file(); - Status _append_to_spool_file(const float* vectors, size_t num_elements); - Status _flush_spool_writer(); + Status _append_vectors_need_train(const float* vectors, size_t num_rows); Status _train_and_add(Int64 min_train_rows); - Status _add_spooled_vectors(); void _release_buffered_vectors(); - void _delete_spool_file(); #ifdef BE_TEST friend class TestAnnIndexColumnWriter; @@ -101,12 +88,6 @@ class AnnIndexColumnWriter : public IndexColumnWriter { // This should be a weak_ptr std::shared_ptr _vector_index; PODArray _buffered_vectors; - PODArray _training_sample; - PODArray _read_buffer; - uint64_t _training_sample_seen_rows = 0; - std::mt19937_64 _training_sample_rng {0}; - io::Path _spool_file_path; - io::FileWriterPtr _spool_file_writer; int64_t _total_rows = 0; IndexFileWriter* _index_file_writer; const TabletIndex* _index_meta; diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp b/be/test/storage/index/ann/ann_index_writer_test.cpp index 9846b3093d34bc..e570f2829eef5d 100644 --- a/be/test/storage/index/ann/ann_index_writer_test.cpp +++ b/be/test/storage/index/ann/ann_index_writer_test.cpp @@ -64,16 +64,8 @@ class TestAnnIndexColumnWriter : public AnnIndexColumnWriter { void set_vector_index(std::shared_ptr index) { _vector_index = index; } size_t buffered_vector_capacity() const { return _buffered_vectors.capacity(); } - size_t read_buffer_capacity() const { return _read_buffer.capacity(); } size_t buffered_vector_rows(size_t dim) const { return _buffered_vectors.size() / dim; } - bool has_spool_file() const { return !_spool_file_path.empty(); } size_t add_chunk_rows(size_t dim) const { return _add_chunk_rows(dim); } - size_t training_sample_rows_limit(Int64 min_train_rows, size_t dim) const { - return _training_sample_rows_limit(min_train_rows, dim); - } - bool train_rows_exceed_chunk_bytes(size_t dim, Int64 min_train_rows) const { - return _train_rows_exceed_chunk_bytes(dim, min_train_rows); - } }; class AnnIndexWriterTest : public ::testing::Test { @@ -188,7 +180,6 @@ TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateBuildChunk) { ASSERT_TRUE(writer->init().ok()); EXPECT_EQ(writer->buffered_vector_capacity(), 0); - EXPECT_EQ(writer->read_buffer_capacity(), 0); } TEST_F(AnnIndexWriterTest, TestAddArrayValuesSuccess) { @@ -478,7 +469,6 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsDirectly) { EXPECT_TRUE(status.ok()); } - EXPECT_FALSE(writer->has_spool_file()); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); @@ -536,7 +526,6 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsBuildChunkBytes) { writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_FALSE(writer->has_spool_file()); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); @@ -677,7 +666,6 @@ TEST_F(AnnIndexWriterTest, TestSmallTrainRequiredIndexUsesMemoryBuffer) { writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_FALSE(writer->has_spool_file()); EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); @@ -695,7 +683,6 @@ TEST_F(AnnIndexWriterTest, TestSmallTrainRequiredIndexUsesMemoryBuffer) { status = writer->finish(); EXPECT_TRUE(status.ok()); - EXPECT_FALSE(writer->has_spool_file()); EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } @@ -764,13 +751,13 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsOnceAndAddsAllRows) { EXPECT_TRUE(status.ok()); } - EXPECT_TRUE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 12); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); { testing::InSequence sequence; - EXPECT_CALL(*mock_index, train(10, testing::_)) + EXPECT_CALL(*mock_index, train(12, testing::_)) .Times(1) .WillOnce(testing::Return(Status::OK())); EXPECT_CALL(*mock_index, add(10, testing::_)) @@ -786,12 +773,7 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsOnceAndAddsAllRows) { EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestTrainingSampleUsesReservoirAndMaxRows) { - const int64_t old_max_train_rows = config::ann_index_build_max_train_rows; - config::ann_index_build_max_train_rows = 6; - doris::Defer restore_config { - [&] { config::ann_index_build_max_train_rows = old_max_train_rows; }}; - +TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsWithAllBufferedRows) { auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -825,30 +807,20 @@ TEST_F(AnnIndexWriterTest, TestTrainingSampleUsesReservoirAndMaxRows) { writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_TRUE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); { testing::InSequence sequence; - EXPECT_CALL(*mock_index, train(6, testing::_)) + EXPECT_CALL(*mock_index, train(20, testing::_)) .Times(1) .WillOnce(testing::Invoke([&](Int64 n, const float* vec) { - EXPECT_EQ(n, 6); - bool has_row_after_initial_sample = false; - bool is_prefix_sample = true; + EXPECT_EQ(n, num_rows); for (size_t row = 0; row < static_cast(n); ++row) { const auto row_id = static_cast(vec[row * dim]); - EXPECT_LT(row_id, num_rows); - if (row_id >= static_cast(n)) { - has_row_after_initial_sample = true; - } - if (row_id != row) { - is_prefix_sample = false; - } + EXPECT_EQ(row_id, row); } - EXPECT_TRUE(has_row_after_initial_sample); - EXPECT_FALSE(is_prefix_sample); return Status::OK(); })); EXPECT_CALL(*mock_index, add(10, testing::_)) @@ -879,7 +851,6 @@ TEST_F(AnnIndexWriterTest, TestBuildChunkBytesCapsAddRows) { const size_t dim = 4; EXPECT_EQ(writer->add_chunk_rows(dim), 2); - EXPECT_EQ(writer->training_sample_rows_limit(1, dim), 2); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); @@ -905,7 +876,7 @@ TEST_F(AnnIndexWriterTest, TestBuildChunkBytesCapsAddRows) { EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); { testing::InSequence sequence; - EXPECT_CALL(*mock_index, train(2, testing::_)) + EXPECT_CALL(*mock_index, train(6, testing::_)) .Times(1) .WillOnce(testing::Return(Status::OK())); EXPECT_CALL(*mock_index, add(2, testing::_)) @@ -951,7 +922,6 @@ TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanMinTrainRows) { reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_FALSE(writer->has_spool_file()); EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); @@ -964,7 +934,7 @@ TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanMinTrainRows) { EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestSkipBuildWhenMinTrainRowsExceedsChunkBytes) { +TEST_F(AnnIndexWriterTest, TestMinTrainRowsCanExceedChunkBytes) { const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; config::ann_index_build_chunk_bytes = 32; doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; @@ -981,7 +951,7 @@ TEST_F(AnnIndexWriterTest, TestSkipBuildWhenMinTrainRowsExceedsChunkBytes) { writer->set_vector_index(mock_index); const size_t dim = 4; - EXPECT_TRUE(writer->train_rows_exceed_chunk_bytes(dim, 5)); + EXPECT_EQ(writer->add_chunk_rows(dim), 2); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); @@ -1003,14 +973,23 @@ TEST_F(AnnIndexWriterTest, TestSkipBuildWhenMinTrainRowsExceedsChunkBytes) { reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_FALSE(writer->has_spool_file()); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, save(testing::_)).Times(0); + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, train(6, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(2, testing::_)) + .Times(3) + .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + } status = writer->finish(); EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } TEST_F(AnnIndexWriterTest, TestIVFOnDiskMinTrainRows) { diff --git a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy new file mode 100644 index 00000000000000..bc367918fa00e6 --- /dev/null +++ b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("ivf_pq_full_buffer_train_recall", "nonConcurrent") { + sql "set enable_common_expr_pushdown=true;" + sql "set enable_ann_index_result_cache=false;" + sql "set ivf_nprobe=8;" + + setBeConfigTemporary([ann_index_build_chunk_size: 200]) { + // pq_nbits=1 needs 200 train rows. Setting the build chunk size to 200 + // verifies that add batching does not cap the rows used for training. + sql "drop table if exists tbl_ivf_pq_full_buffer_train_recall" + sql """ + CREATE TABLE tbl_ivf_pq_full_buffer_train_recall ( + id INT NOT NULL, + embedding ARRAY NOT NULL, + INDEX idx_emb (`embedding`) USING ANN PROPERTIES( + "index_type"="ivf", + "metric_type"="l2_distance", + "nlist"="8", + "dim"="4", + "quantizer"="pq", + "pq_m"="2", + "pq_nbits"="1" + ) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ + + def insertData = [] + for (int i = 1; i <= 400; i++) { + if (i == 250) { + insertData.add("(${i}, [0.0, 0.0, 0.0, 0.0])") + } else if (i <= 200) { + insertData.add("(${i}, [1000.0, ${i}.0, ${(i % 17)}.0, ${(i % 19)}.0])") + } else { + insertData.add( + "(${i}, [${(i - 250) / 50.0}, ${(250 - i) / 50.0}, " + + "${(i % 7 - 3) / 10.0}, ${(i % 5 - 2) / 10.0}])") + } + } + sql "INSERT INTO tbl_ivf_pq_full_buffer_train_recall VALUES ${insertData.join(', ')};" + sql "sync" + + def hits = sql """ + select id + from tbl_ivf_pq_full_buffer_train_recall + order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]), id + limit 20; + """ + assertTrue(hits.any { row -> row[0] == 250 }, + "Expected id 250 in ANN top 20, but got ${hits}") + } +} From 582071f0eeb96aa5a7754df2d0ef4ec12745e462 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Wed, 3 Jun 2026 20:08:01 +0800 Subject: [PATCH 03/10] [refactor](be) Rename ANN build add chunk configs --- be/src/common/config.cpp | 14 ++++----- be/src/common/config.h | 8 ++--- be/src/storage/index/ann/ann_index_writer.cpp | 9 +++--- be/src/storage/index/ann/ann_index_writer.h | 8 ++--- .../index/ann/ann_index_writer_test.cpp | 29 ++++++++++--------- ...=> ann_index_build_add_chunk_bytes.groovy} | 16 +++++----- .../ivf_pq_full_buffer_train_recall.groovy | 4 +-- .../suites/ann_index_p0/ivf_pq_recall.groovy | 2 +- 8 files changed, 47 insertions(+), 43 deletions(-) rename regression-test/suites/ann_index_p0/{ann_index_build_chunk_bytes.groovy => ann_index_build_add_chunk_bytes.groovy} (77%) diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index aee6de23764c72..d4787d04d995c0 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1757,17 +1757,17 @@ DEFINE_String(ann_index_ivf_list_cache_limit, "70%"); // Stale sweep time for ANN index IVF list cache in seconds. 3600s is 1 hour. DEFINE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec, "3600"); -// Target row count upper bound for ANN/vector index build add batch. -// The effective build chunk also respects ann_index_build_chunk_bytes. +// Target row count upper bound for ANN/vector index add batch during build. +// The effective add chunk also respects ann_index_build_add_chunk_bytes. // 1M By default. -DEFINE_mInt64(ann_index_build_chunk_size, "1000000"); -DEFINE_Validator(ann_index_build_chunk_size, +DEFINE_mInt64(ann_index_build_add_chunk_size, "1000000"); +DEFINE_Validator(ann_index_build_add_chunk_size, [](const int64_t config) -> bool { return config > 0; }); -// Target byte bound for ANN/vector index build add batch. +// Target byte bound for ANN/vector index add batch during build. // 128MB By default. -DEFINE_mInt64(ann_index_build_chunk_bytes, "134217728"); -DEFINE_Validator(ann_index_build_chunk_bytes, +DEFINE_mInt64(ann_index_build_add_chunk_bytes, "134217728"); +DEFINE_Validator(ann_index_build_add_chunk_bytes, [](const int64_t config) -> bool { return config > 0; }); DEFINE_mBool(enable_wal_tde, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 4678c5b6400f38..9eab19cf50da28 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1809,10 +1809,10 @@ DECLARE_mInt32(max_segment_partial_column_cache_size); DECLARE_String(ann_index_ivf_list_cache_limit); // Stale sweep time for ANN index IVF list cache in seconds. DECLARE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec); -// Target row count upper bound for ANN/vector index build add batch. -DECLARE_mInt64(ann_index_build_chunk_size); -// Target byte bound for ANN/vector index build add batch. -DECLARE_mInt64(ann_index_build_chunk_bytes); +// Target row count upper bound for ANN/vector index add batch during build. +DECLARE_mInt64(ann_index_build_add_chunk_size); +// Target byte bound for ANN/vector index add batch during build. +DECLARE_mInt64(ann_index_build_add_chunk_bytes); DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 1d2b2ab8008f55..f517ba0bf038f3 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -91,18 +91,19 @@ void AnnIndexColumnWriter::close_on_error() { _skip_build = true; } -size_t AnnIndexColumnWriter::_chunk_rows_by_bytes(size_t dim) const { +size_t AnnIndexColumnWriter::_add_chunk_rows_by_bytes(size_t dim) const { DCHECK(dim > 0); static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); DORIS_CHECK(dim <= static_cast(std::numeric_limits::max() / FLOAT_BYTES)); const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; - return cast_set(std::max(1, AnnIndexColumnWriter::chunk_bytes() / vector_bytes)); + return cast_set( + std::max(1, AnnIndexColumnWriter::add_chunk_bytes() / vector_bytes)); } size_t AnnIndexColumnWriter::_add_chunk_rows(size_t dim) const { return cast_set( - std::max(1, std::min(AnnIndexColumnWriter::chunk_size(), - cast_set(_chunk_rows_by_bytes(dim))))); + std::max(1, std::min(AnnIndexColumnWriter::add_chunk_size(), + cast_set(_add_chunk_rows_by_bytes(dim))))); } Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 72682d6f8cee56..b2dc842be74bc7 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -38,14 +38,14 @@ namespace doris::segment_v2 { class AnnIndexColumnWriter : public IndexColumnWriter { public: - static inline int64_t chunk_size() { + static inline int64_t add_chunk_size() { #ifdef BE_TEST return 10; #else - return config::ann_index_build_chunk_size; + return config::ann_index_build_add_chunk_size; #endif } - static inline int64_t chunk_bytes() { return config::ann_index_build_chunk_bytes; } + static inline int64_t add_chunk_bytes() { return config::ann_index_build_add_chunk_bytes; } static constexpr const char* INDEX_TYPE = "index_type"; static constexpr const char* METRIC_TYPE = "metric_type"; static constexpr const char* DIM = "dim"; @@ -72,7 +72,7 @@ class AnnIndexColumnWriter : public IndexColumnWriter { Status finish() override; private: - size_t _chunk_rows_by_bytes(size_t dim) const; + size_t _add_chunk_rows_by_bytes(size_t dim) const; size_t _add_chunk_rows(size_t dim) const; Status _add_vectors_in_chunks(const float* vectors, size_t num_rows); Status _append_vectors_need_train(const float* vectors, size_t num_rows); diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp b/be/test/storage/index/ann/ann_index_writer_test.cpp index e570f2829eef5d..d4ab12a6f646c5 100644 --- a/be/test/storage/index/ann/ann_index_writer_test.cpp +++ b/be/test/storage/index/ann/ann_index_writer_test.cpp @@ -170,7 +170,7 @@ TEST_F(AnnIndexWriterTest, TestInitWithDifferentProperties) { } } -TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateBuildChunk) { +TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateAddChunk) { auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -480,10 +480,11 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsDirectly) { EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsBuildChunkBytes) { - const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; - config::ann_index_build_chunk_bytes = 32; - doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; +TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsAddChunkBytes) { + const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; + config::ann_index_build_add_chunk_bytes = 32; + doris::Defer restore_config { + [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), @@ -833,10 +834,11 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsWithAllBufferedRows) { EXPECT_TRUE(status.ok()); } -TEST_F(AnnIndexWriterTest, TestBuildChunkBytesCapsAddRows) { - const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; - config::ann_index_build_chunk_bytes = 32; - doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; +TEST_F(AnnIndexWriterTest, TestAddChunkBytesCapsAddRows) { + const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; + config::ann_index_build_add_chunk_bytes = 32; + doris::Defer restore_config { + [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), @@ -934,10 +936,11 @@ TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanMinTrainRows) { EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestMinTrainRowsCanExceedChunkBytes) { - const int64_t old_chunk_bytes = config::ann_index_build_chunk_bytes; - config::ann_index_build_chunk_bytes = 32; - doris::Defer restore_config {[&] { config::ann_index_build_chunk_bytes = old_chunk_bytes; }}; +TEST_F(AnnIndexWriterTest, TestMinTrainRowsCanExceedAddChunkBytes) { + const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; + config::ann_index_build_add_chunk_bytes = 32; + doris::Defer restore_config { + [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), diff --git a/regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy b/regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy similarity index 77% rename from regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy rename to regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy index cf0cce386421b9..9944609f275c04 100644 --- a/regression-test/suites/ann_index_p0/ann_index_build_chunk_bytes.groovy +++ b/regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy @@ -15,16 +15,16 @@ // specific language governing permissions and limitations // under the License. -suite("ann_index_build_chunk_bytes", "nonConcurrent") { +suite("ann_index_build_add_chunk_bytes", "nonConcurrent") { sql "set enable_common_expr_pushdown=true;" setBeConfigTemporary([ - ann_index_build_chunk_size: 1000000, - ann_index_build_chunk_bytes: 256 + ann_index_build_add_chunk_size: 1000000, + ann_index_build_add_chunk_bytes: 256 ]) { - sql "drop table if exists ann_index_build_chunk_bytes" + sql "drop table if exists ann_index_build_add_chunk_bytes" sql """ - create table ann_index_build_chunk_bytes ( + create table ann_index_build_add_chunk_bytes ( id int not null, embedding array not null, index idx_embedding (`embedding`) using ann properties( @@ -42,13 +42,13 @@ suite("ann_index_build_chunk_bytes", "nonConcurrent") { for (int i = 1; i <= 8; i++) { rows.add("(${i}, array_with_constant(3072, cast(${i}.0 as float)))") } - sql "insert into ann_index_build_chunk_bytes values ${rows.join(', ')};" + sql "insert into ann_index_build_add_chunk_bytes values ${rows.join(', ')};" sql "sync" - qt_row_count "select count(*) from ann_index_build_chunk_bytes;" + qt_row_count "select count(*) from ann_index_build_add_chunk_bytes;" qt_nearest """ select id - from ann_index_build_chunk_bytes + from ann_index_build_add_chunk_bytes order by l2_distance_approximate(embedding, array_with_constant(3072, cast(1.0 as float))) limit 3; """ diff --git a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy index bc367918fa00e6..cb7a1b2c737fb1 100644 --- a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy +++ b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy @@ -20,8 +20,8 @@ suite("ivf_pq_full_buffer_train_recall", "nonConcurrent") { sql "set enable_ann_index_result_cache=false;" sql "set ivf_nprobe=8;" - setBeConfigTemporary([ann_index_build_chunk_size: 200]) { - // pq_nbits=1 needs 200 train rows. Setting the build chunk size to 200 + setBeConfigTemporary([ann_index_build_add_chunk_size: 200]) { + // pq_nbits=1 needs 200 train rows. Setting the add chunk size to 200 // verifies that add batching does not cap the rows used for training. sql "drop table if exists tbl_ivf_pq_full_buffer_train_recall" sql """ diff --git a/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy index a5cddfeb881e69..d9bd6727744e52 100644 --- a/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy +++ b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy @@ -20,7 +20,7 @@ suite("ivf_pq_recall", "nonConcurrent") { sql "set enable_ann_index_result_cache=false;" sql "set ivf_nprobe=8;" - setBeConfigTemporary([ann_index_build_chunk_size: 400]) { + setBeConfigTemporary([ann_index_build_add_chunk_size: 400]) { sql "drop table if exists ivf_pq_recall" sql """ create table ivf_pq_recall ( From 04d8a048174595050f3fb6792f07bf1a7aceee6b Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Thu, 4 Jun 2026 09:40:22 +0800 Subject: [PATCH 04/10] update. - Test: Regression test - ./run-regression-test.sh --run -d ann_index_p0 -s ivf_pq_full_buffer_train_recall - Behavior changed: No - Does this need documentation: No --- ...s.out => ann_index_build_add_chunk_bytes.out} | 1 + .../ivf_pq_full_buffer_train_recall.out | 4 ++++ .../ivf_pq_full_buffer_train_recall.groovy | 16 +++++++++------- 3 files changed, 14 insertions(+), 7 deletions(-) rename regression-test/data/ann_index_p0/{ann_index_build_chunk_bytes.out => ann_index_build_add_chunk_bytes.out} (99%) create mode 100644 regression-test/data/ann_index_p0/ivf_pq_full_buffer_train_recall.out diff --git a/regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out b/regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out similarity index 99% rename from regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out rename to regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out index 20381ff6259a85..f6353561501136 100644 --- a/regression-test/data/ann_index_p0/ann_index_build_chunk_bytes.out +++ b/regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out @@ -6,3 +6,4 @@ 1 2 3 + diff --git a/regression-test/data/ann_index_p0/ivf_pq_full_buffer_train_recall.out b/regression-test/data/ann_index_p0/ivf_pq_full_buffer_train_recall.out new file mode 100644 index 00000000000000..6c3458e27355a5 --- /dev/null +++ b/regression-test/data/ann_index_p0/ivf_pq_full_buffer_train_recall.out @@ -0,0 +1,4 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !target_in_top20 -- +1 + diff --git a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy index cb7a1b2c737fb1..7cf9ef182e2b3b 100644 --- a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy +++ b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy @@ -58,13 +58,15 @@ suite("ivf_pq_full_buffer_train_recall", "nonConcurrent") { sql "INSERT INTO tbl_ivf_pq_full_buffer_train_recall VALUES ${insertData.join(', ')};" sql "sync" - def hits = sql """ - select id - from tbl_ivf_pq_full_buffer_train_recall - order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]), id - limit 20; + qt_target_in_top20 """ + select count(*) + from ( + select id + from tbl_ivf_pq_full_buffer_train_recall + order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]), id + limit 20 + ) t + where id = 250; """ - assertTrue(hits.any { row -> row[0] == 250 }, - "Expected id 250 in ANN top 20, but got ${hits}") } } From 52f79deb4a1f1855b01a59464be7e2999320f5f3 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Thu, 4 Jun 2026 11:38:49 +0800 Subject: [PATCH 05/10] [chore](be) Add ANN train buffer release comment ### What problem does this PR solve? Issue Number: None Related PR: #64082 Problem Summary: Clarify why ANN index writer swaps the buffered vectors with an empty PODArray instead of using clear(). The swap intentionally releases the full-segment training buffer before saving the index, while clear() would keep the allocated capacity. ### Release note None ### Check List (For Author) - Test: No need to test (comment-only change) - Behavior changed: No - Does this need documentation: No --- be/src/storage/index/ann/ann_index_writer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index f517ba0bf038f3..95cb0e3f2ea5cb 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -209,6 +209,8 @@ Status AnnIndexColumnWriter::_train_and_add(Int64 min_train_rows) { } void AnnIndexColumnWriter::_release_buffered_vectors() { + // PODArray::clear() keeps the allocated capacity. Swap with an empty array so the + // full-segment training buffer is released before saving the index. PODArray empty_buffered_vectors; _buffered_vectors.swap(empty_buffered_vectors); } From 8d33856fd95c57165ba8e2336ff703f7ab942fa9 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Thu, 4 Jun 2026 15:21:34 +0800 Subject: [PATCH 06/10] [refactor](be) Remove ANN build abort flag ### What problem does this PR solve? Issue Number: None Related PR: #64082 Problem Summary: Remove the redundant ANN writer `_skip_build` state. The flag was only set from `close_on_error()`, while normal index skip behavior is already driven by zero rows or by the segment row count being smaller than the index training requirement. Keeping the writer state explicit avoids carrying an abort flag into regular add and finish paths. ### Release note None ### Check List (For Author) - Test: Unit Test - `ENABLE_PCH=OFF ./run-be-ut.sh --run --filter=AnnIndexWriterTest.*` - Behavior changed: No - Does this need documentation: No --- be/src/storage/index/ann/ann_index_writer.cpp | 7 +------ be/src/storage/index/ann/ann_index_writer.h | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 95cb0e3f2ea5cb..a841e40bdb1071 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -88,7 +88,6 @@ Status AnnIndexColumnWriter::add_values(const std::string fn, const void* values void AnnIndexColumnWriter::close_on_error() { _release_buffered_vectors(); - _skip_build = true; } size_t AnnIndexColumnWriter::_add_chunk_rows_by_bytes(size_t dim) const { @@ -126,10 +125,6 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val const float* p = reinterpret_cast(value_ptr); - if (_skip_build) { - return Status::OK(); - } - const Int64 min_train_rows = _vector_index->get_min_train_rows(); if (min_train_rows == 0) { RETURN_IF_ERROR(_add_vectors_in_chunks(p, num_rows)); @@ -154,7 +149,7 @@ int64_t AnnIndexColumnWriter::size() const { } Status AnnIndexColumnWriter::finish() { - if (_skip_build || _total_rows == 0) { + if (_total_rows == 0) { LOG_INFO("No data to train/add for ANN index. Skipping index building."); return _index_file_writer->delete_index(_index_meta); } diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index b2dc842be74bc7..9c06e4993b8360 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -92,6 +92,5 @@ class AnnIndexColumnWriter : public IndexColumnWriter { IndexFileWriter* _index_file_writer; const TabletIndex* _index_meta; std::shared_ptr _dir; - bool _skip_build = false; }; } // namespace doris::segment_v2 From d32d73cd11c7503bcc32ac3d0e28e8f09b2b87c1 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Thu, 4 Jun 2026 18:56:07 +0800 Subject: [PATCH 07/10] Problem Summary: ANN index build used separate add chunk configs and added no-train indexes during segment writing. This made the build strategy harder to reason about and could still spend CPU/memory building small HNSW/FLAT segments that should be skipped by a Doris-side row threshold. This change removes the chunk add configs, buffers ANN vectors for the whole segment, applies effective_min_rows = max(vector_index->get_min_train_rows(), config::ann_index_build_min_segment_rows) in finish(), and then trains when needed, adds once, releases the build buffer, and saves the index. Empty segments or segments below the effective threshold delete only the current index entry instead of persisting an ANN index. Add BE config ann_index_build_min_segment_rows to skip persisting ANN indexes for small segments. Remove ann_index_build_add_chunk_size and ann_index_build_add_chunk_bytes. --- be/src/common/config.cpp | 17 +- be/src/common/config.h | 6 +- be/src/storage/index/ann/ann_index_writer.cpp | 76 ++---- be/src/storage/index/ann/ann_index_writer.h | 19 +- .../index/ann/ann_index_writer_test.cpp | 222 ++++++------------ .../ann_index_build_add_chunk_bytes.out | 9 - .../ann_index_build_add_chunk_bytes.groovy | 56 ----- .../ann_index_build_min_segment_rows.groovy | 66 ++++++ .../ivf_pq_full_buffer_train_recall.groovy | 88 ++++--- .../suites/ann_index_p0/ivf_pq_recall.groovy | 118 +++++----- 10 files changed, 270 insertions(+), 407 deletions(-) delete mode 100644 regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out delete mode 100644 regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy create mode 100644 regression-test/suites/ann_index_p0/ann_index_build_min_segment_rows.groovy diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index d4787d04d995c0..26d1a32b25b337 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1756,19 +1756,10 @@ DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true"); DEFINE_String(ann_index_ivf_list_cache_limit, "70%"); // Stale sweep time for ANN index IVF list cache in seconds. 3600s is 1 hour. DEFINE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec, "3600"); - -// Target row count upper bound for ANN/vector index add batch during build. -// The effective add chunk also respects ann_index_build_add_chunk_bytes. -// 1M By default. -DEFINE_mInt64(ann_index_build_add_chunk_size, "1000000"); -DEFINE_Validator(ann_index_build_add_chunk_size, - [](const int64_t config) -> bool { return config > 0; }); - -// Target byte bound for ANN/vector index add batch during build. -// 128MB By default. -DEFINE_mInt64(ann_index_build_add_chunk_bytes, "134217728"); -DEFINE_Validator(ann_index_build_add_chunk_bytes, - [](const int64_t config) -> bool { return config > 0; }); +// Minimum segment rows required to persist an ANN index. 0 keeps the default behavior. +DEFINE_mInt64(ann_index_build_min_segment_rows, "0"); +DEFINE_Validator(ann_index_build_min_segment_rows, + [](const int64_t config) -> bool { return config >= 0; }); DEFINE_mBool(enable_wal_tde, "false"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 9eab19cf50da28..f39dee69c4ffd8 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1809,10 +1809,8 @@ DECLARE_mInt32(max_segment_partial_column_cache_size); DECLARE_String(ann_index_ivf_list_cache_limit); // Stale sweep time for ANN index IVF list cache in seconds. DECLARE_mInt32(ann_index_ivf_list_cache_stale_sweep_time_sec); -// Target row count upper bound for ANN/vector index add batch during build. -DECLARE_mInt64(ann_index_build_add_chunk_size); -// Target byte bound for ANN/vector index add batch during build. -DECLARE_mInt64(ann_index_build_add_chunk_bytes); +// Minimum segment rows required to persist an ANN index. +DECLARE_mInt64(ann_index_build_min_segment_rows); DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index a841e40bdb1071..18ae3b5b1a1a7a 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -19,7 +19,6 @@ #include #include -#include #include #include @@ -52,6 +51,7 @@ Status AnnIndexColumnWriter::init() { _dir = compound_dir.value(); + _min_segment_rows = AnnIndexColumnWriter::min_segment_rows(); _vector_index = nullptr; const auto& properties = _index_meta->properties(); const std::string index_type = get_or_default(properties, INDEX_TYPE, "hnsw"); @@ -90,21 +90,6 @@ void AnnIndexColumnWriter::close_on_error() { _release_buffered_vectors(); } -size_t AnnIndexColumnWriter::_add_chunk_rows_by_bytes(size_t dim) const { - DCHECK(dim > 0); - static constexpr Int64 FLOAT_BYTES = static_cast(sizeof(float)); - DORIS_CHECK(dim <= static_cast(std::numeric_limits::max() / FLOAT_BYTES)); - const Int64 vector_bytes = cast_set(dim) * FLOAT_BYTES; - return cast_set( - std::max(1, AnnIndexColumnWriter::add_chunk_bytes() / vector_bytes)); -} - -size_t AnnIndexColumnWriter::_add_chunk_rows(size_t dim) const { - return cast_set( - std::max(1, std::min(AnnIndexColumnWriter::add_chunk_size(), - cast_set(_add_chunk_rows_by_bytes(dim))))); -} - Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, const uint8_t* offsets_ptr, size_t num_rows) { @@ -125,12 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val const float* p = reinterpret_cast(value_ptr); - const Int64 min_train_rows = _vector_index->get_min_train_rows(); - if (min_train_rows == 0) { - RETURN_IF_ERROR(_add_vectors_in_chunks(p, num_rows)); - } else { - RETURN_IF_ERROR(_append_vectors_need_train(p, num_rows)); - } + RETURN_IF_ERROR(_append_vectors_to_buffer(p, num_rows)); _total_rows += cast_set(num_rows); return Status::OK(); @@ -155,26 +135,24 @@ Status AnnIndexColumnWriter::finish() { } const Int64 min_train_rows = _vector_index->get_min_train_rows(); - return min_train_rows == 0 ? _vector_index->save(_dir.get()) : _train_and_add(min_train_rows); -} + const Int64 effective_min_rows = _effective_min_rows(min_train_rows); + if (_total_rows < effective_min_rows) { + LOG_INFO( + "Total data size {} is less than minimum {} rows required for ANN index build. " + "Skipping index building for this segment.", + _total_rows, effective_min_rows); + _release_buffered_vectors(); + return _index_file_writer->delete_index(_index_meta); + } -Status AnnIndexColumnWriter::_add_vectors_in_chunks(const float* vectors, size_t num_rows) { - DCHECK(vectors != nullptr); - DCHECK(num_rows > 0); + return _build_and_save(min_train_rows, effective_min_rows); +} - const size_t dim = _vector_index->get_dimension(); - const size_t chunk_rows = _add_chunk_rows(dim); - size_t row_offset = 0; - while (row_offset < num_rows) { - const size_t rows_to_add = std::min(chunk_rows, num_rows - row_offset); - RETURN_IF_ERROR( - _vector_index->add(cast_set(rows_to_add), vectors + row_offset * dim)); - row_offset += rows_to_add; - } - return Status::OK(); +Int64 AnnIndexColumnWriter::_effective_min_rows(Int64 min_train_rows) const { + return std::max(min_train_rows, cast_set(_min_segment_rows)); } -Status AnnIndexColumnWriter::_append_vectors_need_train(const float* vectors, size_t num_rows) { +Status AnnIndexColumnWriter::_append_vectors_to_buffer(const float* vectors, size_t num_rows) { DCHECK(vectors != nullptr); DCHECK(num_rows > 0); @@ -183,29 +161,23 @@ Status AnnIndexColumnWriter::_append_vectors_need_train(const float* vectors, si return Status::OK(); } -Status AnnIndexColumnWriter::_train_and_add(Int64 min_train_rows) { - if (_total_rows < min_train_rows) { - LOG_INFO( - "Total data size {} is less than minimum {} rows required for ANN index training. " - "Skipping index building for this segment.", - _total_rows, min_train_rows); - _release_buffered_vectors(); - return _index_file_writer->delete_index(_index_meta); - } - +Status AnnIndexColumnWriter::_build_and_save(Int64 min_train_rows, Int64 effective_min_rows) { const size_t dim = _vector_index->get_dimension(); DCHECK(_buffered_vectors.size() % dim == 0); const Int64 train_rows = cast_set(_buffered_vectors.size() / dim); - DORIS_CHECK(train_rows >= min_train_rows); - RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); - RETURN_IF_ERROR(_add_vectors_in_chunks(_buffered_vectors.data(), train_rows)); + DORIS_CHECK(train_rows == _total_rows); + DORIS_CHECK(train_rows >= effective_min_rows); + if (min_train_rows > 0) { + RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); + } + RETURN_IF_ERROR(_vector_index->add(train_rows, _buffered_vectors.data())); _release_buffered_vectors(); return _vector_index->save(_dir.get()); } void AnnIndexColumnWriter::_release_buffered_vectors() { // PODArray::clear() keeps the allocated capacity. Swap with an empty array so the - // full-segment training buffer is released before saving the index. + // full-segment build buffer is released before saving the index. PODArray empty_buffered_vectors; _buffered_vectors.swap(empty_buffered_vectors); } diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 9c06e4993b8360..8e279335b670ec 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -38,14 +38,6 @@ namespace doris::segment_v2 { class AnnIndexColumnWriter : public IndexColumnWriter { public: - static inline int64_t add_chunk_size() { -#ifdef BE_TEST - return 10; -#else - return config::ann_index_build_add_chunk_size; -#endif - } - static inline int64_t add_chunk_bytes() { return config::ann_index_build_add_chunk_bytes; } static constexpr const char* INDEX_TYPE = "index_type"; static constexpr const char* METRIC_TYPE = "metric_type"; static constexpr const char* DIM = "dim"; @@ -72,11 +64,11 @@ class AnnIndexColumnWriter : public IndexColumnWriter { Status finish() override; private: - size_t _add_chunk_rows_by_bytes(size_t dim) const; - size_t _add_chunk_rows(size_t dim) const; - Status _add_vectors_in_chunks(const float* vectors, size_t num_rows); - Status _append_vectors_need_train(const float* vectors, size_t num_rows); - Status _train_and_add(Int64 min_train_rows); + static inline int64_t min_segment_rows() { return config::ann_index_build_min_segment_rows; } + + Int64 _effective_min_rows(Int64 min_train_rows) const; + Status _append_vectors_to_buffer(const float* vectors, size_t num_rows); + Status _build_and_save(Int64 min_train_rows, Int64 effective_min_rows); void _release_buffered_vectors(); #ifdef BE_TEST @@ -89,6 +81,7 @@ class AnnIndexColumnWriter : public IndexColumnWriter { std::shared_ptr _vector_index; PODArray _buffered_vectors; int64_t _total_rows = 0; + int64_t _min_segment_rows = 0; IndexFileWriter* _index_file_writer; const TabletIndex* _index_meta; std::shared_ptr _dir; diff --git a/be/test/storage/index/ann/ann_index_writer_test.cpp b/be/test/storage/index/ann/ann_index_writer_test.cpp index d4ab12a6f646c5..20107c90779501 100644 --- a/be/test/storage/index/ann/ann_index_writer_test.cpp +++ b/be/test/storage/index/ann/ann_index_writer_test.cpp @@ -65,7 +65,6 @@ class TestAnnIndexColumnWriter : public AnnIndexColumnWriter { void set_vector_index(std::shared_ptr index) { _vector_index = index; } size_t buffered_vector_capacity() const { return _buffered_vectors.capacity(); } size_t buffered_vector_rows(size_t dim) const { return _buffered_vectors.size() / dim; } - size_t add_chunk_rows(size_t dim) const { return _add_chunk_rows(dim); } }; class AnnIndexWriterTest : public ::testing::Test { @@ -170,7 +169,7 @@ TEST_F(AnnIndexWriterTest, TestInitWithDifferentProperties) { } } -TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateAddChunk) { +TEST_F(AnnIndexWriterTest, TestInitDoesNotPreallocateBuildBuffer) { auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -432,7 +431,7 @@ TEST_F(AnnIndexWriterTest, TestInvalidMetricType) { EXPECT_THROW(writer->init(), doris::Exception); } -TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsDirectly) { +TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsAtFinish) { auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), _tablet_index.get()); @@ -446,45 +445,50 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexAddsDirectly) { EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, add(6, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, save(testing::_)).Times(0); const size_t dim = 4; + constexpr size_t batch_rows = 6; for (int batch = 0; batch < 2; ++batch) { - const size_t num_rows = 6; - std::vector vectors(num_rows * dim); + std::vector vectors(batch_rows * dim); for (size_t i = 0; i < vectors.size(); ++i) { vectors[i] = static_cast(batch * vectors.size() + i); } std::vector offsets; - for (size_t row = 0; row <= num_rows; ++row) { + for (size_t row = 0; row <= batch_rows; ++row) { offsets.push_back(row * dim); } Status status = writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), - num_rows); + batch_rows); EXPECT_TRUE(status.ok()); } + EXPECT_EQ(writer->buffered_vector_rows(dim), 2 * batch_rows); EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + { + testing::InSequence sequence; + EXPECT_CALL(*mock_index, add(12, testing::_)) + .Times(1) + .WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + } Status status = writer->finish(); EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsAddChunkBytes) { - const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; - config::ann_index_build_add_chunk_bytes = 32; +TEST_F(AnnIndexWriterTest, TestNoTrainIndexSkipsWhenRowsLessThanMinSegmentRows) { + const int64_t old_min_segment_rows = config::ann_index_build_min_segment_rows; + config::ann_index_build_min_segment_rows = 5; doris::Defer restore_config { - [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; + [&] { config::ann_index_build_min_segment_rows = old_min_segment_rows; }}; auto mock_index = std::make_shared(); auto writer = std::make_unique(_index_file_writer.get(), @@ -497,23 +501,13 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsAddChunkBytes) { ASSERT_TRUE(writer->init().ok()); writer->set_vector_index(mock_index); - const size_t dim = 4; - EXPECT_EQ(writer->add_chunk_rows(dim), 2); - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); + EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - { - testing::InSequence sequence; - EXPECT_CALL(*mock_index, add(2, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(1, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - } - const size_t num_rows = 5; + const size_t dim = 4; + const size_t num_rows = 3; std::vector vectors(num_rows * dim); for (size_t i = 0; i < vectors.size(); ++i) { vectors[i] = static_cast(i); @@ -527,15 +521,55 @@ TEST_F(AnnIndexWriterTest, TestNoTrainIndexRespectsAddChunkBytes) { writer->add_array_values(sizeof(float), vectors.data(), nullptr, reinterpret_cast(offsets.data()), num_rows); EXPECT_TRUE(status.ok()); - EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(0)); + status = writer->finish(); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); +} + +TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexUsesEffectiveMinSegmentRows) { + const int64_t old_min_segment_rows = config::ann_index_build_min_segment_rows; + config::ann_index_build_min_segment_rows = 10; + doris::Defer restore_config { + [&] { config::ann_index_build_min_segment_rows = old_min_segment_rows; }}; + + auto mock_index = std::make_shared(); + auto writer = std::make_unique(_index_file_writer.get(), + _tablet_index.get()); + + auto fs_dir = std::make_shared(); + fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); + EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); + + ASSERT_TRUE(writer->init().ok()); + writer->set_vector_index(mock_index); + + EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(2)); EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); + EXPECT_CALL(*mock_index, save(testing::_)).Times(0); + + const size_t dim = 4; + const size_t num_rows = 6; + std::vector vectors(num_rows * dim); + for (size_t i = 0; i < vectors.size(); ++i) { + vectors[i] = static_cast(i); + } + std::vector offsets; + for (size_t row = 0; row <= num_rows; ++row) { + offsets.push_back(row * dim); + } + + Status status = + writer->add_array_values(sizeof(float), vectors.data(), nullptr, + reinterpret_cast(offsets.data()), num_rows); + EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); status = writer->finish(); EXPECT_TRUE(status.ok()); + EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } TEST_F(AnnIndexWriterTest, TestCreateFromIndexColumnWriter) { @@ -713,7 +747,6 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsOnceAndAddsAllRows) { EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - // CHUNK_SIZE = 10 const size_t dim = 4; { @@ -761,10 +794,7 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsOnceAndAddsAllRows) { EXPECT_CALL(*mock_index, train(12, testing::_)) .Times(1) .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)) + EXPECT_CALL(*mock_index, add(12, testing::_)) .Times(1) .WillOnce(testing::Return(Status::OK())); EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); @@ -824,66 +854,9 @@ TEST_F(AnnIndexWriterTest, TestTrainRequiredIndexTrainsWithAllBufferedRows) { } return Status::OK(); })); - EXPECT_CALL(*mock_index, add(10, testing::_)) - .Times(2) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - } - - status = writer->finish(); - EXPECT_TRUE(status.ok()); -} - -TEST_F(AnnIndexWriterTest, TestAddChunkBytesCapsAddRows) { - const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; - config::ann_index_build_add_chunk_bytes = 32; - doris::Defer restore_config { - [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; - - auto mock_index = std::make_shared(); - auto writer = std::make_unique(_index_file_writer.get(), - _tablet_index.get()); - - auto fs_dir = std::make_shared(); - fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); - EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); - - ASSERT_TRUE(writer->init().ok()); - writer->set_vector_index(mock_index); - - const size_t dim = 4; - EXPECT_EQ(writer->add_chunk_rows(dim), 2); - - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); - EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - - const size_t num_rows = 6; - std::vector vectors(num_rows * dim); - for (size_t i = 0; i < vectors.size(); ++i) { - vectors[i] = static_cast(i); - } - std::vector offsets; - for (size_t row = 0; row <= num_rows; ++row) { - offsets.push_back(row * dim); - } - - Status status = - writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), num_rows); - EXPECT_TRUE(status.ok()); - EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(1)); - { - testing::InSequence sequence; - EXPECT_CALL(*mock_index, train(6, testing::_)) + EXPECT_CALL(*mock_index, add(20, testing::_)) .Times(1) .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)) - .Times(3) - .WillRepeatedly(testing::Return(Status::OK())); EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); } @@ -936,65 +909,6 @@ TEST_F(AnnIndexWriterTest, TestSkipIndexWhenTotalRowsLessThanMinTrainRows) { EXPECT_EQ(writer->buffered_vector_rows(dim), 0); } -TEST_F(AnnIndexWriterTest, TestMinTrainRowsCanExceedAddChunkBytes) { - const int64_t old_add_chunk_bytes = config::ann_index_build_add_chunk_bytes; - config::ann_index_build_add_chunk_bytes = 32; - doris::Defer restore_config { - [&] { config::ann_index_build_add_chunk_bytes = old_add_chunk_bytes; }}; - - auto mock_index = std::make_shared(); - auto writer = std::make_unique(_index_file_writer.get(), - _tablet_index.get()); - - auto fs_dir = std::make_shared(); - fs_dir->init(doris::io::global_local_filesystem(), "./ut_dir/tmp_vector_search", nullptr); - EXPECT_CALL(*_index_file_writer, open(testing::_)).WillOnce(testing::Return(fs_dir)); - - ASSERT_TRUE(writer->init().ok()); - writer->set_vector_index(mock_index); - - const size_t dim = 4; - EXPECT_EQ(writer->add_chunk_rows(dim), 2); - - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); - EXPECT_CALL(*mock_index, train(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, add(testing::_, testing::_)).Times(0); - EXPECT_CALL(*mock_index, save(testing::_)).Times(0); - - const size_t num_rows = 6; - std::vector vectors(num_rows * dim); - for (size_t i = 0; i < vectors.size(); ++i) { - vectors[i] = static_cast(i); - } - std::vector offsets; - for (size_t row = 0; row <= num_rows; ++row) { - offsets.push_back(row * dim); - } - - Status status = - writer->add_array_values(sizeof(float), vectors.data(), nullptr, - reinterpret_cast(offsets.data()), num_rows); - EXPECT_TRUE(status.ok()); - - EXPECT_EQ(writer->buffered_vector_rows(dim), num_rows); - EXPECT_TRUE(testing::Mock::VerifyAndClearExpectations(mock_index.get())); - EXPECT_CALL(*mock_index, get_min_train_rows()).WillRepeatedly(testing::Return(5)); - { - testing::InSequence sequence; - EXPECT_CALL(*mock_index, train(6, testing::_)) - .Times(1) - .WillOnce(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, add(2, testing::_)) - .Times(3) - .WillRepeatedly(testing::Return(Status::OK())); - EXPECT_CALL(*mock_index, save(testing::_)).Times(1).WillOnce(testing::Return(Status::OK())); - } - - status = writer->finish(); - EXPECT_TRUE(status.ok()); - EXPECT_EQ(writer->buffered_vector_rows(dim), 0); -} - TEST_F(AnnIndexWriterTest, TestIVFOnDiskMinTrainRows) { FaissVectorIndex index; FaissBuildParameter params; diff --git a/regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out b/regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out deleted file mode 100644 index f6353561501136..00000000000000 --- a/regression-test/data/ann_index_p0/ann_index_build_add_chunk_bytes.out +++ /dev/null @@ -1,9 +0,0 @@ --- This file is automatically generated. You should know what you did if you want to edit this --- !row_count -- -8 - --- !nearest -- -1 -2 -3 - diff --git a/regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy b/regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy deleted file mode 100644 index 9944609f275c04..00000000000000 --- a/regression-test/suites/ann_index_p0/ann_index_build_add_chunk_bytes.groovy +++ /dev/null @@ -1,56 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -suite("ann_index_build_add_chunk_bytes", "nonConcurrent") { - sql "set enable_common_expr_pushdown=true;" - - setBeConfigTemporary([ - ann_index_build_add_chunk_size: 1000000, - ann_index_build_add_chunk_bytes: 256 - ]) { - sql "drop table if exists ann_index_build_add_chunk_bytes" - sql """ - create table ann_index_build_add_chunk_bytes ( - id int not null, - embedding array not null, - index idx_embedding (`embedding`) using ann properties( - "index_type" = "hnsw", - "metric_type" = "l2_distance", - "dim" = "3072" - ) - ) engine=olap - duplicate key(id) - distributed by hash(id) buckets 1 - properties("replication_num" = "1"); - """ - - def rows = [] - for (int i = 1; i <= 8; i++) { - rows.add("(${i}, array_with_constant(3072, cast(${i}.0 as float)))") - } - sql "insert into ann_index_build_add_chunk_bytes values ${rows.join(', ')};" - sql "sync" - - qt_row_count "select count(*) from ann_index_build_add_chunk_bytes;" - qt_nearest """ - select id - from ann_index_build_add_chunk_bytes - order by l2_distance_approximate(embedding, array_with_constant(3072, cast(1.0 as float))) - limit 3; - """ - } -} diff --git a/regression-test/suites/ann_index_p0/ann_index_build_min_segment_rows.groovy b/regression-test/suites/ann_index_p0/ann_index_build_min_segment_rows.groovy new file mode 100644 index 00000000000000..01393444830471 --- /dev/null +++ b/regression-test/suites/ann_index_p0/ann_index_build_min_segment_rows.groovy @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("ann_index_build_min_segment_rows", "nonConcurrent") { + sql "unset variable all;" + sql "set enable_common_expr_pushdown=true;" + sql "set experimental_enable_virtual_slot_for_cse=true;" + sql "set enable_no_need_read_data_opt=true;" + sql "set parallel_pipeline_task_num=1;" + sql "set enable_sql_cache=false;" + sql "set enable_condition_cache=false;" + + setBeConfigTemporary([ann_index_build_min_segment_rows: 100]) { + sql "drop table if exists ann_index_build_min_segment_rows" + sql """ + create table ann_index_build_min_segment_rows ( + id int not null, + embedding array not null, + index idx_embedding(`embedding`) using ann properties( + "index_type"="hnsw", + "metric_type"="l2_distance", + "dim"="3" + ) + ) duplicate key(id) + distributed by hash(id) buckets 1 + properties("replication_num"="1"); + """ + + sql """ + insert into ann_index_build_min_segment_rows values + (1, [0.0, 0.0, 0.0]), + (2, [0.1, 0.0, 0.0]), + (3, [0.2, 0.0, 0.0]); + """ + + try { + GetDebugPoint().enableDebugPointForAllBEs( + "segment_iterator._read_columns_by_index", [column_name: "embedding"]) + test { + sql """ + select id + from ann_index_build_min_segment_rows + where l2_distance_approximate(embedding, [0.0, 0.0, 0.0]) < 1.0 + order by id; + """ + exception "does not need to read data" + } + } finally { + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index") + } + } +} diff --git a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy index 7cf9ef182e2b3b..20cccfb28b6e25 100644 --- a/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy +++ b/regression-test/suites/ann_index_p0/ivf_pq_full_buffer_train_recall.groovy @@ -20,53 +20,49 @@ suite("ivf_pq_full_buffer_train_recall", "nonConcurrent") { sql "set enable_ann_index_result_cache=false;" sql "set ivf_nprobe=8;" - setBeConfigTemporary([ann_index_build_add_chunk_size: 200]) { - // pq_nbits=1 needs 200 train rows. Setting the add chunk size to 200 - // verifies that add batching does not cap the rows used for training. - sql "drop table if exists tbl_ivf_pq_full_buffer_train_recall" - sql """ - CREATE TABLE tbl_ivf_pq_full_buffer_train_recall ( - id INT NOT NULL, - embedding ARRAY NOT NULL, - INDEX idx_emb (`embedding`) USING ANN PROPERTIES( - "index_type"="ivf", - "metric_type"="l2_distance", - "nlist"="8", - "dim"="4", - "quantizer"="pq", - "pq_m"="2", - "pq_nbits"="1" - ) - ) ENGINE=OLAP - DUPLICATE KEY(id) - DISTRIBUTED BY HASH(id) BUCKETS 1 - PROPERTIES ("replication_num" = "1"); - """ + sql "drop table if exists tbl_ivf_pq_full_buffer_train_recall" + sql """ + CREATE TABLE tbl_ivf_pq_full_buffer_train_recall ( + id INT NOT NULL, + embedding ARRAY NOT NULL, + INDEX idx_emb (`embedding`) USING ANN PROPERTIES( + "index_type"="ivf", + "metric_type"="l2_distance", + "nlist"="8", + "dim"="4", + "quantizer"="pq", + "pq_m"="2", + "pq_nbits"="1" + ) + ) ENGINE=OLAP + DUPLICATE KEY(id) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num" = "1"); + """ - def insertData = [] - for (int i = 1; i <= 400; i++) { - if (i == 250) { - insertData.add("(${i}, [0.0, 0.0, 0.0, 0.0])") - } else if (i <= 200) { - insertData.add("(${i}, [1000.0, ${i}.0, ${(i % 17)}.0, ${(i % 19)}.0])") - } else { - insertData.add( - "(${i}, [${(i - 250) / 50.0}, ${(250 - i) / 50.0}, " - + "${(i % 7 - 3) / 10.0}, ${(i % 5 - 2) / 10.0}])") - } + def insertData = [] + for (int i = 1; i <= 400; i++) { + if (i == 250) { + insertData.add("(${i}, [0.0, 0.0, 0.0, 0.0])") + } else if (i <= 200) { + insertData.add("(${i}, [1000.0, ${i}.0, ${(i % 17)}.0, ${(i % 19)}.0])") + } else { + insertData.add( + "(${i}, [${(i - 250) / 50.0}, ${(250 - i) / 50.0}, " + + "${(i % 7 - 3) / 10.0}, ${(i % 5 - 2) / 10.0}])") } - sql "INSERT INTO tbl_ivf_pq_full_buffer_train_recall VALUES ${insertData.join(', ')};" - sql "sync" - - qt_target_in_top20 """ - select count(*) - from ( - select id - from tbl_ivf_pq_full_buffer_train_recall - order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]), id - limit 20 - ) t - where id = 250; - """ } + sql "INSERT INTO tbl_ivf_pq_full_buffer_train_recall VALUES ${insertData.join(', ')};" + sql "sync" + + qt_target_in_top20 """ + select count(*) + from ( + select id + from tbl_ivf_pq_full_buffer_train_recall + order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]), id + limit 20 + ) t + where id = 250; + """ } diff --git a/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy index d9bd6727744e52..c1c6a7b7651d03 100644 --- a/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy +++ b/regression-test/suites/ann_index_p0/ivf_pq_recall.groovy @@ -20,68 +20,66 @@ suite("ivf_pq_recall", "nonConcurrent") { sql "set enable_ann_index_result_cache=false;" sql "set ivf_nprobe=8;" - setBeConfigTemporary([ann_index_build_add_chunk_size: 400]) { - sql "drop table if exists ivf_pq_recall" - sql """ - create table ivf_pq_recall ( - id int not null, - embedding array not null, - index idx_embedding (`embedding`) using ann properties( - "index_type" = "ivf", - "metric_type" = "l2_distance", - "nlist" = "8", - "dim" = "4", - "quantizer" = "pq", - "pq_m" = "2", - "pq_nbits" = "2" - ) - ) engine=olap - duplicate key(id) - distributed by hash(id) buckets 1 - properties( - "replication_num" = "1", - "disable_auto_compaction" = "true" - ); - """ + sql "drop table if exists ivf_pq_recall" + sql """ + create table ivf_pq_recall ( + id int not null, + embedding array not null, + index idx_embedding (`embedding`) using ann properties( + "index_type" = "ivf", + "metric_type" = "l2_distance", + "nlist" = "8", + "dim" = "4", + "quantizer" = "pq", + "pq_m" = "2", + "pq_nbits" = "2" + ) + ) engine=olap + duplicate key(id) + distributed by hash(id) buckets 1 + properties( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ); + """ - def formatFloat = { double value -> - String.format(java.util.Locale.ROOT, "%.3f", value) - } - def vector = { double x -> - "[${formatFloat(x)}, ${formatFloat(x * 2)}, ${formatFloat(x * 3)}, ${formatFloat(x * 4)}]" - } - def rows = [] - for (int i = 1; i <= 400; i++) { - double x = (i - 1) / 1000.0 - rows.add("(${i}, ${vector(x)})") - } - for (int i = 401; i <= 800; i++) { - double x = 1000.0 + (i - 401) / 1000.0 - rows.add("(${i}, ${vector(x)})") - } - sql "insert into ivf_pq_recall values ${rows.join(',')};" - sql "sync" + def formatFloat = { double value -> + String.format(java.util.Locale.ROOT, "%.3f", value) + } + def vector = { double x -> + "[${formatFloat(x)}, ${formatFloat(x * 2)}, ${formatFloat(x * 3)}, ${formatFloat(x * 4)}]" + } + def rows = [] + for (int i = 1; i <= 400; i++) { + double x = (i - 1) / 1000.0 + rows.add("(${i}, ${vector(x)})") + } + for (int i = 401; i <= 800; i++) { + double x = 1000.0 + (i - 401) / 1000.0 + rows.add("(${i}, ${vector(x)})") + } + sql "insert into ivf_pq_recall values ${rows.join(',')};" + sql "sync" - qt_row_count "select count(*) from ivf_pq_recall;" + qt_row_count "select count(*) from ivf_pq_recall;" - qt_first_cluster_recall """ - select count(*) from ( - select id - from ivf_pq_recall - order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]) - limit 20 - ) t - where id between 1 and 400; - """ + qt_first_cluster_recall """ + select count(*) from ( + select id + from ivf_pq_recall + order by l2_distance_approximate(embedding, [0.0, 0.0, 0.0, 0.0]) + limit 20 + ) t + where id between 1 and 400; + """ - qt_second_cluster_recall """ - select count(*) from ( - select id - from ivf_pq_recall - order by l2_distance_approximate(embedding, [1000.0, 2000.0, 3000.0, 4000.0]) - limit 20 - ) t - where id between 401 and 800; - """ - } + qt_second_cluster_recall """ + select count(*) from ( + select id + from ivf_pq_recall + order by l2_distance_approximate(embedding, [1000.0, 2000.0, 3000.0, 4000.0]) + limit 20 + ) t + where id between 401 and 800; + """ } From a2d5ec9caabb30ce6b2fee5ffadf0d03e715de89 Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Fri, 5 Jun 2026 12:03:57 +0800 Subject: [PATCH 08/10] [chore](be) Clarify ANN vector dimension validation ### What problem does this PR solve? Issue Number: None Related PR: #64082 Problem Summary: The ANN writer buffers vectors through an internal helper after validating array dimensions in add_array_values(). Add a short comment to make the validation precondition explicit for the buffer helper path. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check - Behavior changed: No - Does this need documentation: No --- be/src/storage/index/ann/ann_index_writer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 18ae3b5b1a1a7a..754d27245e06c0 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -110,6 +110,7 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val const float* p = reinterpret_cast(value_ptr); + // The offsets check above guarantees every array row matches the ANN index dimension. RETURN_IF_ERROR(_append_vectors_to_buffer(p, num_rows)); _total_rows += cast_set(num_rows); From 62a05d747121f84d4b6f32a191f5ea41df4245fc Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Fri, 5 Jun 2026 14:22:46 +0800 Subject: [PATCH 09/10] [chore](be) Inline ANN effective min rows calculation ### What problem does this PR solve? Issue Number: None Related PR: #64082 Problem Summary: The ANN writer used a tiny helper only to compute max(min_train_rows, ann_index_build_min_segment_rows). Inline the single-use calculation in finish() to keep the build threshold logic local and reduce unnecessary indirection. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check - Ran rg to verify _effective_min_rows has no remaining references - Behavior changed: No - Does this need documentation: No --- be/src/storage/index/ann/ann_index_writer.cpp | 7 ++----- be/src/storage/index/ann/ann_index_writer.h | 1 - 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 754d27245e06c0..0c9cf381f75e3b 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -136,7 +136,8 @@ Status AnnIndexColumnWriter::finish() { } const Int64 min_train_rows = _vector_index->get_min_train_rows(); - const Int64 effective_min_rows = _effective_min_rows(min_train_rows); + const Int64 effective_min_rows = + std::max(min_train_rows, cast_set(_min_segment_rows)); if (_total_rows < effective_min_rows) { LOG_INFO( "Total data size {} is less than minimum {} rows required for ANN index build. " @@ -149,10 +150,6 @@ Status AnnIndexColumnWriter::finish() { return _build_and_save(min_train_rows, effective_min_rows); } -Int64 AnnIndexColumnWriter::_effective_min_rows(Int64 min_train_rows) const { - return std::max(min_train_rows, cast_set(_min_segment_rows)); -} - Status AnnIndexColumnWriter::_append_vectors_to_buffer(const float* vectors, size_t num_rows) { DCHECK(vectors != nullptr); DCHECK(num_rows > 0); diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 8e279335b670ec..2d6cd285ba398a 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -66,7 +66,6 @@ class AnnIndexColumnWriter : public IndexColumnWriter { private: static inline int64_t min_segment_rows() { return config::ann_index_build_min_segment_rows; } - Int64 _effective_min_rows(Int64 min_train_rows) const; Status _append_vectors_to_buffer(const float* vectors, size_t num_rows); Status _build_and_save(Int64 min_train_rows, Int64 effective_min_rows); void _release_buffered_vectors(); From 388ae6f272fe02af5a674b51da76f70fb986364a Mon Sep 17 00:00:00 2001 From: kaka11chen Date: Fri, 5 Jun 2026 14:30:45 +0800 Subject: [PATCH 10/10] [chore](be) Inline ANN writer buffer helpers ### What problem does this PR solve? Issue Number: None Related PR: #64082 Problem Summary: The ANN writer had small single-use helpers and a cached min segment rows member after switching to finish-time buffering. Inline vector buffering, buffer release, and direct ann_index_build_min_segment_rows access at their call sites to keep the writer implementation simpler. ### Release note None ### Check List (For Author) - Test: Manual test - Ran git diff --check - Ran rg to verify _append_vectors_to_buffer, _release_buffered_vectors, _min_segment_rows, and min_segment_rows() have no remaining references - Behavior changed: No - Does this need documentation: No --- be/src/storage/index/ann/ann_index_writer.cpp | 28 ++++++------------- be/src/storage/index/ann/ann_index_writer.h | 6 ---- 2 files changed, 9 insertions(+), 25 deletions(-) diff --git a/be/src/storage/index/ann/ann_index_writer.cpp b/be/src/storage/index/ann/ann_index_writer.cpp index 0c9cf381f75e3b..21911417c4f9a1 100644 --- a/be/src/storage/index/ann/ann_index_writer.cpp +++ b/be/src/storage/index/ann/ann_index_writer.cpp @@ -23,6 +23,7 @@ #include #include "common/cast_set.h" +#include "common/config.h" #include "storage/index/ann/faiss_ann_index.h" #include "storage/index/inverted/inverted_index_fs_directory.h" @@ -51,7 +52,6 @@ Status AnnIndexColumnWriter::init() { _dir = compound_dir.value(); - _min_segment_rows = AnnIndexColumnWriter::min_segment_rows(); _vector_index = nullptr; const auto& properties = _index_meta->properties(); const std::string index_type = get_or_default(properties, INDEX_TYPE, "hnsw"); @@ -87,7 +87,8 @@ Status AnnIndexColumnWriter::add_values(const std::string fn, const void* values } void AnnIndexColumnWriter::close_on_error() { - _release_buffered_vectors(); + PODArray empty_buffered_vectors; + _buffered_vectors.swap(empty_buffered_vectors); } Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* value_ptr, @@ -111,7 +112,8 @@ Status AnnIndexColumnWriter::add_array_values(size_t field_size, const void* val const float* p = reinterpret_cast(value_ptr); // The offsets check above guarantees every array row matches the ANN index dimension. - RETURN_IF_ERROR(_append_vectors_to_buffer(p, num_rows)); + DCHECK(p != nullptr); + _buffered_vectors.insert(_buffered_vectors.end(), p, p + num_rows * dim); _total_rows += cast_set(num_rows); return Status::OK(); @@ -137,28 +139,20 @@ Status AnnIndexColumnWriter::finish() { const Int64 min_train_rows = _vector_index->get_min_train_rows(); const Int64 effective_min_rows = - std::max(min_train_rows, cast_set(_min_segment_rows)); + std::max(min_train_rows, cast_set(config::ann_index_build_min_segment_rows)); if (_total_rows < effective_min_rows) { LOG_INFO( "Total data size {} is less than minimum {} rows required for ANN index build. " "Skipping index building for this segment.", _total_rows, effective_min_rows); - _release_buffered_vectors(); + PODArray empty_buffered_vectors; + _buffered_vectors.swap(empty_buffered_vectors); return _index_file_writer->delete_index(_index_meta); } return _build_and_save(min_train_rows, effective_min_rows); } -Status AnnIndexColumnWriter::_append_vectors_to_buffer(const float* vectors, size_t num_rows) { - DCHECK(vectors != nullptr); - DCHECK(num_rows > 0); - - const size_t dim = _vector_index->get_dimension(); - _buffered_vectors.insert(_buffered_vectors.end(), vectors, vectors + num_rows * dim); - return Status::OK(); -} - Status AnnIndexColumnWriter::_build_and_save(Int64 min_train_rows, Int64 effective_min_rows) { const size_t dim = _vector_index->get_dimension(); DCHECK(_buffered_vectors.size() % dim == 0); @@ -169,14 +163,10 @@ Status AnnIndexColumnWriter::_build_and_save(Int64 min_train_rows, Int64 effecti RETURN_IF_ERROR(_vector_index->train(train_rows, _buffered_vectors.data())); } RETURN_IF_ERROR(_vector_index->add(train_rows, _buffered_vectors.data())); - _release_buffered_vectors(); - return _vector_index->save(_dir.get()); -} - -void AnnIndexColumnWriter::_release_buffered_vectors() { // PODArray::clear() keeps the allocated capacity. Swap with an empty array so the // full-segment build buffer is released before saving the index. PODArray empty_buffered_vectors; _buffered_vectors.swap(empty_buffered_vectors); + return _vector_index->save(_dir.get()); } } // namespace doris::segment_v2 diff --git a/be/src/storage/index/ann/ann_index_writer.h b/be/src/storage/index/ann/ann_index_writer.h index 2d6cd285ba398a..67061bef9219a8 100644 --- a/be/src/storage/index/ann/ann_index_writer.h +++ b/be/src/storage/index/ann/ann_index_writer.h @@ -27,7 +27,6 @@ #include #include -#include "common/config.h" #include "core/pod_array.h" #include "storage/index/ann/ann_index.h" #include "storage/index/index_file_writer.h" @@ -64,11 +63,7 @@ class AnnIndexColumnWriter : public IndexColumnWriter { Status finish() override; private: - static inline int64_t min_segment_rows() { return config::ann_index_build_min_segment_rows; } - - Status _append_vectors_to_buffer(const float* vectors, size_t num_rows); Status _build_and_save(Int64 min_train_rows, Int64 effective_min_rows); - void _release_buffered_vectors(); #ifdef BE_TEST friend class TestAnnIndexColumnWriter; @@ -80,7 +75,6 @@ class AnnIndexColumnWriter : public IndexColumnWriter { std::shared_ptr _vector_index; PODArray _buffered_vectors; int64_t _total_rows = 0; - int64_t _min_segment_rows = 0; IndexFileWriter* _index_file_writer; const TabletIndex* _index_meta; std::shared_ptr _dir;