From 92bef049a535690c2830188da6801533f7f859b5 Mon Sep 17 00:00:00 2001 From: Jianghua Yang Date: Tue, 19 May 2026 04:16:00 +0800 Subject: [PATCH 1/2] PAX: include bloomfilter columns in DELETE stats projection DeleteWithVisibilityMap projected only minmax_columns into the reader, then asked UpdateStatsInAuxTable to refresh bloomfilter stats too. When a bloomfilter column was not also a minmax column, the reader did not materialize it, and MicroPartitionStats::AddRow dereferenced an uninitialized slot value, crashing the segment with SIGSEGV. Project the union of minmax and bloomfilter column indexes, while keeping the original lists passed to UpdateStatsInAuxTable so per-stat semantics are unchanged. Hoist the GetBloomFilterColumnIndexes() call out of the per-block loop. Fixes apache/cloudberry#1749 --- contrib/pax_storage/src/cpp/storage/pax.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/contrib/pax_storage/src/cpp/storage/pax.cc b/contrib/pax_storage/src/cpp/storage/pax.cc index 69282738f4c..494962b6cde 100644 --- a/contrib/pax_storage/src/cpp/storage/pax.cc +++ b/contrib/pax_storage/src/cpp/storage/pax.cc @@ -27,8 +27,11 @@ #include "storage/pax.h" +#include +#include #include #include +#include #include "access/pax_visimap.h" #include "access/paxc_rel_options.h" @@ -595,6 +598,8 @@ void TableDeleter::DeleteWithVisibilityMap( return; } std::vector min_max_col_idxs; + std::vector bf_col_idxs; + std::vector stats_proj_col_idxs; auto stats_updater_projection = std::make_shared(); std::unique_ptr visi_bitmap; @@ -602,7 +607,14 @@ void TableDeleter::DeleteWithVisibilityMap( auto rel_path = cbdb::BuildPaxDirectoryPath(rel_->rd_node, rel_->rd_backend); min_max_col_idxs = cbdb::GetMinMaxColumnIndexes(rel_); - stats_updater_projection->SetColumnProjection(min_max_col_idxs, + bf_col_idxs = cbdb::GetBloomFilterColumnIndexes(rel_); + + // Projection must cover minmax ∪ bloomfilter columns; otherwise + // AddRow reads uninitialized slot values for bf columns (issue #1749). + std::set_union(min_max_col_idxs.begin(), min_max_col_idxs.end(), + bf_col_idxs.begin(), bf_col_idxs.end(), + std::back_inserter(stats_proj_col_idxs)); + stats_updater_projection->SetColumnProjection(stats_proj_col_idxs, rel_->rd_att->natts); do { auto it = iterator->Next(); @@ -675,7 +687,7 @@ void TableDeleter::DeleteWithVisibilityMap( UpdateStatsInAuxTable( catalog_update, micro_partition_metadata, std::make_shared(visi_bitmap->Raw()), min_max_col_idxs, - cbdb::GetBloomFilterColumnIndexes(rel_), stats_updater_projection); + bf_col_idxs, stats_updater_projection); // write pg_pax_blocks_oid catalog_update.UpdateVisimap(block_id, visimap_file_name); From b5fec85472d2d6d8c3e48f91b1281387c5f95c28 Mon Sep 17 00:00:00 2001 From: Jianghua Yang Date: Tue, 19 May 2026 04:22:11 +0800 Subject: [PATCH 2/2] PAX: add regression test for DELETE bloomfilter stats crash Covers three layouts that previously crashed the segment when bloomfilter columns were not projected by DeleteWithVisibilityMap: - bloomfilter column outside minmax_columns - bloomfilter only, no minmax - overlapping minmax and bloomfilter sets Refs apache/cloudberry#1749 --- .../expected/delete_bloom_stats.out | 60 +++++++++++++++++++ contrib/pax_storage/pax_schedule | 1 + .../pax_storage/sql/delete_bloom_stats.sql | 48 +++++++++++++++ 3 files changed, 109 insertions(+) create mode 100644 contrib/pax_storage/expected/delete_bloom_stats.out create mode 100644 contrib/pax_storage/sql/delete_bloom_stats.sql diff --git a/contrib/pax_storage/expected/delete_bloom_stats.out b/contrib/pax_storage/expected/delete_bloom_stats.out new file mode 100644 index 00000000000..65d4d12986d --- /dev/null +++ b/contrib/pax_storage/expected/delete_bloom_stats.out @@ -0,0 +1,60 @@ +-- Regression test for issue #1749: +-- PAX DELETE crashes with SIGSEGV when bloomfilter_columns are not a +-- subset of minmax_columns. The stats refresh inside +-- DeleteWithVisibilityMap must project every column it reads. +-- Case 1: bloomfilter column (payload) is NOT in minmax_columns. +-- Pre-fix: segment crashed on DELETE. +drop table if exists pax_delete_bloom_crash; +NOTICE: table "pax_delete_bloom_crash" does not exist, skipping +create table pax_delete_bloom_crash (id int, k int, payload text) +using pax +with (minmax_columns = 'id', bloomfilter_columns = 'payload'); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into pax_delete_bloom_crash +select i, i % 10, 'payload-' || i::text +from generate_series(1, 10000) as i; +delete from pax_delete_bloom_crash where id between 1 and 100; +select count(*) from pax_delete_bloom_crash; + count +------- + 9900 +(1 row) + +drop table pax_delete_bloom_crash; +-- Case 2: bloomfilter only, no minmax columns. +drop table if exists pax_delete_bf_only; +NOTICE: table "pax_delete_bf_only" does not exist, skipping +create table pax_delete_bf_only (id int, payload text) +using pax +with (bloomfilter_columns = 'payload'); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into pax_delete_bf_only +select i, 'payload-' || i::text from generate_series(1, 5000) as i; +delete from pax_delete_bf_only where id between 1 and 50; +select count(*) from pax_delete_bf_only; + count +------- + 4950 +(1 row) + +drop table pax_delete_bf_only; +-- Case 3: minmax and bloomfilter columns overlap but neither is a subset. +drop table if exists pax_delete_mm_bf_mixed; +NOTICE: table "pax_delete_mm_bf_mixed" does not exist, skipping +create table pax_delete_mm_bf_mixed (id int, k int, payload text) +using pax +with (minmax_columns = 'id,payload', bloomfilter_columns = 'k,payload'); +NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'id' as the Apache Cloudberry data distribution key for this table. +HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. +insert into pax_delete_mm_bf_mixed +select i, i % 7, 'p-' || i::text from generate_series(1, 5000) as i; +delete from pax_delete_mm_bf_mixed where id between 1 and 50; +select count(*) from pax_delete_mm_bf_mixed; + count +------- + 4950 +(1 row) + +drop table pax_delete_mm_bf_mixed; diff --git a/contrib/pax_storage/pax_schedule b/contrib/pax_storage/pax_schedule index f9818f8e0b7..0f24f7e2028 100644 --- a/contrib/pax_storage/pax_schedule +++ b/contrib/pax_storage/pax_schedule @@ -6,6 +6,7 @@ test: alter_distributed test: toast toast_failed detoast ddl numeric types filter test: update test: statistics_bloom_filter +test: delete_bloom_stats test: filter_tree filter_tree_arithmetic test: filter_tree_root_quals diff --git a/contrib/pax_storage/sql/delete_bloom_stats.sql b/contrib/pax_storage/sql/delete_bloom_stats.sql new file mode 100644 index 00000000000..c422d21ffa8 --- /dev/null +++ b/contrib/pax_storage/sql/delete_bloom_stats.sql @@ -0,0 +1,48 @@ +-- Regression test for issue #1749: +-- PAX DELETE crashes with SIGSEGV when bloomfilter_columns are not a +-- subset of minmax_columns. The stats refresh inside +-- DeleteWithVisibilityMap must project every column it reads. + +-- Case 1: bloomfilter column (payload) is NOT in minmax_columns. +-- Pre-fix: segment crashed on DELETE. +drop table if exists pax_delete_bloom_crash; +create table pax_delete_bloom_crash (id int, k int, payload text) +using pax +with (minmax_columns = 'id', bloomfilter_columns = 'payload'); + +insert into pax_delete_bloom_crash +select i, i % 10, 'payload-' || i::text +from generate_series(1, 10000) as i; + +delete from pax_delete_bloom_crash where id between 1 and 100; +select count(*) from pax_delete_bloom_crash; + +drop table pax_delete_bloom_crash; + +-- Case 2: bloomfilter only, no minmax columns. +drop table if exists pax_delete_bf_only; +create table pax_delete_bf_only (id int, payload text) +using pax +with (bloomfilter_columns = 'payload'); + +insert into pax_delete_bf_only +select i, 'payload-' || i::text from generate_series(1, 5000) as i; + +delete from pax_delete_bf_only where id between 1 and 50; +select count(*) from pax_delete_bf_only; + +drop table pax_delete_bf_only; + +-- Case 3: minmax and bloomfilter columns overlap but neither is a subset. +drop table if exists pax_delete_mm_bf_mixed; +create table pax_delete_mm_bf_mixed (id int, k int, payload text) +using pax +with (minmax_columns = 'id,payload', bloomfilter_columns = 'k,payload'); + +insert into pax_delete_mm_bf_mixed +select i, i % 7, 'p-' || i::text from generate_series(1, 5000) as i; + +delete from pax_delete_mm_bf_mixed where id between 1 and 50; +select count(*) from pax_delete_mm_bf_mixed; + +drop table pax_delete_mm_bf_mixed;