From 41de75e4528a8aa9cecf6f1d4743d3e207fceecb Mon Sep 17 00:00:00 2001 From: Jie Yao Date: Thu, 18 Jun 2026 15:11:15 +0800 Subject: [PATCH] redo destroy pg --- conanfile.py | 3 +- src/lib/homestore_backend/gc_manager.cpp | 11 +++-- src/lib/homestore_backend/hs_homeobject.hpp | 4 +- src/lib/homestore_backend/hs_pg_manager.cpp | 42 ++++++++---------- .../replication_state_machine.cpp | 43 ++++++++++++++++++- .../tests/homeobj_fixture.hpp | 2 +- .../homestore_backend/tests/hs_pg_tests.cpp | 38 ++++++++++++++++ .../tests/hs_shard_tests.cpp | 7 ++- 8 files changed, 112 insertions(+), 38 deletions(-) diff --git a/conanfile.py b/conanfile.py index 17ec62bae..009c6b544 100644 --- a/conanfile.py +++ b/conanfile.py @@ -10,7 +10,7 @@ class HomeObjectConan(ConanFile): name = "homeobject" - version = "4.1.18" + version = "4.1.19" homepage = "https://github.com/eBay/HomeObject" description = "Blob Store built on HomeStore" @@ -26,6 +26,7 @@ class HomeObjectConan(ConanFile): "coverage": ['True', 'False'], "sanitize": ['True', 'False'], } + default_options = { 'shared': False, 'fPIC': True, diff --git a/src/lib/homestore_backend/gc_manager.cpp b/src/lib/homestore_backend/gc_manager.cpp index df7baf343..c93eb43dc 100644 --- a/src/lib/homestore_backend/gc_manager.cpp +++ b/src/lib/homestore_backend/gc_manager.cpp @@ -365,7 +365,7 @@ folly::SemiFuture< bool > GCManager::pdev_gc_actor::add_gc_task(uint8_t priority const auto pg_id = EXvchunk->m_pg_id.value(); m_hs_home_object->gc_manager()->incr_pg_pending_gc_task(pg_id); - if (!m_hs_home_object->can_chunks_in_pg_be_gc(pg_id)) { + if (!m_hs_home_object->is_pg_alive(pg_id)) { LOGDEBUGMOD(gcmgr, "chunk_id={} belongs to pg {}, which is not eligible for gc at this moment!", move_from_chunk, pg_id) m_hs_home_object->gc_manager()->decr_pg_pending_gc_task(pg_id); @@ -504,7 +504,7 @@ void GCManager::pdev_gc_actor::handle_recovered_gc_task( } // we have no gc_task_guard for recovered gc task, so we need to do this manually to make sure the gc task can be - // marked as completed and the pg can be marked as available for new gc task + // marked as completed on_gc_task_completed(priority, pg_id, move_from_chunk, move_to_chunk, vchunk_id, true, 0); GCLOGD(RECOVERD_GC_TASK_ID, pg_id, NO_SHARD_ID, @@ -797,9 +797,8 @@ bool GCManager::pdev_gc_actor::copy_valid_data( move_from_chunk); } - // check if all the pbas in the valid_blob_indexes are in move_from_chunk, if not, it means the - // shard is being modified during gc, we can not guarantee the data consistency, so we fail this gc - // task and let it be retried later. + // check if all the pbas in the valid_blob_indexes are in move_from_chunk, if not, we cancel this task and retry + // later. for (const auto& [blob, v] : valid_blob_indexes) { auto pba = v.pbas(); if (pba.chunk_num() != move_from_chunk) { @@ -1100,7 +1099,7 @@ bool GCManager::pdev_gc_actor::purge_reserved_chunk(chunk_id_t chunk, const uint RELEASE_ASSERT(!vchunk->m_pg_id.has_value(), "chunk_id={} is expected to be a reserved chunk, and not belong to a pg", chunk); RELEASE_ASSERT(vchunk->m_state == ChunkState::GC, - "chunk_id={} is a reserved chunk, expected to have a GC state, but actuall state is {} ", chunk, + "chunk_id={} is a reserved chunk, expected to have a GC state, but the actual state is {} ", chunk, vchunk->m_state); // Clear all rreqs on the reserved chunk BEFORE reset() resets its allocator. diff --git a/src/lib/homestore_backend/hs_homeobject.hpp b/src/lib/homestore_backend/hs_homeobject.hpp index 75c182b26..51a0b8d00 100644 --- a/src/lib/homestore_backend/hs_homeobject.hpp +++ b/src/lib/homestore_backend/hs_homeobject.hpp @@ -881,6 +881,8 @@ class HSHomeObject : public HomeObjectImpl { */ bool pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine = false); + void destroy_pg_resource(pg_id_t pg_id); + bool pause_pg_state_machine(pg_id_t pg_id); bool resume_pg_state_machine(pg_id_t pg_id); @@ -977,7 +979,7 @@ class HSHomeObject : public HomeObjectImpl { * @param pg_id The ID of the PG whose shards are to be destroyed. * @return True if the chunks in the PG can be garbage collected, false otherwise. */ - bool can_chunks_in_pg_be_gc(pg_id_t pg_id) const; + bool is_pg_alive(pg_id_t pg_id) const; bool pg_exists(pg_id_t pg_id) const; diff --git a/src/lib/homestore_backend/hs_pg_manager.cpp b/src/lib/homestore_backend/hs_pg_manager.cpp index 09c23ef91..cce7164cd 100644 --- a/src/lib/homestore_backend/hs_pg_manager.cpp +++ b/src/lib/homestore_backend/hs_pg_manager.cpp @@ -688,29 +688,32 @@ std::optional< pg_id_t > HSHomeObject::get_pg_id_with_group_id(group_id_t group_ void HSHomeObject::_destroy_pg(pg_id_t pg_id) { pg_destroy(pg_id); } +void HSHomeObject::destroy_pg_resource(pg_id_t pg_id) { + destroy_shards(pg_id); + destroy_hs_resources(pg_id); + destroy_pg_index_table(pg_id); + destroy_pg_superblk(pg_id); + + // return pg chunks to dev heap + // which must be done after destroying pg super blk to avoid multiple pg use same chunks + bool res = chunk_selector_->return_pg_chunks_to_dev_heap(pg_id); + RELEASE_ASSERT(res, "Failed to return pg={} chunks to dev_heap", pg_id); + LOGI("resource of pg={} is destroyed", pg_id); +} + bool HSHomeObject::pg_destroy(pg_id_t pg_id, bool need_to_pause_pg_state_machine) { if (need_to_pause_pg_state_machine && !pause_pg_state_machine(pg_id)) { LOGI("Failed to pause pg state machine, pg_id={}", pg_id); return false; } + LOGI("Destroying pg={}", pg_id); mark_pg_destroyed(pg_id); // we have the assumption that after pg is marked as destroyed, it will not be marked as alive again. // TODO:: if this assumption is broken, we need to handle it. gc_mgr_->drain_pg_pending_gc_task(pg_id); - - destroy_shards(pg_id); - destroy_hs_resources(pg_id); - destroy_pg_index_table(pg_id); - destroy_pg_superblk(pg_id); - - // return pg chunks to dev heap - // which must be done after destroying pg super blk to avoid multiple pg use same chunks - bool res = chunk_selector_->return_pg_chunks_to_dev_heap(pg_id); - RELEASE_ASSERT(res, "Failed to return pg={} chunks to dev_heap", pg_id); - - LOGI("pg={} is destroyed", pg_id); + destroy_pg_resource(pg_id); return true; } @@ -800,7 +803,7 @@ void HSHomeObject::mark_pg_destroyed(pg_id_t pg_id) { LOGD("pg={} is marked as destroyed", pg_id); } -bool HSHomeObject::can_chunks_in_pg_be_gc(pg_id_t pg_id) const { +bool HSHomeObject::is_pg_alive(pg_id_t pg_id) const { auto lg = std::scoped_lock(_pg_lock); auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); if (hs_pg == nullptr) { @@ -949,8 +952,7 @@ void HSHomeObject::on_pg_meta_blk_found(sisl::byte_view const& buf, void* meta_c hs_pg->index_table_ = it->second.index_table; it->second.pg_id = pg_id; } else { - RELEASE_ASSERT(hs_pg->pg_sb_->state == PGState::DESTROYED, "IndexTable should be recovered before PG"); - hs_pg->index_table_ = nullptr; + RELEASE_ASSERT(hs_pg->pg_sb_->state == PGState::DESTROYED, "IndexTable should be recovered for alive PG"); LOGI("Index table not found for destroyed pg={}, index_table_uuid={}", pg_id, uuid_str); } @@ -1262,17 +1264,11 @@ uint32_t HSHomeObject::get_pg_tombstone_blob_count(pg_id_t pg_id) const { } void HSHomeObject::refresh_pg_statistics(pg_id_t pg_id) { + RELEASE_ASSERT(is_pg_alive(pg_id), "pg={} should be alive", pg_id); auto hs_pg = const_cast< HS_PG* >(_get_hs_pg_unlocked(pg_id)); RELEASE_ASSERT(hs_pg, "Failed to get pg={} for statistics refresh", pg_id); auto pg_index_table = hs_pg->index_table_; - if (!pg_index_table) { - if (hs_pg->pg_sb_->state == PGState::DESTROYED) { - LOGI("pg={} is destroyed, skip statistics refresh", pg_id); - } else { - RELEASE_ASSERT(false, "index table is not found for pg={} and not in PGState::DESTROYED state", pg_id); - } - return; - } + RELEASE_ASSERT(pg_index_table, "pg is alive, index table should be found for pg={}", pg_id); // Step 1: Scan index table to count active and tombstone blobs in one pass uint64_t active_count = 0; diff --git a/src/lib/homestore_backend/replication_state_machine.cpp b/src/lib/homestore_backend/replication_state_machine.cpp index 81ec3e6d0..7501bf109 100644 --- a/src/lib/homestore_backend/replication_state_machine.cpp +++ b/src/lib/homestore_backend/replication_state_machine.cpp @@ -449,6 +449,10 @@ void ReplicationStateMachine::write_snapshot_obj(std::shared_ptr< homestore::sna set_snapshot_context(context); // Update the snapshot context in case apply_snapshot is not called auto hs_pg = home_object_->get_hs_pg(m_snp_rcv_handler->get_context_pg_id()); hs_pg->pg_state_.clear_state(PGStateMask::BASELINE_RESYNC); + // we only reset this if destroying pg happens in BR case. for other cases (on_destroy and _exit_pg), + // since this replica will leave the PG and no later logs will be received, no need to reset this. + reset_no_space_left_error_info(); + repl_dev()->reset_latch_lsn(); return; } @@ -499,7 +503,7 @@ void ReplicationStateMachine::write_snapshot_obj(std::shared_ptr< homestore::sna if (home_object_->pg_exists(pg_data->pg_id())) { LOGI("pg already exists, clean pg resources before snapshot, pg={} {}", pg_data->pg_id(), log_suffix); // Need to pause state machine before destroying the PG, if fail, let raft retry. - if (!home_object_->pg_destroy(pg_data->pg_id(), true /* pause state machine */)) { + if (!home_object_->pg_destroy(pg_data->pg_id())) { LOGE("failed to destroy existing pg, let raft retry, pg={} {}", pg_data->pg_id(), log_suffix); return; } @@ -1030,7 +1034,42 @@ void ReplicationStateMachine::on_log_replay_done(const homestore::group_id_t& gr const auto pg_id = pg_id_opt.value(); RELEASE_ASSERT(home_object_->pg_exists(pg_id), "pg={} should exist, but not! fatal error!", pg_id); - const auto& shards_in_pg = (const_cast< HSHomeObject::HS_PG* >(home_object_->_get_hs_pg_unlocked(pg_id)))->shards_; + const auto hs_pg = (const_cast< HSHomeObject::HS_PG* >(home_object_->get_hs_pg(pg_id))); + RELEASE_ASSERT(hs_pg, "Failed to get pg={} when log replay done", pg_id); + if (hs_pg->pg_sb_->state == PGState::DESTROYED) { + // if we find a pg with a state of destroyed in recovery path, we can make sure that pg_destroy was called and + // crash occured before it was completed. + + // pg_destroy will be called in the following scenarios: + // 1 baseline resync: when the first snapshot message is received, if the pg already exists, we will call + // pg_destroy to clean up the stale pg resources before resync. + + // 2 exit_pg: when processing exit_pg request, we will call pg_destroy to destroy the pg resources. + + // 3 RaftReplDev::leave() is called and thus RaftReplDev::permanent_destroy() is called from nuraft_mesg. and + // RaftReplDev::leave() will be called in the following scenarios: + // a. destroy_repl_dev: commit log entry of journal_type_t::HS_CTRL_DESTROY + // b. removed from raft group: the move_out member in replace member. + + // for baseline resync, no need to redo destroy pg here since the first snapshot message will be received again + // and trigger pg_destroy again if the pg already exists. but for other cases, we need to redo destory pg to + // clean up the stale pg resources since no message will be received to trigger pg_destroy again. so, we call it + // here for all the above cases to make sure the stale pg resources are cleaned up. + + // there is also a concern that in baseline resync case, if some resource is destroyed in pg_destroy but not all + // the resources are destroyed before crash, then when recovery, log replay will hit those destroyed resources + // and cause error. for example, we have a put_blob log after cp_lsn and before dc_lsn, and the pg_index_table + // is destory but before pg_super_blk is destroyed, crash happens. when recovery, log replay will hit the + // put_blob log and try to write to the index table, but since the index table is destroyed, it will cause + // error. but actually, this is not a problem. since before we starting pg_destroy in baseline resync, + // m_rd_sb->last_snapshot_lsn will be persisted upto the snapshot.get_last_log_idx(). then all the log less than + // or equal to m_rd_sb->last_snapshot_lsn will not be replayed or committed after recovery. so, the concern is + // not valid. pls refer to raft_repl_dev#need_skip_processing for more details. + home_object_->destroy_pg_resource(pg_id); + return; + } + + const auto& shards_in_pg = hs_pg->shards_; auto chunk_selector = home_object_->chunk_selector(); for (const auto& shard_iter : shards_in_pg) { diff --git a/src/lib/homestore_backend/tests/homeobj_fixture.hpp b/src/lib/homestore_backend/tests/homeobj_fixture.hpp index 499968ab3..06342b394 100644 --- a/src/lib/homestore_backend/tests/homeobj_fixture.hpp +++ b/src/lib/homestore_backend/tests/homeobj_fixture.hpp @@ -49,7 +49,7 @@ class HomeObjectFixture : public ::testing::Test { HSHomeObject::_hs_chunk_size = SISL_OPTIONS["chunk_size"].as< uint64_t >() * Mi; _obj_inst = std::dynamic_pointer_cast< HSHomeObject >(g_helper->build_new_homeobject()); - + // Used to export metrics, it should be called after init_homeobject if (SISL_OPTIONS["enable_http"].as< bool >()) { g_helper->app->start_http_server(); } if (!g_helper->is_current_testcase_restarted()) { diff --git a/src/lib/homestore_backend/tests/hs_pg_tests.cpp b/src/lib/homestore_backend/tests/hs_pg_tests.cpp index 85bbfcd38..be0a7ff21 100644 --- a/src/lib/homestore_backend/tests/hs_pg_tests.cpp +++ b/src/lib/homestore_backend/tests/hs_pg_tests.cpp @@ -517,4 +517,42 @@ TEST_F(HomeObjectFixture, PGRefreshStatisticsTest) { EXPECT_EQ(pg_stats_restart.num_tombstone_objects, num_tombstones) << "Tombstone blob count should be preserved after restart"; EXPECT_EQ(pg_stats_restart.used_bytes, used_bytes_after) << "Used bytes should be preserved after restart"; +} + +TEST_F(HomeObjectFixture, IsPgAliveTest) { + LOGINFO("HomeObject replica={} setup completed", g_helper->replica_num()); + + pg_id_t pg_id{1}; + create_pg(pg_id); + + EXPECT_TRUE(_obj_inst->is_pg_alive(pg_id)); + EXPECT_FALSE(_obj_inst->is_pg_alive(999)); + + _obj_inst->mark_pg_destroyed(pg_id); + EXPECT_FALSE(_obj_inst->is_pg_alive(pg_id)); +} + +// Simulate a crash that occurs after pg is marked destroyed but before destroy_pg_resource completes. +// On restart, on_log_replay_done should detect the DESTROYED state and call destroy_pg_resource to +// finish cleaning up stale resources. +TEST_F(HomeObjectFixture, DestroyedPgRecoveryTest) { + LOGINFO("HomeObject replica={} setup completed", g_helper->replica_num()); + + pg_id_t pg_id{1}; + create_pg(pg_id); + auto shard_info = create_shard(pg_id, 64 * Mi, "shard meta"); + auto shard_id = shard_info.id; + + auto hs_pg = _obj_inst->get_hs_pg(pg_id); + ASSERT_NE(hs_pg, nullptr); + auto index_table_uuid_str = boost::uuids::to_string(hs_pg->index_table_->uuid()); + + // Simulate crash: persist DESTROYED state without cleaning up resources + _obj_inst->mark_pg_destroyed(pg_id); + EXPECT_FALSE(_obj_inst->is_pg_alive(pg_id)); + + // on_log_replay_done detects DESTROYED state and calls destroy_pg_resource + restart(); + + verify_pg_destroy(pg_id, index_table_uuid_str, {shard_id}); } \ No newline at end of file diff --git a/src/lib/homestore_backend/tests/hs_shard_tests.cpp b/src/lib/homestore_backend/tests/hs_shard_tests.cpp index 8b68aee8d..def30e26f 100644 --- a/src/lib/homestore_backend/tests/hs_shard_tests.cpp +++ b/src/lib/homestore_backend/tests/hs_shard_tests.cpp @@ -122,7 +122,7 @@ TEST_F(HomeObjectFixture, ShardManagerRecovery) { create_pg(pg_id); // create one shard; - auto shard_info = create_shard(pg_id, Mi, "shard meta");; + auto shard_info = create_shard(pg_id, Mi, "shard meta"); auto shard_id = shard_info.id; EXPECT_EQ(ShardInfo::State::OPEN, shard_info.state); EXPECT_EQ(Mi, shard_info.total_capacity_bytes); @@ -162,7 +162,7 @@ TEST_F(HomeObjectFixture, ShardManagerRecovery) { EXPECT_EQ(1, pg_result->shard_sequence_num_); // re-create new shards on this pg works too even homeobject is restarted twice. - auto new_shard_info = create_shard(pg_id, Mi, "shard meta");; + auto new_shard_info = create_shard(pg_id, Mi, "shard meta"); EXPECT_NE(shard_id, new_shard_info.id); EXPECT_EQ(ShardInfo::State::OPEN, new_shard_info.state); @@ -177,7 +177,7 @@ TEST_F(HomeObjectFixture, SealedShardRecovery) { create_pg(pg_id); // create one shard and seal it. - auto shard_info = create_shard(pg_id, Mi, "shard meta");; + auto shard_info = create_shard(pg_id, Mi, "shard meta"); auto shard_id = shard_info.id; shard_info = seal_shard(shard_id); EXPECT_EQ(ShardInfo::State::SEALED, shard_info.state); @@ -282,7 +282,6 @@ TEST_F(HomeObjectFixture, CreateShardOnDiskLostMemeber) { ASSERT_TRUE(s.hasError()) << "degraded pg on error member should return seal shard fail, pg_id " << degrade_pg_id << "shard_id " << pg_shard_id_map[degrade_pg_id] << " replica number " << g_helper->replica_num(); - tid = generateRandomTraceId(); s = _obj_inst->shard_manager()->create_shard(degrade_pg_id, 64 * Mi, "shard meta", tid).get(); ASSERT_TRUE(s.hasError()) << "degraded pg on error member should return create shard fail, pg_id "