diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 319752482..07b4f6fc5 100755 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -129,7 +129,16 @@ else () if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g") elseif (CMAKE_BUILD_TYPE STREQUAL "Release") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2") + # -flto + MinGW gcc + statically-linked antlr4_static produces + # unresolved-reference errors at link time (LTO intermediate objects + # can't see the .a's vtable thunks). -march=native is also a poor + # default for CI binaries shipped to other machines. Keep both on + # Linux/macOS where the optimization actually pays off. + if (MINGW OR WIN32) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + else () + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto") + endif () elseif (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O2 -g") elseif (CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") diff --git a/cpp/pom.xml b/cpp/pom.xml index 5415212f0..153e75dc2 100644 --- a/cpp/pom.xml +++ b/cpp/pom.xml @@ -99,8 +99,8 @@ plugin's generate goal throw an NPE. --> - - + + diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt index 93342c113..895c1ddba 100644 --- a/cpp/src/CMakeLists.txt +++ b/cpp/src/CMakeLists.txt @@ -37,6 +37,9 @@ message("cmake using: ENABLE_LZOKAY=${ENABLE_LZOKAY}") option(ENABLE_ZLIB "Enable Zlib compression" ON) message("cmake using: ENABLE_ZLIB=${ENABLE_ZLIB}") +# ENABLE_SIMD is defined in the top-level CMakeLists.txt +message("cmake using: ENABLE_SIMD=${ENABLE_SIMD}") + message("Running in src directory") if (${COV_ENABLED}) add_compile_options(-fprofile-arcs -ftest-coverage) @@ -89,6 +92,13 @@ if (ENABLE_ANTLR4) message("Adding ANTLR4 include directory") endif() +if (ENABLE_SIMD) + add_definitions(-DENABLE_SIMD) + list(APPEND PROJECT_INCLUDE_DIR + ${CMAKE_SOURCE_DIR}/third_party/simde-0.8.4-rc3 + ) +endif() + include_directories(${PROJECT_INCLUDE_DIR}) # Mark every translation unit that is compiled into the tsfile library so that @@ -144,10 +154,17 @@ add_library(tsfile SHARED) if (${COV_ENABLED}) message("Enable code cov...") + # Apple clang ships coverage runtime via --coverage; libgcov isn't a + # standalone library on macOS. Use --coverage there. + if (APPLE) + set(COV_LINK_LIB --coverage) + else() + set(COV_LINK_LIB -lgcov) + endif() if (ENABLE_ANTLR4) - target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj -lgcov) + target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj ${COV_LINK_LIB}) else() - target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj -lgcov) + target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj ${COV_LINK_LIB}) endif() else() message("Disable code cov...") @@ -171,4 +188,4 @@ set_target_properties(tsfile PROPERTIES SOVERSION ${LIBTSFILE_SO_VERSION}) install(TARGETS tsfile RUNTIME DESTINATION ${LIBRARY_OUTPUT_PATH} LIBRARY DESTINATION ${LIBRARY_OUTPUT_PATH} - ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH}) \ No newline at end of file + ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH}) diff --git a/cpp/src/common/CMakeLists.txt b/cpp/src/common/CMakeLists.txt index 4406cb219..60e0fdccf 100644 --- a/cpp/src/common/CMakeLists.txt +++ b/cpp/src/common/CMakeLists.txt @@ -22,21 +22,15 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} common_SRC_LIST) aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/allocator common_allocator_SRC_LIST) aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/container common_container_SRC_LIST) aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/tsblock common_tsblock_SRC_LIST) -aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/mutex common_mutex_SRC_LIST) aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/datatype common_datatype_SRC_LIST) set(CMAKE_POSITION_INDEPENDENT_CODE ON) -add_library(common_obj OBJECT ${common_SRC_LIST} +add_library(common_obj OBJECT ${common_SRC_LIST} ${common_allocator_SRC_LIST} ${common_container_SRC_LIST} - ${common_tsblock_SRC_LIST} - ${common_mutex_SRC_LIST} + ${common_tsblock_SRC_LIST} ${common_datatype_SRC_LIST}) -if (ENABLE_ANTLR4) - target_compile_definitions(common_obj PRIVATE ENABLE_ANTLR4) -endif() - # install header files recursively file(GLOB_RECURSE HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/*.h") copy_to_dir(${HEADERS} "common_obj") \ No newline at end of file diff --git a/cpp/src/common/allocator/alloc_base.h b/cpp/src/common/allocator/alloc_base.h index c89aed077..dd2e0ab61 100644 --- a/cpp/src/common/allocator/alloc_base.h +++ b/cpp/src/common/allocator/alloc_base.h @@ -82,35 +82,43 @@ class ModStat { } void init(); void destroy(); - INLINE void update_alloc(AllocModID mid, int32_t size) { + INLINE void update_alloc(AllocModID mid, int64_t size) { #ifdef ENABLE_MEM_STAT ASSERT(mid < __LAST_MOD_ID); ATOMIC_FAA(get_item(mid), size); #endif } - void update_free(AllocModID mid, uint32_t size) { + void update_free(AllocModID mid, uint64_t size) { #ifdef ENABLE_MEM_STAT ASSERT(mid < __LAST_MOD_ID); - ATOMIC_FAA(get_item(mid), 0 - size); + ATOMIC_FAA(get_item(mid), -static_cast(size)); #endif } void print_stat(); + int64_t get_stat(int8_t mid) { +#ifdef ENABLE_MEM_STAT + if (stat_arr_ != NULL && mid < __LAST_MOD_ID) + return ATOMIC_FAA(get_item(mid), 0LL); +#endif + return 0; + } + #ifdef ENABLE_TEST - int32_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0); } + int64_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0LL); } #endif private: - INLINE int32_t* get_item(int8_t mid) { - return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int32_t))]); + INLINE int64_t* get_item(int8_t mid) { + return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int64_t))]); } private: static const int32_t ITEM_SIZE = CACHE_LINE_SIZE; static const int32_t ITEM_COUNT = __LAST_MOD_ID; - int32_t* stat_arr_; + int64_t* stat_arr_; - STATIC_ASSERT((ITEM_SIZE % sizeof(int32_t) == 0), ModStat_ITEM_SIZE_ERROR); + STATIC_ASSERT((ITEM_SIZE % sizeof(int64_t) == 0), ModStat_ITEM_SIZE_ERROR); }; /* base allocator */ diff --git a/cpp/src/common/allocator/byte_stream.h b/cpp/src/common/allocator/byte_stream.h index 435a1f6fd..ad8dbb90d 100644 --- a/cpp/src/common/allocator/byte_stream.h +++ b/cpp/src/common/allocator/byte_stream.h @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -33,51 +34,51 @@ namespace common { +// std::atomic as the actual storage so the MSVC fallback no longer needs +// `reinterpret_cast*>(T*)` — that cast is UB because the underlying +// object was never constructed as a std::atomic. When the caller asks for +// non-atomic mode we still go through the atomic interface but with +// memory_order_relaxed, which on x86/ARM compiles to a plain load/store. +// std::atomic is non-copyable, so neither is OptionalAtomic; existing +// callers either construct in place or use shallow_clone_from / store. template class OptionalAtomic { public: OptionalAtomic(T t, bool enable_atomic = false) : val_(t), enable_atomic_(enable_atomic) {} + OptionalAtomic(const OptionalAtomic&) = delete; + OptionalAtomic& operator=(const OptionalAtomic&) = delete; + OptionalAtomic(OptionalAtomic&&) = delete; + OptionalAtomic& operator=(OptionalAtomic&&) = delete; + FORCE_INLINE T load() const { - if (UNLIKELY(enable_atomic_)) { - return ATOMIC_LOAD(&val_); - } else { - return val_; - } + return val_.load(UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst + : std::memory_order_relaxed); } FORCE_INLINE void store(const T t) { - if (UNLIKELY(enable_atomic_)) { - ATOMIC_STORE(&val_, t); - } else { - val_ = t; - } + val_.store(t, UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst + : std::memory_order_relaxed); } FORCE_INLINE T atomic_faa(const T increment) { - if (UNLIKELY(enable_atomic_)) { - return ATOMIC_FAA(&val_, increment); - } else { - T old_val = val_; - val_ = val_ + increment; - return old_val; - } + return val_.fetch_add(increment, UNLIKELY(enable_atomic_) + ? std::memory_order_seq_cst + : std::memory_order_relaxed); } FORCE_INLINE T atomic_aaf(const T increment) { - if (UNLIKELY(enable_atomic_)) { - return ATOMIC_AAF(&val_, increment); - } else { - val_ = val_ + increment; - return val_; - } + return val_.fetch_add(increment, UNLIKELY(enable_atomic_) + ? std::memory_order_seq_cst + : std::memory_order_relaxed) + + increment; } FORCE_INLINE bool enable_atomic() const { return enable_atomic_; } private: - T val_; + std::atomic val_; bool enable_atomic_; }; @@ -231,6 +232,23 @@ FORCE_INLINE double bytes_to_double(uint8_t bytes[8]) { // TODO define a WrappedByteStream class +// Round n up to the next power of two (>=1). Used to normalize ByteStream +// page sizes so that `& page_mask_` is equivalent to `% page_size_`. +// Values above the largest power-of-two that fits in uint32_t are clamped to +// 0x80000000 — the previous `while (ps < n) ps <<= 1` would shift past 2^31 +// and overflow to 0, looping forever. +FORCE_INLINE uint32_t round_up_pow2(uint32_t n) { + if (n <= 1) return 1; + if (n > 0x80000000u) return 0x80000000u; + uint32_t v = n - 1; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return v + 1; +} + // auto extend buffer for serialization class ByteStream { private: @@ -253,6 +271,8 @@ class ByteStream { }; public: + static const uint32_t DEFAULT_PAGE_SIZE = 1024; + ByteStream(uint32_t page_size, AllocModID mid, bool enable_atomic = false, BaseAllocator& allocator = g_base_allocator) : allocator_(allocator), @@ -262,11 +282,16 @@ class ByteStream { total_size_(0, enable_atomic), read_pos_(0), marked_read_pos_(0), - page_size_(page_size), + // page_mask_ is used as a bitmask in the hot read/write paths + // (`x & page_mask_` instead of `x % page_size_`), which only + // matches modulo arithmetic when page_size_ is a power of two. + // Round up so callers passing non-power-of-2 sizes still get a + // correctly-sized page, at the cost of <2x memory in the worst + // case (e.g. 1000 → 1024). + page_size_(round_up_pow2(page_size)), + page_mask_(round_up_pow2(page_size) - 1), mid_(mid), - wrapped_page_(false, nullptr) { - // assert(page_size >= 16); // commented out by gxh on 2023.03.09 - } + wrapped_page_(false, nullptr) {} // for wrap plain buffer to ByteStream ByteStream(AllocModID mid = MOD_DEFAULT) @@ -278,6 +303,7 @@ class ByteStream { read_pos_(0), marked_read_pos_(0), page_size_(0), + page_mask_(0), mid_(mid), wrapped_page_(false, nullptr) {} @@ -290,7 +316,10 @@ class ByteStream { wrapped_page_.next_.store(nullptr); wrapped_page_.buf_ = (uint8_t*)buf; - page_size_ = buf_len; + // page_mask_ is used as a bitmask; only correct for power-of-2 + // page sizes (see ByteStream ctor comment). + page_size_ = round_up_pow2(static_cast(buf_len)); + page_mask_ = page_size_ - 1; head_.store(&wrapped_page_); tail_.store(&wrapped_page_); total_size_.store(buf_len); @@ -305,14 +334,14 @@ class ByteStream { void clear_wrapped_buf() { wrapped_page_.buf_ = nullptr; } /* ================ Part 1: basic ================ */ - FORCE_INLINE uint32_t remaining_size() const { + FORCE_INLINE uint64_t remaining_size() const { ASSERT(total_size_.load() >= read_pos_); return total_size_.load() - read_pos_; } FORCE_INLINE bool has_remaining() const { return remaining_size() > 0; } FORCE_INLINE void mark_read_pos() { marked_read_pos_ = read_pos_; } - FORCE_INLINE uint32_t get_mark_len() const { + FORCE_INLINE uint64_t get_mark_len() const { ASSERT(marked_read_pos_ <= read_pos_); return read_pos_ - marked_read_pos_; } @@ -339,30 +368,46 @@ class ByteStream { // never used TODO void shallow_clone_from(ByteStream& other) { this->page_size_ = other.page_size_; + this->page_mask_ = other.page_mask_; this->mid_ = other.mid_; this->head_.store(other.head_.load()); this->tail_.store(other.tail_.load()); this->total_size_.store(other.total_size_.load()); } - FORCE_INLINE uint32_t total_size() const { return total_size_.load(); } - FORCE_INLINE uint32_t read_pos() const { return read_pos_; }; + FORCE_INLINE uint64_t total_size() const { return total_size_.load(); } + FORCE_INLINE uint64_t read_pos() const { return read_pos_; }; + // Sum of bytes physically allocated for this stream's pages. For a + // wrapped stream this just reports total_size(); for an owning stream + // it counts page_size_ per backing page so callers doing memory-pressure + // accounting see the real footprint, not the few bytes that happen to + // have been written into the latest 64 KiB page. + FORCE_INLINE uint64_t allocated_bytes() const { + if (is_wrapped()) return total_size_.load(); + uint64_t total = 0; + Page* p = head_.load(); + while (p != nullptr) { + total += page_size_; + p = p->next_.load(); + } + return total; + } /** * Seek the read cursor to an absolute offset. Re-anchors read_page_ for * multi-page streams. */ - void set_read_pos(uint32_t pos) { + void set_read_pos(uint64_t pos) { ASSERT(pos <= total_size()); read_pos_ = pos; Page* p = head_.load(); - uint32_t skipped = 0; + uint64_t skipped = 0; while (p != nullptr && skipped + page_size_ <= pos) { skipped += page_size_; p = p->next_.load(); } read_page_ = p; } - FORCE_INLINE void wrapped_buf_advance_read_pos(uint32_t size) { + FORCE_INLINE void wrapped_buf_advance_read_pos(uint64_t size) { if (size + read_pos_ > total_size_.load()) { read_pos_ = total_size_.load(); } else { @@ -380,10 +425,10 @@ class ByteStream { std::cout << "write_buf error " << ret << std::endl; return ret; } - uint32_t remainder = page_size_ - (total_size_.load() % page_size_); + uint32_t remainder = page_size_ - (total_size_.load() & page_mask_); uint32_t copy_len = remainder < (len - write_len) ? remainder : (len - write_len); - memcpy(tail_.load()->buf_ + total_size_.load() % page_size_, + memcpy(tail_.load()->buf_ + (total_size_.load() & page_mask_), buf + write_len, copy_len); total_size_.atomic_aaf(copy_len); write_len += copy_len; @@ -404,11 +449,11 @@ class ByteStream { if (RET_FAIL(check_space())) { return ret; } - uint32_t remainder = page_size_ - (read_pos_ % page_size_); + uint32_t remainder = page_size_ - (read_pos_ & page_mask_); uint32_t copy_len = remainder < want_len_limited - read_len ? remainder : want_len_limited - read_len; - memcpy(buf + read_len, read_page_->buf_ + (read_pos_ % page_size_), + memcpy(buf + read_len, read_page_->buf_ + (read_pos_ & page_mask_), copy_len); read_len += copy_len; read_pos_ += copy_len; @@ -460,16 +505,17 @@ class ByteStream { return b; } b.buf_ = - (char*)(tail_.load()->buf_ + (total_size_.load() % page_size_)); - b.len_ = page_size_ - (total_size_.load() % page_size_); + (char*)(tail_.load()->buf_ + (total_size_.load() & page_mask_)); + b.len_ = page_size_ - (total_size_.load() & page_mask_); return b; } void buffer_used(uint32_t used_bytes) { ASSERT(used_bytes >= 1); // would not span page - ASSERT((total_size_.load() / page_size_) == - ((total_size_.load() + used_bytes - 1) / page_size_)); + ASSERT(page_size_ == 0 || + (total_size_.load() / page_size_) == + ((total_size_.load() + used_bytes - 1) / page_size_)); total_size_.atomic_aaf(used_bytes); } @@ -485,7 +531,7 @@ class ByteStream { if (RET_FAIL(prepare_space())) { return ret; } - uint32_t remainder = page_size_ - (total_size_.load() % page_size_); + uint32_t remainder = page_size_ - (total_size_.load() & page_mask_); uint32_t step = remainder < (len - advanced) ? remainder : (len - advanced); total_size_.atomic_aaf(step); @@ -504,6 +550,7 @@ class ByteStream { Page* cur_; Page* end_; int64_t total_size_; + int64_t consumed_ = 0; BufferIterator(const ByteStream& bs) : host_(bs) { cur_ = bs.head_.load(); end_ = bs.tail_.load(); @@ -514,13 +561,17 @@ class ByteStream { Buffer b; if (cur_ != nullptr) { b.buf_ = (char*)cur_->buf_; - if (cur_ == end_ && - host_.total_size_.load() % host_.page_size_ != 0) { - b.len_ = host_.total_size_.load() % host_.page_size_; + if (cur_ == end_) { + // Last page: clamp to remaining total_size_. For wrapped + // streams page_size_ may have been rounded up past the + // user buffer (see wrap_from), so we must not return + // page_size_ as the length here. + b.len_ = static_cast(total_size_ - consumed_); } else { b.len_ = host_.page_size_; } ASSERT(b.len_ > 0); + consumed_ += b.len_; cur_ = cur_->next_.load(); } return b; @@ -566,7 +617,7 @@ class ByteStream { // get tail position atomically Page* host_end = nullptr; - uint32_t host_total_size = 0; + uint64_t host_total_size = 0; while (true) { host_end = host_.tail_.load(); host_total_size = host_.total_size_.load(); @@ -577,7 +628,7 @@ class ByteStream { while (true) { if (cur_ == host_end) { - if (host_total_size % host_.page_size_ == 0) { + if ((host_total_size & host_.page_mask_) == 0) { if (read_offset_within_cur_page_ == host_.page_size_) { return b; } else { @@ -591,15 +642,15 @@ class ByteStream { } } else { if (read_offset_within_cur_page_ == - (host_total_size % host_.page_size_)) { + (host_total_size & host_.page_mask_)) { return b; } else { b.buf_ = ((char*)(cur_->buf_)) + read_offset_within_cur_page_; - b.len_ = (host_total_size % host_.page_size_) - + b.len_ = (host_total_size & host_.page_mask_) - read_offset_within_cur_page_; read_offset_within_cur_page_ = - (host_total_size % host_.page_size_); + (host_total_size & host_.page_mask_); total_end_offset_ += b.len_; return b; } @@ -629,7 +680,7 @@ class ByteStream { FORCE_INLINE int prepare_space() { int ret = common::E_OK; if (UNLIKELY(tail_.load() == nullptr || - total_size_.load() % page_size_ == 0)) { + (total_size_.load() & page_mask_) == 0)) { Page* p = nullptr; if (RET_FAIL(alloc_page(p))) { return ret; @@ -646,7 +697,7 @@ class ByteStream { } if (UNLIKELY(read_page_ == nullptr)) { read_page_ = head_.load(); - } else if (UNLIKELY(read_pos_ % page_size_ == 0)) { + } else if (UNLIKELY((read_pos_ & page_mask_) == 0)) { read_page_ = read_page_->next_.load(); } if (UNLIKELY(read_page_ == nullptr)) { @@ -682,10 +733,14 @@ class ByteStream { OptionalAtomic head_; OptionalAtomic tail_; Page* read_page_; // only one thread is allow to reader this ByteStream - OptionalAtomic total_size_; // total size in byte - uint32_t read_pos_; // current reader position - uint32_t marked_read_pos_; // current reader position + OptionalAtomic total_size_; // total size in byte + // 64-bit so streams that legitimately grow past 4 GiB don't truncate + // the read cursor (e.g. concatenated chunk buffers in the writer's + // write_stream_ before the next flush). + uint64_t read_pos_; // current reader position + uint64_t marked_read_pos_; // current reader position uint32_t page_size_; + uint32_t page_mask_; // page_size_ - 1, for bitwise AND instead of modulo AllocModID mid_; public: @@ -1185,6 +1240,7 @@ class SerializationUtil { // indicates that memory has been allocated and must be freed. FORCE_INLINE static int read_var_char_ptr(std::string*& str, ByteStream& in) { + str = nullptr; int ret = common::E_OK; int32_t len = 0; int32_t read_len = 0; @@ -1192,7 +1248,6 @@ class SerializationUtil { return ret; } else { if (len == storage::NO_STR_TO_READ) { - str = nullptr; return ret; } else { char* tmp_buf = diff --git a/cpp/src/common/allocator/mem_alloc.cc b/cpp/src/common/allocator/mem_alloc.cc index 524287e75..b7c5c09c1 100644 --- a/cpp/src/common/allocator/mem_alloc.cc +++ b/cpp/src/common/allocator/mem_alloc.cc @@ -95,7 +95,7 @@ void* mem_alloc(uint32_t size, AllocModID mid) { auto high4b = static_cast(header >> 32); *reinterpret_cast(raw) = high4b; *reinterpret_cast(raw + 4) = low4b; - ModStat::get_instance().update_alloc(mid, static_cast(size)); + ModStat::get_instance().update_alloc(mid, static_cast(size)); return raw + header_size; } @@ -158,7 +158,7 @@ void* mem_realloc(void* ptr, uint32_t size) { *reinterpret_cast(p) = high4b; *reinterpret_cast(p + 4) = low4b; ModStat::get_instance().update_alloc( - mid, int32_t(size) - int32_t(original_size)); + mid, int64_t(size) - int64_t(original_size)); return p + ALIGNMENT; } @@ -166,9 +166,9 @@ void ModStat::init() { if (stat_arr_ != NULL) { return; } - stat_arr_ = (int32_t*)(::malloc(ITEM_SIZE * ITEM_COUNT)); + stat_arr_ = (int64_t*)(::malloc(ITEM_SIZE * ITEM_COUNT)); for (int8_t i = 0; i < __LAST_MOD_ID; i++) { - int32_t* item = get_item(i); + int64_t* item = get_item(i); *item = 0; } } @@ -183,14 +183,14 @@ void ModStat::print_stat() { struct Entry { const char* name; - int32_t val; + int64_t val; }; Entry entries[__LAST_MOD_ID]; int count = 0; int64_t total = 0; for (int i = 0; i < __LAST_MOD_ID; i++) { - int32_t val = ATOMIC_FAA(get_item(i), 0); + int64_t val = ATOMIC_FAA(get_item(i), 0LL); total += val; if (val != 0) { entries[count++] = {g_mod_names[i], val}; diff --git a/cpp/src/common/allocator/page_arena.h b/cpp/src/common/allocator/page_arena.h index 9b8ce5ef6..c0dfbebb9 100644 --- a/cpp/src/common/allocator/page_arena.h +++ b/cpp/src/common/allocator/page_arena.h @@ -47,6 +47,19 @@ class PageArena { FORCE_INLINE void destroy() { reset(); } void reset(); + // Returns the number of bytes actually consumed across all pages. + // This is the precise M_meta size: metadata structs are not data-encoded, + // so arena used bytes == metadata memory exactly. + int64_t get_total_used_bytes() const { + int64_t total = 0; + Page* p = dummy_head_.next_; + while (p) { + total += p->cur_alloc_ - reinterpret_cast(p + 1); + p = p->next_; + } + return total; + } + #ifdef ENABLE_TEST int TEST_get_page_count() const { int count = 0; diff --git a/cpp/src/common/config/config.h b/cpp/src/common/config/config.h index e2b2039a7..5cf968688 100644 --- a/cpp/src/common/config/config.h +++ b/cpp/src/common/config/config.h @@ -36,7 +36,7 @@ typedef struct ConfigValue { TSEncoding time_encoding_type_; TSDataType time_data_type_; CompressionType time_compress_type_; - int32_t chunk_group_size_threshold_; + int64_t chunk_group_size_threshold_; int32_t record_count_for_next_mem_check_; bool encrypt_flag_ = false; TSEncoding boolean_encoding_type_; @@ -46,14 +46,21 @@ typedef struct ConfigValue { TSEncoding double_encoding_type_; TSEncoding string_encoding_type_; CompressionType default_compression_type_; + bool parallel_read_enabled_; bool parallel_write_enabled_; - int32_t write_thread_count_; - // When true, aligned writer enforces page size limit strictly by - // interleaving time/value writes and sealing pages together when any side - // becomes full. - // When false, aligned writer may disable some page-size checks to improve - // write performance. - bool strict_page_size_ = true; + // Size of the single global worker pool (common::g_thread_pool_) shared by + // the parallel write and parallel read paths. The pool is (re)created from + // this value in init_common(). Like sync_on_close_/encrypt_flag_ it keeps + // its in-class default rather than being reset by init_config_value(), so a + // set_thread_count() call made before libtsfile_init() actually sizes the + // pool instead of being clobbered by the init-time defaults. + int32_t thread_count_ = 6; + // Durability knob: when true (default), TsFileIOWriter::end_file() issues + // an fsync() before closing so that a process / OS crash cannot leave a + // partially-flushed file behind. Disabling this trades durability for + // throughput: writes return success as soon as data is in the page cache. + // Only set to false if the caller drives its own fsync policy. + bool sync_on_close_ = true; } ConfigValue; extern void init_config_value(); @@ -62,10 +69,14 @@ extern CompressionType get_default_compressor(); // In the future, configuration items need to be dynamically adjusted according // to the level extern void set_config_value(); -extern void config_set_page_max_point_count(uint32_t page_max_point_count); -extern void config_set_max_degree_of_index_node( +// Public config setters: validate at the entry point and return +// E_INVALID_ARG when the requested value is outside the supported range. +// On rejection the underlying field is left untouched so the writer keeps +// running with whatever value it had before — callers that don't check the +// return are no worse off than they were before validation existed. +extern int config_set_page_max_point_count(uint32_t page_max_point_count); +extern int config_set_max_degree_of_index_node( uint32_t max_degree_of_index_node); -extern void config_set_strict_page_size(bool strict_page_size); } // namespace common diff --git a/cpp/src/common/container/bit_map.cc b/cpp/src/common/container/bit_map.cc index 407605e56..3b1af6ab2 100644 --- a/cpp/src/common/container/bit_map.cc +++ b/cpp/src/common/container/bit_map.cc @@ -31,14 +31,15 @@ BitMap::~BitMap() { } } -int BitMap::init(uint32_t item_size, bool init_as_zero) { +int BitMap::init(uint32_t item_size, bool init_as_zero, AllocModID mod_id) { uint32_t size = (item_size + 7) / 8; - bitmap_ = static_cast(mem_alloc(size, MOD_TSBLOCK)); + bitmap_ = static_cast(mem_alloc(size, mod_id)); // need set to 0, otherwise there will be wrong data const char initial_char = init_as_zero ? 0x00 : 0xFF; memset(bitmap_, initial_char, size); size_ = size; init_as_zero_ = init_as_zero; + has_set_bits_ = !init_as_zero; return common::E_OK; } diff --git a/cpp/src/common/container/bit_map.h b/cpp/src/common/container/bit_map.h index 757ab1fb1..90ed0e0b6 100644 --- a/cpp/src/common/container/bit_map.h +++ b/cpp/src/common/container/bit_map.h @@ -25,16 +25,13 @@ #include #endif +#include "common/allocator/alloc_base.h" #include "utils/errno_define.h" #include "utils/util_define.h" namespace common { -// Cross-platform bit-twiddling helpers. GCC/Clang use their builtins; MSVC -// uses the equivalent intrinsics from ; any other compiler falls -// back to a portable loop. namespace bitops { -// Population count of an 8-bit value. FORCE_INLINE int popcount8(uint8_t v) { #if defined(__GNUC__) || defined(__clang__) return __builtin_popcount(v); @@ -49,7 +46,7 @@ FORCE_INLINE int popcount8(uint8_t v) { return c; #endif } -// Count trailing zero bits. The argument must be non-zero. + FORCE_INLINE int ctz_nonzero(uint32_t v) { #if defined(__GNUC__) || defined(__clang__) return __builtin_ctz(v); @@ -66,23 +63,13 @@ FORCE_INLINE int ctz_nonzero(uint32_t v) { return c; #endif } -// Count trailing zero bits of a 64-bit value. The argument must be non-zero. -FORCE_INLINE int ctz64_nonzero(uint64_t v) { + +FORCE_INLINE int ctz_nonzero(uint64_t v) { #if defined(__GNUC__) || defined(__clang__) return __builtin_ctzll(v); #elif defined(_MSC_VER) unsigned long idx; -#if defined(_M_X64) || defined(_M_ARM64) _BitScanForward64(&idx, v); -#else - // 32-bit MSVC has no _BitScanForward64. - if (static_cast(v) != 0) { - _BitScanForward(&idx, static_cast(v)); - } else { - _BitScanForward(&idx, static_cast(v >> 32)); - idx += 32; - } -#endif return static_cast(idx); #else int c = 0; @@ -97,13 +84,19 @@ FORCE_INLINE int ctz64_nonzero(uint64_t v) { class BitMap { public: - BitMap() : bitmap_(nullptr), size_(0), init_as_zero_(true) {} + BitMap() + : bitmap_(nullptr), + size_(0), + init_as_zero_(true), + has_set_bits_(false) {} ~BitMap(); - int init(uint32_t item_size, bool init_as_zero = true); + int init(uint32_t item_size, bool init_as_zero = true, + AllocModID mod_id = MOD_TSBLOCK); FORCE_INLINE void reset() { const char initial_char = init_as_zero_ ? 0x00 : 0xFF; memset(bitmap_, initial_char, size_); + has_set_bits_ = !init_as_zero_; } FORCE_INLINE void set(uint32_t index) { @@ -113,6 +106,7 @@ class BitMap { char* start_addr = bitmap_ + offset; uint8_t bit_mask = get_bit_mask(index); *start_addr = (*start_addr) | (bit_mask); + has_set_bits_ = true; } FORCE_INLINE void clear(uint32_t index) { @@ -124,7 +118,26 @@ class BitMap { *start_addr = (*start_addr) & (~bit_mask); } - FORCE_INLINE void clear_all() { memset(bitmap_, 0x00, size_); } + FORCE_INLINE void clear_all() { + memset(bitmap_, 0x00, size_); + has_set_bits_ = false; + } + + // Copy `bytes` of externally-owned bitmap data into this BitMap's buffer + // and keep has_set_bits_ in sync. Without this, callers that memcpy + // directly into get_bitmap() can leave the has_set_bits_ shortcut stale + // and downstream readers (may_have_set_bits()) will falsely treat the + // bitmap as empty. + FORCE_INLINE void copy_from(const char* src, uint32_t bytes) { + ASSERT(bytes <= size_); + memcpy(bitmap_, src, bytes); + // Conservative: assume the caller-provided bitmap can have set bits. + // We could scan to be precise, but the false-positive only costs a + // bit of per-cell testing in writers — never silent data loss. + if (bytes > 0) { + has_set_bits_ = true; + } + } FORCE_INLINE bool test(uint32_t index) { uint32_t offset = index >> 3; @@ -135,7 +148,6 @@ class BitMap { return (*start_addr & bit_mask); } - // Count the number of bits set to 1 (i.e., number of null entries). FORCE_INLINE uint32_t count_set_bits() const { uint32_t count = 0; const uint8_t* p = reinterpret_cast(bitmap_); @@ -145,26 +157,21 @@ class BitMap { return count; } - // Find the next set bit (null position) at or after @from, - // within [0, total_bits). Returns total_bits if none found. - // Skips zero bytes in bulk so cost is proportional to the number - // of null bytes, not total rows. FORCE_INLINE uint32_t next_set_bit(uint32_t from, uint32_t total_bits) const { if (from >= total_bits) return total_bits; const uint8_t* p = reinterpret_cast(bitmap_); uint32_t byte_idx = from >> 3; - // Check remaining bits in the first (partial) byte uint8_t byte_val = p[byte_idx] >> (from & 7); if (byte_val) { - return from + bitops::ctz_nonzero(byte_val); + return from + bitops::ctz_nonzero(static_cast(byte_val)); } - // Scan subsequent full bytes, skipping zeros const uint32_t byte_end = (total_bits + 7) >> 3; for (++byte_idx; byte_idx < byte_end; ++byte_idx) { if (p[byte_idx]) { uint32_t pos = - (byte_idx << 3) + bitops::ctz_nonzero(p[byte_idx]); + (byte_idx << 3) + + bitops::ctz_nonzero(static_cast(p[byte_idx])); return pos < total_bits ? pos : total_bits; } } @@ -175,6 +182,10 @@ class BitMap { FORCE_INLINE char* get_bitmap() { return bitmap_; } + // Fast check: returns false only when guaranteed no bits are set. + // May return true even when no bits are actually set (conservative). + FORCE_INLINE bool may_have_set_bits() const { return has_set_bits_; } + private: FORCE_INLINE uint8_t get_bit_mask(uint32_t index) { return 1 << (index & 7); @@ -184,6 +195,7 @@ class BitMap { char* bitmap_; uint32_t size_; bool init_as_zero_; + bool has_set_bits_; }; } // namespace common diff --git a/cpp/src/common/container/byte_buffer.h b/cpp/src/common/container/byte_buffer.h index 88006dac6..4e2dfab15 100644 --- a/cpp/src/common/container/byte_buffer.h +++ b/cpp/src/common/container/byte_buffer.h @@ -107,11 +107,11 @@ class ByteBuffer { // for variable len value FORCE_INLINE char* read(uint32_t offset, uint32_t* len) { + ASSERT(offset + variable_type_len_ <= real_data_size_); uint32_t tmp; - // Directly memcpy to avoid potential alignment issues when casting - // int32_t array pointer std::memcpy(&tmp, data_ + offset, sizeof(tmp)); *len = tmp; + ASSERT(offset + variable_type_len_ + *len <= real_data_size_); char* p = &data_[offset + variable_type_len_]; return p; } @@ -128,4 +128,4 @@ class ByteBuffer { }; } // namespace common -#endif // COMMON_CONTAINER_BYTE_BUFFER_H \ No newline at end of file +#endif // COMMON_CONTAINER_BYTE_BUFFER_H diff --git a/cpp/src/common/device_id.cc b/cpp/src/common/device_id.cc index b35a8593f..e88cdac8a 100644 --- a/cpp/src/common/device_id.cc +++ b/cpp/src/common/device_id.cc @@ -144,7 +144,7 @@ int StringArrayDeviceID::deserialize(common::ByteStream& read_stream) { segments_.clear(); for (uint32_t i = 0; i < num_segments; ++i) { - std::string* segment; + std::string* segment = nullptr; if (RET_FAIL(common::SerializationUtil::read_var_char_ptr( segment, read_stream))) { delete segment; diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc index b49b55657..cc6c5117f 100644 --- a/cpp/src/common/global.cc +++ b/cpp/src/common/global.cc @@ -19,31 +19,31 @@ #include "global.h" +#ifdef ENABLE_THREADS +#include "common/thread_pool.h" +#endif + #ifndef _WIN32 #include +#include // strncasecmp #endif #include +#include // strlen -#include - -#ifdef ENABLE_THREADS -#include "common/thread_pool.h" -#endif #include "utils/injection.h" -#include "utils/util_define.h" // strncasecmp and other platform-compat shims +#include "utils/util_define.h" // strncasecmp -> _strnicmp shim on Windows namespace common { ColumnSchema g_time_column_schema; +ConfigValue g_config_value_; #ifdef ENABLE_THREADS -ThreadPool* g_write_thread_pool_ = nullptr; +ThreadPool* g_thread_pool_ = nullptr; #endif -ConfigValue g_config_value_; void init_config_value() { - g_config_value_.tsblock_mem_inc_step_size_ = 8000; // 8k - g_config_value_.tsblock_max_memory_ = 64000; // 64k - // g_config_value_.tsblock_max_memory_ = 32; + g_config_value_.tsblock_mem_inc_step_size_ = 8000; // 8k + g_config_value_.tsblock_max_memory_ = 2 * 1024 * 1024; // 2 MB g_config_value_.page_writer_max_point_num_ = 10000; g_config_value_.page_writer_max_memory_bytes_ = 128 * 1024; // 128 k g_config_value_.max_degree_of_index_node_ = 256; @@ -64,19 +64,21 @@ void init_config_value() { g_config_value_.float_encoding_type_ = GORILLA; g_config_value_.double_encoding_type_ = GORILLA; g_config_value_.string_encoding_type_ = PLAIN; - // Default compression type is LZ4 -#ifdef ENABLE_LZ4 + // Pick the strongest compressor that was actually compiled in. Gating on + // ENABLE_LZ4 while setting SNAPPY (the original code) would request a + // compressor that the factory can't produce when the build disables + // Snappy, returning nullptr at write time. +#ifdef ENABLE_SNAPPY + g_config_value_.default_compression_type_ = SNAPPY; +#elif defined(ENABLE_LZ4) g_config_value_.default_compression_type_ = LZ4; #else g_config_value_.default_compression_type_ = UNCOMPRESSED; #endif - unsigned int hw_cores = std::thread::hardware_concurrency(); - if (hw_cores == 0) hw_cores = 1; // fallback if detection fails - g_config_value_.parallel_write_enabled_ = (hw_cores > 1); - g_config_value_.write_thread_count_ = - static_cast(std::min(hw_cores, 64u)); - // Enforce aligned page size limits strictly by default. - g_config_value_.strict_page_size_ = true; + g_config_value_.parallel_read_enabled_ = true; + g_config_value_.parallel_write_enabled_ = true; + // thread_count_ keeps its in-class default (see config.h) so a + // set_thread_count() before libtsfile_init() is not reset here. } extern TSEncoding get_value_encoder(TSDataType data_type) { @@ -113,16 +115,20 @@ extern CompressionType get_default_compressor() { return g_config_value_.default_compression_type_; } -void config_set_page_max_point_count(uint32_t page_max_point_count) { +int config_set_page_max_point_count(uint32_t page_max_point_count) { + if (page_max_point_count == 0) { + return E_INVALID_ARG; + } g_config_value_.page_writer_max_point_num_ = page_max_point_count; + return E_OK; } -void config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) { +int config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) { + if (max_degree_of_index_node < 2u) { + return E_INVALID_ARG; + } g_config_value_.max_degree_of_index_node_ = max_degree_of_index_node; -} - -void config_set_strict_page_size(bool strict_page_size) { - g_config_value_.strict_page_size_ = strict_page_size; + return E_OK; } void set_config_value() {} @@ -145,17 +151,35 @@ int init_common() { g_time_column_schema.compression_ = UNCOMPRESSED; g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME; #ifdef ENABLE_THREADS - // (Re)create the global write thread pool with the configured size. - delete g_write_thread_pool_; - size_t pool_size = - g_config_value_.write_thread_count_ > 0 - ? static_cast(g_config_value_.write_thread_count_) - : size_t{1}; - g_write_thread_pool_ = new ThreadPool(pool_size); + // (Re)create the single global worker pool with the configured size. All + // parallel write/read paths submit here; torn down in libtsfile_destroy(). + delete g_thread_pool_; + size_t pool_size = g_config_value_.thread_count_ > 0 + ? static_cast(g_config_value_.thread_count_) + : size_t{1}; + g_thread_pool_ = new ThreadPool(pool_size); #endif return ret; } +int set_thread_count(int32_t count) { + if (count < 1 || count > 64) return E_INVALID_ARG; + g_config_value_.thread_count_ = count; +#ifdef ENABLE_THREADS + // If the global pool already exists (libtsfile_init has run) rebuild it at + // the new size so the change takes effect immediately instead of only at + // the next libtsfile_init(). This joins all current workers and recreates + // them, so the caller must ensure no read/write is concurrently using the + // pool — intended for setup / benchmark reconfiguration, not mid-operation + // resizing. + if (g_thread_pool_ != nullptr) { + delete g_thread_pool_; + g_thread_pool_ = new ThreadPool(static_cast(count)); + } +#endif + return E_OK; +} + bool is_timestamp_column_name(const char* time_col_name) { // both "time" and "timestamp" refer to timestamp column. int32_t len = strlen(time_col_name); diff --git a/cpp/src/common/global.h b/cpp/src/common/global.h index 5bee0fa60..ae04c6afa 100644 --- a/cpp/src/common/global.h +++ b/cpp/src/common/global.h @@ -29,6 +29,15 @@ namespace common { extern TSFILE_API ConfigValue g_config_value_; extern TSFILE_API ColumnSchema g_time_column_schema; +#ifdef ENABLE_THREADS +class ThreadPool; +// The single process-wide worker pool shared by every parallel code path +// (write column encoding, read column decoding). Created in init_common() +// and torn down in libtsfile_destroy(); null until libtsfile_init() runs, so +// every caller must fall back to the serial path when it is null. +extern TSFILE_API ThreadPool* g_thread_pool_; +#endif + FORCE_INLINE int set_global_time_data_type(uint8_t data_type) { ASSERT(data_type >= BOOLEAN && data_type <= STRING); if (data_type != INT64) { @@ -163,29 +172,28 @@ FORCE_INLINE uint8_t get_global_compression() { return static_cast(g_config_value_.default_compression_type_); } +FORCE_INLINE void set_parallel_read_enabled(bool enabled) { + g_config_value_.parallel_read_enabled_ = enabled; +} + +FORCE_INLINE bool get_parallel_read_enabled() { + return g_config_value_.parallel_read_enabled_; +} + FORCE_INLINE void set_parallel_write_enabled(bool enabled) { g_config_value_.parallel_write_enabled_ = enabled; } FORCE_INLINE bool get_parallel_write_enabled() { - return g_config_value_.parallel_write_enabled_ && - g_config_value_.write_thread_count_ > 1; -} - -// Set the number of threads for parallel writes. Must be called before -// init_common() / libtsfile_init() — the global thread pool is created -// during initialization and is not resized at runtime. -FORCE_INLINE int set_write_thread_count(int32_t count) { - if (count < 1 || count > 64) return E_INVALID_ARG; - g_config_value_.write_thread_count_ = count; - return E_OK; + return g_config_value_.parallel_write_enabled_; } -#ifdef ENABLE_THREADS -class ThreadPool; -// Global write thread pool, created by init_common(). -extern ThreadPool* g_write_thread_pool_; -#endif +// Size of the single global worker pool. Rejects values outside [1, 64] with +// E_INVALID_ARG, leaving the field untouched. If the pool already exists +// (libtsfile_init has run) it is rebuilt at the new size immediately; the +// caller must ensure no read/write is concurrently using the pool. Defined in +// global.cc (needs the full ThreadPool type). +extern int set_thread_count(int32_t count); extern int init_common(); extern bool is_timestamp_column_name(const char* time_col_name); diff --git a/cpp/src/common/mutex/CMakeLists.txt b/cpp/src/common/mutex/CMakeLists.txt deleted file mode 100644 index e7ef66faa..000000000 --- a/cpp/src/common/mutex/CMakeLists.txt +++ /dev/null @@ -1,20 +0,0 @@ -#[[ -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - - https://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. -]] - - diff --git a/cpp/src/common/mutex/mutex.h b/cpp/src/common/mutex/mutex.h deleted file mode 100644 index b35d328de..000000000 --- a/cpp/src/common/mutex/mutex.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef COMMON_MUTEX_MUTEX_H -#define COMMON_MUTEX_MUTEX_H - -#include - -#include "utils/util_define.h" - -namespace common { - -// Thin wrapper over std::mutex. Implemented with the C++11 standard library -// (instead of pthreads directly) so it builds on every platform, including -// MSVC where pthreads is not available. -class Mutex { - public: - Mutex() {} - ~Mutex() {} - - void lock() { mutex_.lock(); } - - void unlock() { mutex_.unlock(); } - - bool try_lock() { return mutex_.try_lock(); } - - private: - std::mutex mutex_; -}; - -class MutexGuard { - public: - MutexGuard(Mutex& m) : m_(m) { m_.lock(); } - ~MutexGuard() { m_.unlock(); } - - private: - Mutex& m_; -}; - -} // end namespace common -#endif // COMMON_MUTEX_MUTEX_H diff --git a/cpp/src/common/path.cc b/cpp/src/common/path.cc deleted file mode 100644 index d70a9d6c6..000000000 --- a/cpp/src/common/path.cc +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "common/path.h" - -#include "common/constant/tsfile_constant.h" - -#ifdef ENABLE_ANTLR4 -#include "parser/path_nodes_generator.h" -#endif - -namespace storage { - -Path::Path() = default; - -Path::Path(std::string& device, std::string& measurement) - : measurement_(measurement), - device_id_(std::make_shared(device)) { - full_path_ = device + "." + measurement; -} - -Path::Path(const std::string& path_sc, bool if_split) { - if (!path_sc.empty()) { - if (!if_split) { - full_path_ = path_sc; - device_id_ = std::make_shared(path_sc); - } else { -#ifdef ENABLE_ANTLR4 - std::vector nodes = - PathNodesGenerator::invokeParser(path_sc); -#else - std::vector nodes = - IDeviceID::split_string(path_sc, '.'); -#endif - if (nodes.size() > 1) { - // Join nodes, then parse like write path / Java Path (not - // per-segment vector). - std::string device_joined; - for (size_t i = 0; i + 1 < nodes.size(); ++i) { - if (i > 0) { - device_joined += PATH_SEPARATOR_CHAR; - } - device_joined += nodes[i]; - } - device_id_ = - std::make_shared(device_joined); - measurement_ = nodes[nodes.size() - 1]; - full_path_ = device_id_->get_device_name() + "." + measurement_; - } else { - full_path_ = path_sc; - device_id_ = std::make_shared(); - measurement_ = path_sc; - } - } - } else { - full_path_ = ""; - device_id_ = std::make_shared(); - measurement_ = ""; - } -} - -} // namespace storage diff --git a/cpp/src/common/path.h b/cpp/src/common/path.h index 3896b2715..c176d93db 100644 --- a/cpp/src/common/path.h +++ b/cpp/src/common/path.h @@ -21,7 +21,12 @@ #include +#include "common/constant/tsfile_constant.h" #include "common/device_id.h" +#ifdef ENABLE_ANTLR4 +#include "parser/generated/PathParser.h" +#include "parser/path_nodes_generator.h" +#endif #include "utils/errno_define.h" namespace storage { @@ -31,9 +36,57 @@ struct Path { std::shared_ptr device_id_; std::string full_path_; - Path(); - Path(std::string& device, std::string& measurement); - Path(const std::string& path_sc, bool if_split = true); + Path() {} + + Path(std::string& device, std::string& measurement) + : measurement_(measurement), + device_id_(std::make_shared(device)) { + full_path_ = device + "." + measurement; + } + + Path(const std::string& path_sc, bool if_split = true) { + if (!path_sc.empty()) { + if (!if_split) { + full_path_ = path_sc; + device_id_ = std::make_shared(path_sc); + } else { +#ifdef ENABLE_ANTLR4 + std::vector nodes = + PathNodesGenerator::invokeParser(path_sc); +#else + std::vector nodes = + IDeviceID::split_string(path_sc, '.'); +#endif + if (nodes.size() > 1) { + // Join nodes, then parse like write path / Java Path + // (route through the interpretive string ctor instead of + // the literal per-segment vector ctor, so a stored + // "root.sg.d1" device matches a query path + // "root.sg.d1.s1"). + std::string device_joined; + for (size_t i = 0; i + 1 < nodes.size(); ++i) { + if (i > 0) { + device_joined += PATH_SEPARATOR_CHAR; + } + device_joined += nodes[i]; + } + device_id_ = + std::make_shared(device_joined); + measurement_ = nodes[nodes.size() - 1]; + full_path_ = + device_id_->get_device_name() + "." + measurement_; + } else { + full_path_ = path_sc; + device_id_ = std::make_shared(); + measurement_ = path_sc; + } + } + } else { + full_path_ = ""; + device_id_ = std::make_shared(); + measurement_ = ""; + } + } bool operator==(const Path& path) { if (measurement_.compare(path.measurement_) == 0 && diff --git a/cpp/src/common/seq_tvlist.h b/cpp/src/common/seq_tvlist.h deleted file mode 100644 index 24805ac5d..000000000 --- a/cpp/src/common/seq_tvlist.h +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef COMMON_SEQ_TVLIST_H -#define COMMON_SEQ_TVLIST_H - -#include "common/allocator/alloc_base.h" -#include "common/allocator/page_arena.h" -#include "common/mutex/mutex.h" -#include "utils/db_utils.h" -#include "utils/errno_define.h" -#include "utils/storage_utils.h" -#include "utils/util_define.h" - -namespace storage { - -class SeqTVListBase { - public: - SeqTVListBase() - : data_type_(common::VECTOR), - mutex_(), - ref_count_(0), - primary_array_size_(0), - list_size_(0), - write_count_(0), - page_arena_(common::g_base_allocator), - use_page_arena_(false), - is_immutable_(false) {} - virtual ~SeqTVListBase() {} - virtual void destroy() {} - - FORCE_INLINE void ref() { ATOMIC_AAF(&ref_count_, 1); } - FORCE_INLINE bool unref() { return 0 == ATOMIC_AAF(&ref_count_, -1); } - - FORCE_INLINE void lock() { mutex_.lock(); } - FORCE_INLINE void unlock() { mutex_.unlock(); } - - int32_t get_total_count() const { return write_count_; } - common::TSDataType get_data_type() const { return data_type_; } - virtual TimeRange get_time_range() const = 0; - void mark_immutable() { is_immutable_ = true; } - bool is_immutable() const { return is_immutable_; } - - protected: - common::TSDataType data_type_; - mutable common::Mutex mutex_; - int32_t ref_count_; - int32_t primary_array_size_; - int32_t list_size_; - int32_t write_count_; - common::PageArena page_arena_; - bool use_page_arena_; - bool is_immutable_; -}; - -template -class SeqTVList : public SeqTVListBase { - public: - typedef struct TV { - int64_t time_; - Type value_; - } TV; - - struct Iterator { - SeqTVList* host_list_; - int32_t read_idx_; - int32_t end_idx_; - - Iterator() : host_list_(nullptr), read_idx_(UINT32_MAX), end_idx_(0) {} - - INLINE void init(SeqTVList* host, int32_t start_idx, int32_t end_idx) { - host_list_ = host; - read_idx_ = start_idx; - end_idx_ = end_idx; - } - - int next(TV& tv) { - if (read_idx_ >= end_idx_) { - return common::E_NO_MORE_DATA; - } - tv = host_list_->at(read_idx_); - read_idx_++; - return common::E_OK; - } - }; - - public: - SeqTVList() : tv_array_list_(nullptr), last_time_(-1) { - data_type_ = common::GetDataTypeFromTemplateType(); - } - virtual ~SeqTVList() {} - - int init(int32_t primary_array_size, int32_t max_count, - bool use_page_arena); - void destroy() OVERRIDE; - - int push(int64_t time, Type value); - int push_without_lock(int64_t time, Type value); - Iterator scan_without_lock(int64_t start_time, int64_t end_time); - Iterator scan_without_lock(); - - TimeRange get_time_range() const OVERRIDE { - TimeRange time_range; - common::MutexGuard mg(mutex_); - if (write_count_ > 0) { - time_range.start_time_ = time_at(0); - time_range.end_time_ = time_at(write_count_ - 1); - ASSERT(time_range.start_time_ <= time_range.end_time_); - } - return time_range; - } - - FORCE_INLINE TV at(int32_t tv_idx) const { - ASSERT(tv_idx < write_count_); - int32_t list_idx = tv_idx / primary_array_size_; - int32_t list_offset = tv_idx % primary_array_size_; - return tv_array_list_[list_idx][list_offset]; - } - - FORCE_INLINE int64_t time_at(int32_t tv_idx) const { - return at(tv_idx).time_; - } - -#ifdef ENABLE_TEST - int32_t TEST_binary_search_upper(int64_t time) { - return binary_search_upper(time); - } - int32_t TEST_binary_search_lower(int64_t time) { - return binary_search_lower(time); - } -#endif - - private: - FORCE_INLINE void* alloc(uint32_t size) { - if (use_page_arena_) { - return page_arena_.alloc(size); - } else { - return common::mem_alloc(size, common::MOD_TVLIST_DATA); - } - } - - // return the first tv which is larger or equal to @time - int32_t binary_search_upper(int64_t time); - // return the last tv which is less or equal to @time - int32_t binary_search_lower(int64_t time); - - private: - TV** tv_array_list_; - int64_t last_time_; -}; - -} // namespace storage - -#include "seq_tvlist.inc" - -#endif // COMMON_SEQ_TVLIST_H diff --git a/cpp/src/common/seq_tvlist.inc b/cpp/src/common/seq_tvlist.inc deleted file mode 100644 index c25e49f45..000000000 --- a/cpp/src/common/seq_tvlist.inc +++ /dev/null @@ -1,174 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * License); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -// #include "seq_tvlist.h" -#include -#include -#include -#include "common/mutex/mutex.h" -#include "common/logger/elog.h" - - -namespace storage -{ - -template -int SeqTVList::init(int32_t primary_array_size, - int32_t max_count, - bool use_page_arena) -{ - if (primary_array_size > max_count) { - //common:://log_err("TVList init error, primary_array_size=%u, max_count=%u", primary_array_size, max_count); - return common::E_INVALID_ARG; - } - use_page_arena_ = use_page_arena; - - primary_array_size_ = primary_array_size; - list_size_ = (max_count / primary_array_size_) + - (max_count % primary_array_size_ == 0 ? 0 : 1); - - int32_t alloc_size = sizeof(TV) * list_size_; - tv_array_list_ = (TV**)alloc(alloc_size); - if (tv_array_list_ == nullptr) { - return common::E_OOM; - } - memset(tv_array_list_, 0, alloc_size); - write_count_ = 0; - if (use_page_arena_) { - // TODO make it configurable - page_arena_.init(sizeof(TV) * primary_array_size_ * 4, common::MOD_TVLIST_OBJ); - } - return common::E_OK; -} - -template -int SeqTVList::push(int64_t time, Type value) -{ - common::MutexGuard mg(mutex_); - return push_without_lock(time, value); -}; - -template -int SeqTVList::push_without_lock(int64_t time, Type value) -{ - if (UNLIKELY(time <= last_time_)) { - return common::E_OUT_OF_ORDER; - } - if (UNLIKELY(write_count_ >= list_size_ * primary_array_size_)) { - return common::E_OVERFLOW; - } - - int32_t list_idx = write_count_ / primary_array_size_; - int32_t list_offset = write_count_ % primary_array_size_; - if (UNLIKELY(list_offset == 0)) { - ASSERT(tv_array_list_[list_idx] == nullptr); - tv_array_list_[list_idx] = static_cast(alloc(sizeof(TV) * primary_array_size_)); - if (UNLIKELY(tv_array_list_[list_idx] == nullptr)) { - return common::E_OOM; - } - } - - TV insert_tv; - insert_tv.time_ = time; - insert_tv.value_ = value; -#if STORAGE_ENGINE_DEBUG - std::cout << "tvlist[" << list_idx << "][" << list_offset << "] = (" << time << ", " << value << ")" << std::endl; -#endif - tv_array_list_[list_idx][list_offset] = insert_tv; - write_count_++; - last_time_ = time; - return common::E_OK; -}; - -template -void SeqTVList::destroy() -{ - if (use_page_arena_) { - page_arena_.destroy(); - } else { - int32_t list_size = write_count_ / primary_array_size_ - + (write_count_ % primary_array_size_ == 0 ? 0 : 1); - for (int i = 0; i < list_size; i++) { - common::mem_free(tv_array_list_[i]); - } - common::mem_free(tv_array_list_); - } -} - -template -typename SeqTVList::Iterator SeqTVList::scan_without_lock(int64_t start_time, int64_t end_time) -{ - ASSERT(start_time < end_time); - int32_t start_idx = binary_search_lower(start_time); - int32_t end_idx = binary_search_upper(end_time); - ASSERT(start_idx <= end_time + 1); - SeqTVList::Iterator iter; - iter.init(this, start_idx, end_idx); - return iter; -} - -template -typename SeqTVList::Iterator SeqTVList::scan_without_lock() -{ - SeqTVList::Iterator iter; - iter.init(this, 0, write_count_); - return iter; -} - -// return the first tv which is larger or equal to @time -template -int32_t SeqTVList::binary_search_lower(int64_t time) -{ - int32_t start = -1; - int32_t end = write_count_; - - // arr[start] < time <= arr[end] - while (start + 1 != end) { - int mid = (start + end) / 2; - int64_t mid_time = time_at(mid); - if (mid_time < time) { - start = mid; - } else { - end = mid; - } - } - return end; -} - -// return the last tv which is less or equal to @time -template -int32_t SeqTVList::binary_search_upper(int64_t time) -{ - int32_t start = 0; - int32_t end = write_count_; - - // arr[start] <= time < arr[end] - while (start + 1 != end) { - int mid = (start + end) / 2; - int64_t mid_time = time_at(mid); - if (mid_time <= time) { - start = mid; - } else { - end = mid; - } - } - return start; -} - -} // namespace storage - diff --git a/cpp/src/common/statistic.h b/cpp/src/common/statistic.h index bced66173..3d45b4f43 100644 --- a/cpp/src/common/statistic.h +++ b/cpp/src/common/statistic.h @@ -22,12 +22,18 @@ #include +#include #include #include "common/allocator/alloc_base.h" #include "common/allocator/byte_stream.h" #include "common/db_common.h" +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#define TSFILE_HAS_NEON 1 +#endif + namespace storage { /* @@ -176,6 +182,48 @@ class Statistic { } virtual FORCE_INLINE void update(int64_t time) { ASSERT(false); } + virtual void update_time_batch(const int64_t* timestamps, uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i]); + } + } + virtual void update_batch(const int64_t* timestamps, const bool* values, + uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual void update_batch(const int64_t* timestamps, const int32_t* values, + uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual void update_batch(const int64_t* timestamps, const int64_t* values, + uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual void update_batch(const int64_t* timestamps, const float* values, + uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual void update_batch(const int64_t* timestamps, const double* values, + uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual void update_batch(const int64_t* timestamps, + const common::String* values, uint32_t count) { + for (uint32_t i = 0; i < count; i++) { + update(timestamps[i], values[i]); + } + } + virtual int serialize_to(common::ByteStream& out) { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_var_uint(count_, out))) { @@ -554,17 +602,17 @@ class BooleanStatistic : public Statistic { last_value_ = that.last_value_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; sum_value_ = 0; first_value_ = false; last_value_ = false; } - FORCE_INLINE void update(int64_t time, bool value) { + FORCE_INLINE void update(int64_t time, bool value) override { BOOL_STAT_UPDATE(time, value); } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_ui8(first_value_ ? 1 : 0, out))) { @@ -575,7 +623,7 @@ class BooleanStatistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::read_ui8((uint8_t&)first_value_, in))) { @@ -587,13 +635,15 @@ class BooleanStatistic : public Statistic { return ret; } - FORCE_INLINE common::TSDataType get_type() { return common::BOOLEAN; } + FORCE_INLINE common::TSDataType get_type() override { + return common::BOOLEAN; + } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_BOOL_STAT_FROM(BooleanStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_BOOL_STAT_FROM(BooleanStatistic, stat); } }; @@ -625,7 +675,7 @@ class Int32Statistic : public Statistic { last_value_ = that.last_value_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; sum_value_ = 0; min_value_ = 0; @@ -634,13 +684,41 @@ class Int32Statistic : public Statistic { last_value_ = 0; } - FORCE_INLINE void update(int64_t time, int32_t value) { + FORCE_INLINE void update(int64_t time, int32_t value) override { NUM_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::INT32; } + void update_batch(const int64_t* timestamps, const int32_t* values, + uint32_t count) override { + if (count == 0) return; + uint32_t start = 0; + if (count_ == 0) { + start_time_ = timestamps[0]; + end_time_ = timestamps[0]; + first_value_ = values[0]; + last_value_ = values[0]; + min_value_ = values[0]; + max_value_ = values[0]; + sum_value_ = (int64_t)values[0]; + count_ = 1; + start = 1; + } + for (uint32_t i = start; i < count; i++) { + if (timestamps[i] < start_time_) start_time_ = timestamps[i]; + if (timestamps[i] > end_time_) end_time_ = timestamps[i]; + if (values[i] < min_value_) min_value_ = values[i]; + if (values[i] > max_value_) max_value_ = values[i]; + sum_value_ += (int64_t)values[i]; + } + last_value_ = values[count - 1]; + count_ += (count - start); + } + + FORCE_INLINE common::TSDataType get_type() override { + return common::INT32; + } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_ui32(min_value_, out))) { } else if (RET_FAIL(common::SerializationUtil::write_ui32(max_value_, @@ -654,7 +732,7 @@ class Int32Statistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::read_ui32((uint32_t&)min_value_, in))) { @@ -676,15 +754,15 @@ class Int32Statistic : public Statistic { // << std::endl; return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_NUM_STAT_FROM(Int32Statistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_NUM_STAT_FROM(Int32Statistic, stat); } - std::string to_string() const { + std::string to_string() const override { std::ostringstream oss; oss << "{count=" << count_ << ", start_time=" << start_time_ << ", end_time=" << end_time_ << ", first_val=" << first_value_ @@ -696,7 +774,7 @@ class Int32Statistic : public Statistic { }; class DateStatistic : public Int32Statistic { - FORCE_INLINE common::TSDataType get_type() { return common::DATE; } + FORCE_INLINE common::TSDataType get_type() override { return common::DATE; } }; class Int64Statistic : public Statistic { @@ -726,7 +804,7 @@ class Int64Statistic : public Statistic { last_value_ = that.last_value_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; sum_value_ = 0; min_value_ = 0; @@ -734,13 +812,69 @@ class Int64Statistic : public Statistic { first_value_ = 0; last_value_ = 0; } - FORCE_INLINE void update(int64_t time, int64_t value) { + FORCE_INLINE void update(int64_t time, int64_t value) override { NUM_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::INT64; } + void update_batch(const int64_t* timestamps, const int64_t* values, + uint32_t count) override { + if (count == 0) return; + uint32_t start = 0; + if (count_ == 0) { + start_time_ = timestamps[0]; + end_time_ = timestamps[0]; + first_value_ = values[0]; + last_value_ = values[0]; + min_value_ = values[0]; + max_value_ = values[0]; + sum_value_ = (double)values[0]; + count_ = 1; + start = 1; + } + // Timestamps are monotonic (verified by TimePageWriter), + // so only first/last matter for start_time_/end_time_. + if (count > start) { + if (timestamps[start] < start_time_) + start_time_ = timestamps[start]; + if (timestamps[count - 1] > end_time_) + end_time_ = timestamps[count - 1]; + } + uint32_t i = start; +#if TSFILE_HAS_NEON + { + int64x2_t vmin = vdupq_n_s64(min_value_); + int64x2_t vmax = vdupq_n_s64(max_value_); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= count; i += 2) { + int64x2_t v = vld1q_s64(&values[i]); + // min/max via compare+select (no vminq_s64 in NEON) + uint64x2_t lt = vcltq_s64(v, vmin); + vmin = vbslq_s64(lt, v, vmin); + uint64x2_t gt = vcgtq_s64(v, vmax); + vmax = vbslq_s64(gt, v, vmax); + vsum = vaddq_f64(vsum, vcvtq_f64_s64(v)); + } + min_value_ = + std::min(vgetq_lane_s64(vmin, 0), vgetq_lane_s64(vmin, 1)); + max_value_ = + std::max(vgetq_lane_s64(vmax, 0), vgetq_lane_s64(vmax, 1)); + sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1); + } +#endif + for (; i < count; i++) { + if (values[i] < min_value_) min_value_ = values[i]; + if (values[i] > max_value_) max_value_ = values[i]; + sum_value_ += (double)values[i]; + } + last_value_ = values[count - 1]; + count_ += (count - start); + } + + FORCE_INLINE common::TSDataType get_type() override { + return common::INT64; + } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_ui64(min_value_, out))) { } else if (RET_FAIL(common::SerializationUtil::write_ui64(max_value_, @@ -754,7 +888,7 @@ class Int64Statistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::read_ui64((uint64_t&)min_value_, in))) { @@ -769,15 +903,15 @@ class Int64Statistic : public Statistic { } return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_NUM_STAT_FROM(Int64Statistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_NUM_STAT_FROM(Int64Statistic, stat); } - std::string to_string() const { + std::string to_string() const override { std::ostringstream oss; oss << "{count=" << count_ << ", start_time=" << start_time_ << ", end_time=" << end_time_ << ", first_val=" << first_value_ @@ -815,7 +949,7 @@ class FloatStatistic : public Statistic { last_value_ = that.last_value_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; sum_value_ = 0; min_value_ = 0; @@ -823,13 +957,15 @@ class FloatStatistic : public Statistic { first_value_ = 0; last_value_ = 0; } - FORCE_INLINE void update(int64_t time, float value) { + FORCE_INLINE void update(int64_t time, float value) override { NUM_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::FLOAT; } + FORCE_INLINE common::TSDataType get_type() override { + return common::FLOAT; + } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_float(min_value_, out))) { } else if (RET_FAIL(common::SerializationUtil::write_float(max_value_, @@ -843,7 +979,7 @@ class FloatStatistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::read_float(min_value_, in))) { } else if (RET_FAIL( @@ -857,10 +993,10 @@ class FloatStatistic : public Statistic { } return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_NUM_STAT_FROM(FloatStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_NUM_STAT_FROM(FloatStatistic, stat); } }; @@ -892,7 +1028,7 @@ class DoubleStatistic : public Statistic { last_value_ = that.last_value_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; sum_value_ = 0; min_value_ = 0; @@ -900,13 +1036,64 @@ class DoubleStatistic : public Statistic { first_value_ = 0; last_value_ = 0; } - FORCE_INLINE void update(int64_t time, double value) { + FORCE_INLINE void update(int64_t time, double value) override { NUM_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::DOUBLE; } + void update_batch(const int64_t* timestamps, const double* values, + uint32_t count) override { + if (count == 0) return; + uint32_t start = 0; + if (count_ == 0) { + start_time_ = timestamps[0]; + end_time_ = timestamps[0]; + first_value_ = values[0]; + last_value_ = values[0]; + min_value_ = values[0]; + max_value_ = values[0]; + sum_value_ = values[0]; + count_ = 1; + start = 1; + } + if (count > start) { + if (timestamps[start] < start_time_) + start_time_ = timestamps[start]; + if (timestamps[count - 1] > end_time_) + end_time_ = timestamps[count - 1]; + } + uint32_t i = start; +#if TSFILE_HAS_NEON + { + float64x2_t vmin = vdupq_n_f64(min_value_); + float64x2_t vmax = vdupq_n_f64(max_value_); + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= count; i += 2) { + float64x2_t v = vld1q_f64(&values[i]); + vmin = vminq_f64(vmin, v); + vmax = vmaxq_f64(vmax, v); + vsum = vaddq_f64(vsum, v); + } + min_value_ = + std::min(vgetq_lane_f64(vmin, 0), vgetq_lane_f64(vmin, 1)); + max_value_ = + std::max(vgetq_lane_f64(vmax, 0), vgetq_lane_f64(vmax, 1)); + sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1); + } +#endif + for (; i < count; i++) { + if (values[i] < min_value_) min_value_ = values[i]; + if (values[i] > max_value_) max_value_ = values[i]; + sum_value_ += values[i]; + } + last_value_ = values[count - 1]; + count_ += (count - start); + } + + FORCE_INLINE common::TSDataType get_type() override { + return common::DOUBLE; + } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL( common::SerializationUtil::write_double(min_value_, out))) { @@ -921,7 +1108,7 @@ class DoubleStatistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::read_double(min_value_, in))) { } else if (RET_FAIL(common::SerializationUtil::read_double(max_value_, @@ -935,10 +1122,10 @@ class DoubleStatistic : public Statistic { } return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_NUM_STAT_FROM(DoubleStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_NUM_STAT_FROM(DoubleStatistic, stat); } }; @@ -960,30 +1147,50 @@ class TimeStatistic : public Statistic { end_time_ = that.end_time_; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; start_time_ = 0; end_time_ = 0; } - FORCE_INLINE void update(int64_t time) { + FORCE_INLINE void update(int64_t time) override { TIME_STAT_UPDATE((time)); count_++; } - FORCE_INLINE common::TSDataType get_type() { return common::VECTOR; } + void update_time_batch(const int64_t* timestamps, uint32_t count) override { + if (count == 0) return; + if (count_ == 0) { + start_time_ = timestamps[0]; + end_time_ = timestamps[0]; + } + // Timestamps are already verified monotonic in TimePageWriter, + // so first element is min candidate and last is max candidate. + if (timestamps[0] < start_time_) start_time_ = timestamps[0]; + if (timestamps[count - 1] > end_time_) + end_time_ = timestamps[count - 1]; + count_ += count; + } - int serialize_typed_stat(common::ByteStream& out) { return common::E_OK; } - int deserialize_typed_stat(common::ByteStream& in) { return common::E_OK; } - int merge_with(Statistic* stat) { + FORCE_INLINE common::TSDataType get_type() override { + return common::VECTOR; + } + + int serialize_typed_stat(common::ByteStream& out) override { + return common::E_OK; + } + int deserialize_typed_stat(common::ByteStream& in) override { + return common::E_OK; + } + int merge_with(Statistic* stat) override { MERGE_TIME_STAT_FROM(TimeStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_TIME_STAT_FROM(TimeStatistic, stat); } - std::string to_string() const { + std::string to_string() const override { std::ostringstream oss; oss << "{count=" << count_ << ", start_time=" << start_time_ << ", end_time=" << end_time_ << "}"; @@ -992,7 +1199,9 @@ class TimeStatistic : public Statistic { }; class TimestampStatistics : public Int64Statistic { - FORCE_INLINE common::TSDataType get_type() { return common::TIMESTAMP; } + FORCE_INLINE common::TSDataType get_type() override { + return common::TIMESTAMP; + } }; class StringStatistic : public Statistic { @@ -1002,35 +1211,24 @@ class StringStatistic : public Statistic { common::String first_value_; common::String last_value_; StringStatistic() - : min_value_(), - max_value_(), - first_value_(), - last_value_(), - pa_(nullptr), - owns_pa_(true) { + : min_value_(), max_value_(), first_value_(), last_value_() { pa_ = new common::PageArena(); pa_->init(512, common::MOD_STATISTIC_OBJ); } StringStatistic(common::PageArena* pa) - : min_value_(), - max_value_(), - first_value_(), - last_value_(), - pa_(pa), - owns_pa_(false) {} + : min_value_(), max_value_(), first_value_(), last_value_(), pa_(pa) {} ~StringStatistic() { destroy(); } - void destroy() { - if (owns_pa_ && pa_) { + void destroy() override { + if (pa_) { delete pa_; pa_ = nullptr; } - owns_pa_ = false; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; start_time_ = 0; end_time_ = 0; @@ -1050,13 +1248,15 @@ class StringStatistic : public Statistic { last_value_.dup_from(that.last_value_, *pa_); } - FORCE_INLINE void update(int64_t time, common::String value) { + FORCE_INLINE void update(int64_t time, common::String value) override { STRING_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::STRING; } + FORCE_INLINE common::TSDataType get_type() override { + return common::STRING; + } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) { } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_, @@ -1068,7 +1268,7 @@ class StringStatistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL( common::SerializationUtil::read_str(first_value_, pa_, in))) { @@ -1081,42 +1281,39 @@ class StringStatistic : public Statistic { } return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_STRING_STAT_FROM(StringStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_STRING_STAT_FROM(StringStatistic, stat); } private: common::PageArena* pa_; - bool owns_pa_; }; class TextStatistic : public Statistic { public: common::String first_value_; common::String last_value_; - TextStatistic() - : first_value_(), last_value_(), pa_(nullptr), owns_pa_(true) { + TextStatistic() : first_value_(), last_value_() { pa_ = new common::PageArena(); pa_->init(512, common::MOD_STATISTIC_OBJ); } TextStatistic(common::PageArena* pa) - : first_value_(), last_value_(), pa_(pa), owns_pa_(false) {} + : first_value_(), last_value_(), pa_(pa) {} ~TextStatistic() { destroy(); } - void destroy() { - if (owns_pa_ && pa_) { + void destroy() override { + if (pa_) { delete pa_; pa_ = nullptr; } - owns_pa_ = false; } - FORCE_INLINE void reset() { + FORCE_INLINE void reset() override { count_ = 0; start_time_ = 0; end_time_ = 0; @@ -1132,13 +1329,13 @@ class TextStatistic : public Statistic { last_value_.dup_from(that.last_value_, *pa_); } - FORCE_INLINE void update(int64_t time, common::String value) { + FORCE_INLINE void update(int64_t time, common::String value) override { TEXT_STAT_UPDATE(time, value); } - FORCE_INLINE common::TSDataType get_type() { return common::TEXT; } + FORCE_INLINE common::TSDataType get_type() override { return common::TEXT; } - int serialize_typed_stat(common::ByteStream& out) { + int serialize_typed_stat(common::ByteStream& out) override { int ret = common::E_OK; if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) { } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_, @@ -1146,7 +1343,7 @@ class TextStatistic : public Statistic { } return ret; } - int deserialize_typed_stat(common::ByteStream& in) { + int deserialize_typed_stat(common::ByteStream& in) override { int ret = common::E_OK; if (RET_FAIL( common::SerializationUtil::read_str(first_value_, pa_, in))) { @@ -1155,35 +1352,33 @@ class TextStatistic : public Statistic { } return ret; } - int merge_with(Statistic* stat) { + int merge_with(Statistic* stat) override { MERGE_TEXT_STAT_FROM(TextStatistic, stat); } - int deep_copy_from(Statistic* stat) { + int deep_copy_from(Statistic* stat) override { DEEP_COPY_TEXT_STAT_FROM(TextStatistic, stat); } private: common::PageArena* pa_; - bool owns_pa_; }; class BlobStatistic : public Statistic { public: - BlobStatistic() : pa_(nullptr), owns_pa_(true) { + BlobStatistic() { pa_ = new common::PageArena(); pa_->init(512, common::MOD_STATISTIC_OBJ); } - BlobStatistic(common::PageArena* pa) : pa_(pa), owns_pa_(false) {} + BlobStatistic(common::PageArena* pa) {} ~BlobStatistic() { destroy(); } void destroy() { - if (owns_pa_ && pa_) { + if (pa_) { delete pa_; pa_ = nullptr; } - owns_pa_ = false; } FORCE_INLINE void reset() { @@ -1214,7 +1409,6 @@ class BlobStatistic : public Statistic { private: common::PageArena* pa_; - bool owns_pa_; }; FORCE_INLINE uint32_t get_typed_statistic_sizeof(common::TSDataType type) { diff --git a/cpp/src/common/tablet.cc b/cpp/src/common/tablet.cc index b9ae5301a..ba37a3245 100644 --- a/cpp/src/common/tablet.cc +++ b/cpp/src/common/tablet.cc @@ -20,8 +20,10 @@ #include "tablet.h" #include +#include #include "allocator/alloc_base.h" +#include "container/bit_map.h" #include "datatype/date_converter.h" #include "utils/errno_define.h" @@ -98,14 +100,13 @@ int Tablet::init() { case BLOB: case TEXT: case STRING: { - auto* sc = static_cast(common::mem_alloc( - sizeof(StringColumn), common::MOD_TABLET)); - if (sc == nullptr) return E_OOM; - new (sc) StringColumn(); - // 8 bytes/row is a conservative initial estimate for short - // string columns (e.g. device IDs, tags). The buffer grows - // automatically on demand via mem_realloc. - sc->init(max_row_num_, max_row_num_ * 8); + void* mem = + common::mem_alloc(sizeof(StringColumn), common::MOD_TABLET); + if (mem == nullptr) { + return E_OOM; + } + auto* sc = new (mem) StringColumn(); + sc->init(max_row_num_, max_row_num_ * 32); value_matrix_[c].string_col = sc; break; } @@ -120,8 +121,9 @@ int Tablet::init() { if (bitmaps_ == nullptr) return E_OOM; for (size_t c = 0; c < schema_count; c++) { new (&bitmaps_[c]) BitMap(); - bitmaps_[c].init(max_row_num_, false); + bitmaps_[c].init(max_row_num_, false, common::MOD_TABLET); } + return E_OK; } @@ -156,6 +158,7 @@ void Tablet::destroy() { case TEXT: case STRING: value_matrix_[c].string_col->destroy(); + value_matrix_[c].string_col->~StringColumn(); common::mem_free(value_matrix_[c].string_col); break; default: @@ -192,9 +195,7 @@ int Tablet::add_timestamp(uint32_t row_index, int64_t timestamp) { } int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) { - if (err_code_ != E_OK) { - return err_code_; - } + if (err_code_ != E_OK) return err_code_; ASSERT(timestamps_ != NULL); if (UNLIKELY(count > static_cast(max_row_num_))) { return E_OUT_OF_RANGE; @@ -206,15 +207,10 @@ int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) { int Tablet::set_column_values(uint32_t schema_index, const void* data, const uint8_t* bitmap, uint32_t count) { - if (err_code_ != E_OK) { - return err_code_; - } - if (UNLIKELY(schema_index >= schema_vec_->size())) { - return E_OUT_OF_RANGE; - } - if (UNLIKELY(count > static_cast(max_row_num_))) { + if (err_code_ != E_OK) return err_code_; + if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE; + if (UNLIKELY(count > static_cast(max_row_num_))) return E_OUT_OF_RANGE; - } const MeasurementSchema& schema = schema_vec_->at(schema_index); size_t elem_size = 0; @@ -250,9 +246,13 @@ int Tablet::set_column_values(uint32_t schema_index, const void* data, if (bitmap == nullptr) { bitmaps_[schema_index].clear_all(); } else { - char* tsfile_bm = bitmaps_[schema_index].get_bitmap(); + // copy_from also refreshes has_set_bits_; a plain memcpy into + // get_bitmap() would leave the flag stale (e.g. cleared by a prior + // clear_all()) and downstream may_have_set_bits() checks would skip + // null-mask handling for the column. uint32_t bm_bytes = (count + 7) / 8; - std::memcpy(tsfile_bm, bitmap, bm_bytes); + bitmaps_[schema_index].copy_from(reinterpret_cast(bitmap), + bm_bytes); } cur_row_size_ = std::max(count, cur_row_size_); return E_OK; @@ -271,15 +271,36 @@ int Tablet::set_column_string_values(uint32_t schema_index, return E_OUT_OF_RANGE; } + // Reject non-string types: the union member is StringColumn*, but for + // numeric columns the same slot holds the numeric buffer pointer. + // Interpreting it as StringColumn* and writing into ->buffer/->offsets + // would corrupt the numeric buffer. + const TSDataType dt = schema_vec_->at(schema_index).data_type_; + if (dt != STRING && dt != TEXT && dt != BLOB) { + return E_TYPE_NOT_MATCH; + } StringColumn* sc = value_matrix_[schema_index].string_col; if (sc == nullptr) { return E_INVALID_ARG; } + // offsets is the Arrow-style "offsets" array (count + 1 entries). All + // downstream code assumes offsets[0] == 0, offsets are non-negative, + // and offsets[i] <= offsets[i+1]. Skipping these checks would let a + // caller pass e.g. {0, 10, 5} and trigger an unsigned underflow on + // (offsets[i+1] - offsets[i]) at serialize time, plus a wild memcpy. + if (UNLIKELY(offsets == nullptr)) return E_INVALID_ARG; + if (UNLIKELY(offsets[0] != 0)) return E_INVALID_ARG; + for (uint32_t i = 0; i < count; i++) { + if (UNLIKELY(offsets[i + 1] < offsets[i])) return E_INVALID_ARG; + } + if (UNLIKELY(offsets[count] < 0)) return E_INVALID_ARG; uint32_t total_bytes = static_cast(offsets[count]); if (total_bytes > sc->buf_capacity) { + char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes); + if (UNLIKELY(new_buf == nullptr)) return E_OOM; + sc->buffer = new_buf; sc->buf_capacity = total_bytes; - sc->buffer = (char*)mem_realloc(sc->buffer, sc->buf_capacity); } if (total_bytes > 0) { @@ -291,14 +312,74 @@ int Tablet::set_column_string_values(uint32_t schema_index, if (bitmap == nullptr) { bitmaps_[schema_index].clear_all(); } else { - char* tsfile_bm = bitmaps_[schema_index].get_bitmap(); uint32_t bm_bytes = (count + 7) / 8; - std::memcpy(tsfile_bm, bitmap, bm_bytes); + bitmaps_[schema_index].copy_from(reinterpret_cast(bitmap), + bm_bytes); + } + cur_row_size_ = std::max(count, cur_row_size_); + return E_OK; +} + +int Tablet::set_column_string_repeated(uint32_t schema_index, const char* str, + uint32_t str_len, uint32_t count) { + if (err_code_ != E_OK) return err_code_; + if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE; + if (UNLIKELY(count > static_cast(max_row_num_))) + return E_OUT_OF_RANGE; + + // See set_column_string_values: the union member is only valid as + // StringColumn* when the schema column is a variable-width type. + const TSDataType dt = schema_vec_->at(schema_index).data_type_; + if (dt != STRING && dt != TEXT && dt != BLOB) { + return E_TYPE_NOT_MATCH; + } + StringColumn* sc = value_matrix_[schema_index].string_col; + if (sc == nullptr) return E_INVALID_ARG; + + // str_len * count can overflow uint32_t; do the multiply in uint64_t and + // reject anything that wouldn't fit, otherwise the subsequent loop would + // walk past the truncated buf_capacity allocation. + uint64_t total_bytes_64 = + static_cast(str_len) * static_cast(count); + if (total_bytes_64 > std::numeric_limits::max()) { + return E_OVERFLOW; } + uint32_t total_bytes = static_cast(total_bytes_64); + if (total_bytes > sc->buf_capacity) { + char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes); + if (UNLIKELY(new_buf == nullptr)) return E_OOM; + sc->buffer = new_buf; + sc->buf_capacity = total_bytes; + } + + for (uint32_t i = 0; i < count; i++) { + sc->offsets[i] = i * str_len; + memcpy(sc->buffer + i * str_len, str, str_len); + } + sc->offsets[count] = total_bytes; + sc->buf_used = total_bytes; + + bitmaps_[schema_index].clear_all(); cur_row_size_ = std::max(count, cur_row_size_); return E_OK; } +void Tablet::reset(uint32_t row_count) { + ASSERT(row_count <= max_row_num_); + cur_row_size_ = row_count; + reset_string_columns(); + // Bitmaps init to all-null (bit=1); writes flip bits to mark non-null. + // Without resetting them here, a reused Tablet would inherit cleared + // bits from the previous batch, causing stale values to be reported as + // non-null and written out again. + if (bitmaps_ != nullptr) { + const size_t schema_count = schema_vec_->size(); + for (size_t c = 0; c < schema_count; c++) { + bitmaps_[c].reset(); + } + } +} + void* Tablet::get_value(int row_index, uint32_t schema_index, common::TSDataType& data_type) const { if (UNLIKELY(schema_index >= schema_vec_->size())) { @@ -505,31 +586,21 @@ void Tablet::reset_string_columns() { } } -// Find all row indices where the device ID changes. A device ID is the -// composite key formed by all id columns (e.g. region + sensor_id). Row i -// is a boundary when at least one id column differs between row i-1 and row i. -// -// Example (2 id columns: region, sensor_id): -// row 0: "A", "s1" -// row 1: "A", "s2" <- boundary: sensor_id changed -// row 2: "B", "s1" <- boundary: region changed -// row 3: "B", "s1" -// row 4: "B", "s2" <- boundary: sensor_id changed -// result: [1, 2, 4] -// -// Boundaries are computed in one shot at flush time rather than maintained -// incrementally during add_value / set_column_*. The total work is similar -// either way, but batch computation here is far more CPU-friendly: the inner -// loop is a tight memcmp scan over contiguous buffers with good cache -// locality, and the CPU can pipeline comparisons without the branch overhead -// and cache thrashing of per-row bookkeeping spread across the write path. std::vector Tablet::find_all_device_boundaries() const { const uint32_t row_count = get_cur_row_size(); if (row_count <= 1) return {}; + // Use uint64_t bitmap instead of vector for faster set/test/scan. const uint32_t nwords = (row_count + 63) / 64; std::vector boundary(nwords, 0); + // Walk id columns RIGHT to LEFT. In time-series tag systems the rightmost + // tags (sensor_id, metric_name, etc.) typically have the highest + // cardinality and change most often. By processing them first we mark most + // of the boundary bitmap early; subsequent (lower-cardinality) columns then + // short- circuit on `boundary[i] already set` for the bulk of their rows. + // Reverse order also lets us bail out of the entire scan as soon as every + // possible boundary is marked. uint32_t boundary_count = 0; const uint32_t max_boundaries = row_count - 1; for (auto it = id_column_indexes_.rbegin(); it != id_column_indexes_.rend(); @@ -537,43 +608,55 @@ std::vector Tablet::find_all_device_boundaries() const { const StringColumn& sc = *value_matrix_[*it].string_col; const int32_t* off = sc.offsets; const char* buf = sc.buffer; + common::BitMap& bitmap = const_cast(bitmaps_[*it]); for (uint32_t i = 1; i < row_count; i++) { - if (boundary[i >> 6] & (1ULL << (i & 63))) continue; + if (boundary[i >> 6] & (1ULL << (i & 63))) { + continue; + } + const bool prev_null = bitmap.test(i - 1); + const bool curr_null = bitmap.test(i); + if (prev_null != curr_null) { + boundary[i >> 6] |= (1ULL << (i & 63)); + if (++boundary_count >= max_boundaries) { + break; + } + continue; + } + if (prev_null) { + continue; + } + // Signed int32 widths so an offset-array corruption that would + // otherwise underflow to a huge unsigned value surfaces as + // len < 0 instead. memcmp's size_t param needs an explicit cast, + // guarded by `len_a > 0`. int32_t len_a = off[i] - off[i - 1]; int32_t len_b = off[i + 1] - off[i]; if (len_a != len_b || (len_a > 0 && memcmp(buf + off[i - 1], buf + off[i], - static_cast(len_a)) != 0)) { + static_cast(len_a)) != 0)) { boundary[i >> 6] |= (1ULL << (i & 63)); - if (++boundary_count >= max_boundaries) break; + if (++boundary_count >= max_boundaries) { + break; + } } } - if (boundary_count >= max_boundaries) break; - } - - // Sweep the bitmap word by word, extracting set bit positions in order. - // Each word covers 64 consecutive rows: word w covers rows [w*64, w*64+63]. - // - // For each word we use two standard bit tricks: - // __builtin_ctzll(bits) — count trailing zeros = index of lowest set bit - // bits &= bits - 1 — clear the lowest set bit - // - // Example: w=1, bits=0b...00010100 (bits 2 and 4 set) - // iter 1: ctzll=2 → idx=1*64+2=66, bits becomes 0b...00010000 - // iter 2: ctzll=4 → idx=1*64+4=68, bits becomes 0b...00000000 → exit - // - // Guards: idx>0 because row 0 can never be a boundary (no predecessor); - // idx= max_boundaries) { + break; + } + } + + // Collect boundary positions using bitscan std::vector result; for (uint32_t w = 0; w < nwords; w++) { uint64_t bits = boundary[w]; while (bits) { - uint32_t bit = bitops::ctz64_nonzero(bits); + uint32_t bit = + static_cast(common::bitops::ctz_nonzero(bits)); uint32_t idx = w * 64 + bit; if (idx > 0 && idx < row_count) { result.push_back(idx); } - bits &= bits - 1; + bits &= bits - 1; // clear lowest set bit } } return result; @@ -612,4 +695,4 @@ std::shared_ptr Tablet::get_device_id(int i) const { return res; } -} // end namespace storage \ No newline at end of file +} // end namespace storage diff --git a/cpp/src/common/tablet.h b/cpp/src/common/tablet.h index 799d6b7cc..76af3ac0e 100644 --- a/cpp/src/common/tablet.h +++ b/cpp/src/common/tablet.h @@ -22,7 +22,6 @@ #include #include -#include #include #include "common/config/config.h" @@ -47,7 +46,6 @@ class TabletColIterator; * with their associated metadata such as column names and types. */ class Tablet { - public: // Arrow-style string column: offsets + contiguous buffer. // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i] struct StringColumn { @@ -61,11 +59,10 @@ class Tablet { void init(uint32_t max_rows, uint32_t init_buf_capacity) { offsets = (int32_t*)common::mem_alloc( - sizeof(int32_t) * (max_rows + 1), common::MOD_DEFAULT); + sizeof(int32_t) * (max_rows + 1), common::MOD_TABLET); offsets[0] = 0; buf_capacity = init_buf_capacity; - buffer = - (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT); + buffer = (char*)common::mem_alloc(buf_capacity, common::MOD_TABLET); buf_used = 0; } @@ -98,14 +95,13 @@ class Tablet { return buffer + offsets[row]; } uint32_t get_len(uint32_t row) const { - return static_cast(offsets[row + 1] - offsets[row]); + return offsets[row + 1] - offsets[row]; } // Return a String view for a given row. The returned reference is // valid until the next call to get_string_view on this column. common::String& get_string_view(uint32_t row) { view_cache_.buf_ = buffer + offsets[row]; - view_cache_.len_ = - static_cast(offsets[row + 1] - offsets[row]); + view_cache_.len_ = offsets[row + 1] - offsets[row]; return view_cache_; } @@ -231,11 +227,14 @@ class Tablet { ~Tablet() { destroy(); } - // Tablet owns raw heap buffers (timestamps_, value_matrix_, bitmaps_) that - // destroy() frees. The implicitly generated copy operations would shallow- - // copy those pointers, causing double-free / use-after-free, so copying is - // disabled. Move transfers ownership and leaves the source empty (its - // pointers nulled) so the moved-from object destructs harmlessly. + // Tablet owns several heap buffers (timestamps_, value_matrix_ with its + // StringColumn::buffer/offsets, bitmaps_) that ~Tablet frees. The default + // copy ctor / copy-assign shallow-copies the raw pointers, so any copy + // path (e.g. `return tablet;` without NRVO under MSVC Debug) leaves the + // source Tablet's destructor freeing buffers the copy still points at, + // triggering heap-use-after-free in code like + // Tablet::find_all_device_boundaries. Make Tablet move-only with a + // pointer-stealing move ctor / move-assign so return-by-value is safe. Tablet(const Tablet&) = delete; Tablet& operator=(const Tablet&) = delete; @@ -250,10 +249,14 @@ class Tablet { value_matrix_(other.value_matrix_), bitmaps_(other.bitmaps_), column_categories_(std::move(other.column_categories_)), - id_column_indexes_(std::move(other.id_column_indexes_)) { + id_column_indexes_(std::move(other.id_column_indexes_)), + single_device_(other.single_device_) { other.timestamps_ = nullptr; other.value_matrix_ = nullptr; other.bitmaps_ = nullptr; + other.cur_row_size_ = 0; + // Leaving other.schema_vec_ moved-from is fine; destroy() only + // touches the heap buffers above, which we've now nulled out. } Tablet& operator=(Tablet&& other) noexcept { @@ -270,9 +273,11 @@ class Tablet { bitmaps_ = other.bitmaps_; column_categories_ = std::move(other.column_categories_); id_column_indexes_ = std::move(other.id_column_indexes_); + single_device_ = other.single_device_; other.timestamps_ = nullptr; other.value_matrix_ = nullptr; other.bitmaps_ = nullptr; + other.cur_row_size_ = 0; } return *this; } @@ -283,12 +288,6 @@ class Tablet { } size_t get_column_count() const { return schema_vec_->size(); } uint32_t get_cur_row_size() const { return cur_row_size_; } - int64_t get_timestamp(uint32_t row_index) const { - return timestamps_[row_index]; - } - bool is_null(uint32_t row_index, uint32_t col_index) const { - return bitmaps_[col_index].test(row_index); - } /** * @brief Adds a timestamp to the specified row. @@ -300,25 +299,27 @@ class Tablet { */ int add_timestamp(uint32_t row_index, int64_t timestamp); - /** - * @brief Bulk copy timestamps into the tablet. - * - * @param timestamps Pointer to an array of timestamp values. - * @param count Number of timestamps to copy. Must be <= max_row_num. - * If count > cur_row_size_, cur_row_size_ is updated to count, - * so that subsequent operations know how many rows are populated. - * @return Returns 0 on success, or a non-zero error code on failure - * (E_OUT_OF_RANGE if count > max_row_num). - */ int set_timestamps(const int64_t* timestamps, uint32_t count); - // Bulk copy fixed-length column data. If bitmap is nullptr, all rows are - // non-null. Otherwise bit=1 means null, bit=0 means valid (same as TsFile - // BitMap convention). Callers using other conventions (e.g. Arrow, where - // 1=valid) must invert before calling. + // Bulk copy fixed-length column data. bitmap=nullptr means all non-null. + // bitmap uses TsFile convention: bit=1 is null, bit=0 is valid. int set_column_values(uint32_t schema_index, const void* data, const uint8_t* bitmap, uint32_t count); + // Bulk copy a STRING column from Arrow-style offsets + flat data buffer. + // bitmap=nullptr means all non-null; same convention as set_column_values. + int set_column_string_values(uint32_t schema_index, const int32_t* offsets, + const char* data, const uint8_t* bitmap, + uint32_t count); + + // Bulk fill a STRING column with the same value for all rows. + int set_column_string_repeated(uint32_t schema_index, const char* str, + uint32_t str_len, uint32_t count); + + // Reset per-batch state so the tablet can be reused without reallocating + // its backing buffers. row_count is typically 0 before refilling. + void reset(uint32_t row_count = 0); + void* get_value(int row_index, uint32_t schema_index, common::TSDataType& data_type) const; /** @@ -341,14 +342,10 @@ class Tablet { std::shared_ptr get_device_id(int i) const; std::vector find_all_device_boundaries() const; - // Bulk copy string column data (offsets + data buffer). - // offsets has count+1 entries and must start from 0 (offsets[0] == 0). - // bitmap follows TsFile convention (bit=1 means null, nullptr means all - // valid). Callers using Arrow convention (bit=1 means valid) must invert - // before calling. - int set_column_string_values(uint32_t schema_index, const int32_t* offsets, - const char* data, const uint8_t* bitmap, - uint32_t count); + // When the caller guarantees that all rows belong to a single device, + // set this flag to skip the O(n*m) boundary detection in the write path. + void set_single_device(bool v) { single_device_ = v; } + bool is_single_device() const { return single_device_; } /** * @brief Template function to add a value of type T to the specified row * and column by name. @@ -406,6 +403,7 @@ class Tablet { common::BitMap* bitmaps_; std::vector column_categories_; std::vector id_column_indexes_; + bool single_device_ = false; }; } // end namespace storage diff --git a/cpp/src/common/thread_pool.h b/cpp/src/common/thread_pool.h index f82aea038..191001bd9 100644 --- a/cpp/src/common/thread_pool.h +++ b/cpp/src/common/thread_pool.h @@ -27,7 +27,6 @@ #include #include #include -#include #include namespace common { @@ -38,12 +37,27 @@ namespace common { // (column-parallel decoding). class ThreadPool { public: - explicit ThreadPool(size_t num_threads) : stop_(false), active_(0) { - for (size_t i = 0; i < num_threads; i++) { - workers_.emplace_back([this] { worker_loop(); }); + explicit ThreadPool(size_t num_threads) + // A zero-thread pool would silently accept submit() but wait_all() + // would block forever because active_ never reaches 0. init_common() + // already clamps the configured size to >= 1 before building the + // global pool; this normalization is a defensive backstop so any + // direct ThreadPool(0) still makes progress. + : num_threads_(num_threads == 0 ? 1 : num_threads), + stop_(false), + active_(0) { + for (size_t i = 0; i < num_threads_; i++) { + workers_.emplace_back([this, i] { worker_loop(i); }); } } + // Returns this worker's index in [0, num_threads). Returns SIZE_MAX when + // called from a non-pool thread. Used by callers that want per-worker + // state (e.g., per-worker decoders/compressors). + static size_t current_worker_id() { return tl_worker_id_(); } + + size_t num_threads() const { return num_threads_; } + ~ThreadPool() { { std::lock_guard lk(mu_); @@ -88,7 +102,8 @@ class ThreadPool { } private: - void worker_loop() { + void worker_loop(size_t id) { + tl_worker_id_() = id; while (true) { std::function task; { @@ -98,7 +113,23 @@ class ThreadPool { task = std::move(tasks_.front()); tasks_.pop(); } - task(); + // Without the try/catch, a task that throws would: + // (1) skip the active_-- below → wait_all() blocks forever + // because active_ never drops to zero, and + // (2) propagate the exception out of the std::thread function + // → std::terminate() takes down the whole process. + // Swallowing the exception is unfortunate but it matches the + // contract of the public submit(std::function) overload + // which has no way to surface the failure back to the caller. + // submit() callers receive their error via the std::future + // wrapper installed by std::packaged_task — that path never + // reaches here, so this catch only fires for fire-and-forget + // tasks where the alternative is termination. + try { + task(); + } catch (...) { + // Intentionally suppressed; see comment above. + } { std::lock_guard lk(mu_); active_--; @@ -107,6 +138,14 @@ class ThreadPool { } } + // Wrapped in a function so static-initialization order is well-defined + // (function-local static is zero-initialized to a sentinel). + static size_t& tl_worker_id_() { + static thread_local size_t id = static_cast(-1); + return id; + } + + size_t num_threads_; std::vector workers_; std::queue> tasks_; std::mutex mu_; diff --git a/cpp/src/common/tsblock/tsblock.h b/cpp/src/common/tsblock/tsblock.h index 859ad393d..b68af1611 100644 --- a/cpp/src/common/tsblock/tsblock.h +++ b/cpp/src/common/tsblock/tsblock.h @@ -144,6 +144,12 @@ class RowAppender { ASSERT(tsblock_->row_count_ > 0); tsblock_->row_count_--; } + FORCE_INLINE uint32_t remaining() const { + return tsblock_->max_row_count_ - tsblock_->row_count_; + } + FORCE_INLINE void add_rows(uint32_t count) { + tsblock_->row_count_ += count; + } FORCE_INLINE void append(uint32_t slot_index, const char* value, uint32_t len) { @@ -222,6 +228,19 @@ class ColAppender { } FORCE_INLINE void reset() { column_row_count_ = 0; } + FORCE_INLINE void bulk_append_fixed(const char* data, uint32_t count, + uint32_t elem_size) { + vec_->get_value_data().append_fixed_value(data, count * elem_size); + vec_->add_row_nums(count); + column_row_count_ += count; + } + + FORCE_INLINE uint32_t get_column_row_count() const { + return column_row_count_; + } + + FORCE_INLINE Vector* get_vector() { return vec_; } + private: uint32_t column_index_; uint32_t column_row_count_; @@ -242,6 +261,8 @@ class RowIterator { FORCE_INLINE bool has_next() { return row_id_ < tsblock_->row_count_; } + FORCE_INLINE uint32_t get_row_id() const { return row_id_; } + FORCE_INLINE uint32_t get_column_count() { return column_count_; } FORCE_INLINE TSDataType get_data_type(uint32_t column_index) { @@ -251,17 +272,14 @@ class RowIterator { FORCE_INLINE void next() { ASSERT(row_id_ < tsblock_->row_count_); - ++row_id_; + const uint32_t current_row_id = row_id_++; for (uint32_t i = 0; i < column_count_; ++i) { - tsblock_->vectors_[i]->update_offset(); + if (!tsblock_->vectors_[i]->is_null(current_row_id)) { + tsblock_->vectors_[i]->update_offset(); + } } } - FORCE_INLINE void next(size_t ind) const { - ASSERT(row_id_ < tsblock_->row_count_); - tsblock_->vectors_[ind]->update_offset(); - } - FORCE_INLINE void update_row_id() { row_id_++; } FORCE_INLINE char* read(uint32_t column_index, uint32_t* __restrict len, @@ -271,6 +289,22 @@ class RowIterator { return vec->read(len, null, row_id_); } + // Cheap null check at the current row that avoids the value-read path. + FORCE_INLINE bool is_null_at(uint32_t column_index) { + ASSERT(column_index < column_count_); + return tsblock_->vectors_[column_index]->is_null(row_id_); + } + + // Direct access to the underlying Vector for the column. Caller is + // responsible for type-correct interpretation of the buffer; intended + // for the fast typed-read path that wants to bypass Vector::read's + // virtual dispatch (read into the raw buffer at the vector's current + // offset_). + FORCE_INLINE Vector* get_vector(uint32_t column_index) { + ASSERT(column_index < column_count_); + return tsblock_->vectors_[column_index]; + } + std::string debug_string(); // for debug private: @@ -311,6 +345,23 @@ class ColIterator { FORCE_INLINE uint32_t get_column_index() { return column_index_; } + FORCE_INLINE uint32_t remaining() const { + return tsblock_->row_count_ - row_id_; + } + FORCE_INLINE char* data_ptr() { + return vec_->get_value_data().get_data() + vec_->get_offset(); + } + FORCE_INLINE void advance(uint32_t n, uint32_t elem_size) { + row_id_ += n; + vec_->advance_offset(n * elem_size); + } + + FORCE_INLINE void advance_row_only(uint32_t n) { row_id_ += n; } + + FORCE_INLINE uint32_t get_row_id() const { return row_id_; } + + FORCE_INLINE Vector* get_vector() { return vec_; } + private: uint32_t column_index_; uint32_t row_id_; diff --git a/cpp/src/common/tsblock/vector/variable_length_vector.h b/cpp/src/common/tsblock/vector/variable_length_vector.h index b98a9c739..84e541e5c 100644 --- a/cpp/src/common/tsblock/vector/variable_length_vector.h +++ b/cpp/src/common/tsblock/vector/variable_length_vector.h @@ -45,8 +45,15 @@ class VariableLengthVector : public Vector { // cppcheck-suppress missingOverride FORCE_INLINE void update_offset() OVERRIDE { - offset_ += variable_type_len_; - offset_ += last_value_len_; + // Self-contained advance: read the length prefix at the current + // offset from the buffer rather than relying on a side effect from + // a prior read(). This makes update_offset safe when callers skip + // reading variable-length columns for some rows (e.g. a row + // iterator that only consumes fixed-width columns). + uint32_t value_len = 0; + std::memcpy(&value_len, values_.get_data() + offset_, + sizeof(value_len)); + offset_ += variable_type_len_ + value_len; } // cppcheck-suppress missingOverride diff --git a/cpp/src/common/tsblock/vector/vector.h b/cpp/src/common/tsblock/vector/vector.h index 37a96c543..dde3e76cc 100644 --- a/cpp/src/common/tsblock/vector/vector.h +++ b/cpp/src/common/tsblock/vector/vector.h @@ -73,6 +73,9 @@ class Vector { FORCE_INLINE uint32_t get_row_num() { return row_num_; } FORCE_INLINE void add_row_num() { row_num_++; } + FORCE_INLINE void add_row_nums(uint32_t n) { row_num_ += n; } + FORCE_INLINE uint32_t get_offset() const { return offset_; } + FORCE_INLINE void advance_offset(uint32_t bytes) { offset_ += bytes; } FORCE_INLINE common::TsBlock* get_tsblock() { return tsblock_; } diff --git a/cpp/src/common/tsfile_common.h b/cpp/src/common/tsfile_common.h index b516b608f..fd3690200 100644 --- a/cpp/src/common/tsfile_common.h +++ b/cpp/src/common/tsfile_common.h @@ -314,6 +314,11 @@ class ITimeseriesIndex { virtual common::SimpleList* get_value_chunk_meta_list() const { return nullptr; } + virtual uint32_t get_value_column_count() const { return 1; } + virtual common::SimpleList* get_value_chunk_meta_list( + uint32_t col_index) const { + return col_index == 0 ? get_value_chunk_meta_list() : nullptr; + } virtual common::String get_measurement_name() const { return common::String(); @@ -457,7 +462,7 @@ class TimeseriesIndex : public ITimeseriesIndex { (timeseries_meta_type_ & 0x3F); // TODO chunk_meta_list_ = new (chunk_meta_list_buf) common::SimpleList(pa); - uint32_t start_pos = in.read_pos(); + uint64_t start_pos = in.read_pos(); while (IS_SUCC(ret) && in.read_pos() < start_pos + chunk_meta_list_data_size_) { void* cm_buf = pa->alloc(sizeof(ChunkMeta)); @@ -589,11 +594,17 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex { virtual common::String get_measurement_name() const { return value_ts_idx_->get_measurement_name(); } + // Return the VALUE column's data type — that's what consumers like + // TsFileReader::get_timeseries_schema and metadata APIs expect for an + // aligned measurement. Returning time_ts_idx_->get_data_type() would + // surface the time chunk's on-wire VECTOR marker (or INT64 depending + // on how the marker is interpreted) for every aligned timeseries, + // breaking schema introspection. virtual common::TSDataType get_data_type() const { return value_ts_idx_ == nullptr ? common::INVALID_DATATYPE : value_ts_idx_->get_data_type(); } - virtual bool is_aligned() const { return true; } + bool is_aligned() const override { return true; } virtual Statistic* get_statistic() const { return value_ts_idx_->get_statistic(); } @@ -608,6 +619,52 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex { #endif }; +class MultiAlignedTimeseriesIndex : public ITimeseriesIndex { + public: + TimeseriesIndex* time_ts_idx_ = nullptr; + std::vector value_ts_idxs_; + + MultiAlignedTimeseriesIndex() {} + ~MultiAlignedTimeseriesIndex() {} + + common::SimpleList* get_time_chunk_meta_list() const override { + return time_ts_idx_ ? time_ts_idx_->get_chunk_meta_list() : nullptr; + } + common::SimpleList* get_value_chunk_meta_list() const override { + return value_ts_idxs_.empty() + ? nullptr + : value_ts_idxs_[0]->get_chunk_meta_list(); + } + uint32_t get_value_column_count() const override { + return value_ts_idxs_.size(); + } + common::SimpleList* get_value_chunk_meta_list( + uint32_t col_index) const override { + return col_index < value_ts_idxs_.size() + ? value_ts_idxs_[col_index]->get_chunk_meta_list() + : nullptr; + } + common::String get_measurement_name() const override { + return value_ts_idxs_.empty() + ? common::String() + : value_ts_idxs_[0]->get_measurement_name(); + } + // Same fix as AlignedTimeseriesIndex: report the first value column's + // type rather than the time chunk's VECTOR marker. Consumers walking + // a multi-aligned device for schema info expect the measurement type. + common::TSDataType get_data_type() const override { + return value_ts_idxs_.empty() || value_ts_idxs_[0] == nullptr + ? common::INVALID_DATATYPE + : value_ts_idxs_[0]->get_data_type(); + } + bool is_aligned() const override { return true; } + Statistic* get_statistic() const override { return nullptr; } + + const std::vector& get_value_indices() const { + return value_ts_idxs_; + } +}; + class TSMIterator { public: explicit TSMIterator( @@ -629,7 +686,6 @@ class TSMIterator { common::SimpleList::Iterator chunk_meta_iter_; // timeseries measurenemnt chunk meta info - // map >> std::map, std::map>, IDeviceIDComparator> diff --git a/cpp/src/compress/lz4_compressor.cc b/cpp/src/compress/lz4_compressor.cc index 88c64466f..0f19ce179 100644 --- a/cpp/src/compress/lz4_compressor.cc +++ b/cpp/src/compress/lz4_compressor.cc @@ -76,9 +76,13 @@ int LZ4Compressor::compress(char* uncompressed_buf, } void LZ4Compressor::after_compress(char* compressed_buf) { + // See SnappyCompressor::after_compress for the same reasoning: the member + // pointer can lag behind the caller-known buffer across page reuse. if (compressed_buf != nullptr) { - mem_free(compressed_buf_); - compressed_buf_ = nullptr; + mem_free(compressed_buf); + if (compressed_buf_ == compressed_buf) { + compressed_buf_ = nullptr; + } } } @@ -132,9 +136,11 @@ int LZ4Compressor::uncompress(char* compressed_buf, uint32_t compressed_buf_len, void LZ4Compressor::after_uncompress(char* uncompressed_buf) { if (uncompressed_buf != nullptr) { - mem_free(uncompressed_buf_); - uncompressed_buf_ = nullptr; + mem_free(uncompressed_buf); + if (uncompressed_buf_ == uncompressed_buf) { + uncompressed_buf_ = nullptr; + } } } -} // end namespace storage \ No newline at end of file +} // end namespace storage diff --git a/cpp/src/compress/snappy_compressor.cc b/cpp/src/compress/snappy_compressor.cc index 6a2735e7b..e78a67ac3 100644 --- a/cpp/src/compress/snappy_compressor.cc +++ b/cpp/src/compress/snappy_compressor.cc @@ -73,9 +73,16 @@ int SnappyCompressor::compress(char* uncompressed_buf, } void SnappyCompressor::after_compress(char* compressed_buf) { + // Free the buffer the caller is releasing, not whatever we last cached in + // compressed_buf_. The member is only kept so destroy() can clean up if + // after_compress is never called. When the same compressor is reused + // across pages, compressed_buf_ may point to a different (live) allocation + // or be null by the time the caller releases an earlier page's buffer. if (compressed_buf != nullptr) { - mem_free(compressed_buf_); - compressed_buf_ = nullptr; + mem_free(compressed_buf); + if (compressed_buf_ == compressed_buf) { + compressed_buf_ = nullptr; + } } } @@ -109,9 +116,11 @@ int SnappyCompressor::uncompress(char* compressed_buf, void SnappyCompressor::after_uncompress(char* uncompressed_buf) { if (uncompressed_buf != nullptr) { - mem_free(uncompressed_buf_); - uncompressed_buf_ = nullptr; + mem_free(uncompressed_buf); + if (uncompressed_buf_ == uncompressed_buf) { + uncompressed_buf_ = nullptr; + } } } -} // end namespace storage \ No newline at end of file +} // end namespace storage diff --git a/cpp/src/compress/uncompressed_compressor.h b/cpp/src/compress/uncompressed_compressor.h index c262837a8..c342b5001 100644 --- a/cpp/src/compress/uncompressed_compressor.h +++ b/cpp/src/compress/uncompressed_compressor.h @@ -20,19 +20,38 @@ #ifndef COMPRESS_UNCOMPRESSED_COMPRESSOR_H #define COMPRESS_UNCOMPRESSED_COMPRESSOR_H +#include + +#include "common/allocator/alloc_base.h" #include "compressor.h" +#include "utils/errno_define.h" +#include "utils/util_define.h" namespace storage { class UncompressedCompressor : public Compressor { public: - UncompressedCompressor() {} - virtual ~UncompressedCompressor() {} + UncompressedCompressor() : uncompressed_buf_(nullptr) {} + virtual ~UncompressedCompressor() { + if (uncompressed_buf_ != nullptr) { + common::mem_free(uncompressed_buf_); + uncompressed_buf_ = nullptr; + } + } int reset(bool for_compress) { UNUSED(for_compress); + if (uncompressed_buf_ != nullptr) { + common::mem_free(uncompressed_buf_); + uncompressed_buf_ = nullptr; + } return common::E_OK; } - void destroy() {} + void destroy() { + if (uncompressed_buf_ != nullptr) { + common::mem_free(uncompressed_buf_); + uncompressed_buf_ = nullptr; + } + } int compress(char* uncompressed_buf, uint32_t uncompressed_buf_len, char*& compressed_buf, uint32_t& compressed_buf_len) { compressed_buf = uncompressed_buf; @@ -43,11 +62,33 @@ class UncompressedCompressor : public Compressor { int uncompress(char* compressed_buf, uint32_t compressed_buf_len, char*& uncompressed_buf, uint32_t& uncompressed_buf_len) { - uncompressed_buf = compressed_buf; + char* buf = static_cast( + common::mem_alloc(compressed_buf_len, common::MOD_COMPRESSOR_OBJ)); + if (buf == nullptr) { + return common::E_OOM; + } + memcpy(buf, compressed_buf, compressed_buf_len); + uncompressed_buf = buf; + uncompressed_buf_ = buf; uncompressed_buf_len = compressed_buf_len; return common::E_OK; } - void after_uncompress(char* uncompressed_buf) { UNUSED(uncompressed_buf); } + void after_uncompress(char* uncompressed_buf) { + // Free the buffer the caller is releasing, not the most-recently + // allocated one cached in uncompressed_buf_. Two successive + // uncompress() calls would overwrite uncompressed_buf_ with the + // second allocation; after_uncompress(first) used to free that + // second buffer (use-after-free for the still-live one) and leak + // the first. + if (uncompressed_buf == nullptr) return; + common::mem_free(uncompressed_buf); + if (uncompressed_buf_ == uncompressed_buf) { + uncompressed_buf_ = nullptr; + } + } + + private: + char* uncompressed_buf_; }; } // end namespace storage diff --git a/cpp/src/cwrapper/arrow_c.cc b/cpp/src/cwrapper/arrow_c.cc index 931c17de7..3f02a7692 100644 --- a/cpp/src/cwrapper/arrow_c.cc +++ b/cpp/src/cwrapper/arrow_c.cc @@ -843,7 +843,12 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array, const ArrowArray* ts_arr = in_array->children[time_col_index]; const int64_t* ts_buf = static_cast(ts_arr->buffers[1]) + ts_arr->offset; - tablet->set_timestamps(ts_buf, static_cast(n_rows)); + int sret = + tablet->set_timestamps(ts_buf, static_cast(n_rows)); + if (sret != common::E_OK) { + delete tablet; + return sret; + } } // Fill data columns from Arrow children (use read_modes to decode buffers) @@ -892,11 +897,15 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array, delete tablet; return common::E_OOM; } - tablet->set_column_values(tcol, data, null_bm, - static_cast(n_rows)); + int sret = tablet->set_column_values( + tcol, data, null_bm, static_cast(n_rows)); if (null_bm != nullptr) { common::mem_free(null_bm); } + if (sret != common::E_OK) { + delete tablet; + return sret; + } break; } case common::DATE: { @@ -948,14 +957,18 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array, delete tablet; return common::E_OOM; } - tablet->set_column_string_values(tcol, offsets, data, null_bm, - nrows); + int sret = tablet->set_column_string_values(tcol, offsets, data, + null_bm, nrows); if (null_bm != nullptr) { common::mem_free(null_bm); } if (norm_offsets != nullptr) { common::mem_free(norm_offsets); } + if (sret != common::E_OK) { + delete tablet; + return sret; + } break; } default: diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc index 0934981f9..0fc915974 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.cc +++ b/cpp/src/cwrapper/tsfile_cwrapper.cc @@ -21,7 +21,9 @@ #include #include +#include #include + #ifdef _WIN32 #include #else @@ -92,8 +94,14 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) { int ret; init_tsfile_config(); - if (access(pathname, F_OK) == 0) { - *err_code = common::E_ALREADY_EXIST; + struct stat path_stat {}; + if (stat(pathname, &path_stat) == 0) { +#ifdef _WIN32 + const bool is_dir = (path_stat.st_mode & _S_IFDIR) != 0; +#else + const bool is_dir = S_ISDIR(path_stat.st_mode); +#endif + *err_code = is_dir ? common::E_FILE_OPEN_ERR : common::E_ALREADY_EXIST; return nullptr; } @@ -110,6 +118,17 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) { TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema, ERRNO* err_code) { + // C API: every public entry must defend against null callers — a null + // schema or err_code would crash the host process the moment it's + // dereferenced. The tag-filter helpers already follow this pattern. + if (err_code == nullptr) { + return nullptr; + } + if (file == nullptr || schema == nullptr || + schema->column_schemas == nullptr || schema->table_name == nullptr) { + *err_code = common::E_INVALID_ARG; + return nullptr; + } if (schema->column_num == 0) { *err_code = common::E_INVALID_SCHEMA; return nullptr; @@ -149,6 +168,15 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file, TableSchema* schema, uint64_t memory_threshold, ERRNO* err_code) { + // See tsfile_writer_new() above for the null-guard rationale. + if (err_code == nullptr) { + return nullptr; + } + if (file == nullptr || schema == nullptr || + schema->column_schemas == nullptr || schema->table_name == nullptr) { + *err_code = common::E_INVALID_ARG; + return nullptr; + } if (schema->column_num == 0) { *err_code = common::E_INVALID_SCHEMA; return nullptr; @@ -158,11 +186,21 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file, std::set column_names; for (int i = 0; i < schema->column_num; i++) { ColumnSchema cur_schema = schema->column_schemas[i]; - if (column_names.find(cur_schema.column_name) == column_names.end()) { + // Reject only when the name has already been seen. The previous + // condition was inverted, so the first column (always a fresh name) + // was rejected as a duplicate and this constructor was effectively + // unusable — tsfile_writer_new()'s loop above has the correct check + // for comparison. + if (column_names.find(cur_schema.column_name) != column_names.end()) { *err_code = common::E_INVALID_SCHEMA; return nullptr; } column_names.insert(cur_schema.column_name); + if (cur_schema.column_category == TAG && + cur_schema.data_type != TS_DATATYPE_STRING) { + *err_code = common::E_INVALID_SCHEMA; + return nullptr; + } column_schemas.emplace_back( cur_schema.column_name, static_cast(cur_schema.data_type), @@ -1205,6 +1243,8 @@ ERRNO populate_c_metadata_map_from_cpp( if (m.measurement_name == nullptr) { for (uint32_t u = 0; u < slot; u++) { free_timeseries_statistic_heap(&e.timeseries[u].statistic); + free_timeseries_statistic_heap( + &e.timeseries[u].timeline_statistic); free(e.timeseries[u].measurement_name); } free(e.timeseries); @@ -1465,6 +1505,13 @@ Tablet _tablet_new_with_target_name(const char* device_id, } ERRNO _tsfile_writer_register_table(TsFileWriter writer, TableSchema* schema) { + if (writer == nullptr || schema == nullptr || + schema->column_schemas == nullptr || schema->table_name == nullptr) { + return common::E_INVALID_ARG; + } + if (schema->column_num <= 0) { + return common::E_INVALID_SCHEMA; + } std::vector measurement_schemas; std::vector column_categories; measurement_schemas.resize(schema->column_num); @@ -1587,13 +1634,50 @@ ResultSet _tsfile_reader_query_device(TsFileReader reader, return qds; } -// ---------- Tag Filter API ---------- +// ============== Tag Filter API Implementation ============== + +// Helper macro to avoid repetition in tag filter factory functions. +// The shared_ptr must stay alive while TagFilterBuilder accesses the schema. +// Every C-API entry must validate its pointers: a null reader would deref +// during the static_cast, and null table/column/value would feed std::string +// a null pointer (UB / crash). +#define DEFINE_TAG_FILTER_FACTORY(name, method) \ + TagFilterHandle tsfile_tag_filter_##name( \ + TsFileReader reader, const char* table_name, const char* column_name, \ + const char* value) { \ + if (reader == nullptr || table_name == nullptr || \ + column_name == nullptr || value == nullptr) { \ + return nullptr; \ + } \ + auto* r = static_cast(reader); \ + auto schema = r->get_table_schema(table_name); \ + if (!schema) return nullptr; \ + storage::TagFilterBuilder builder(schema.get()); \ + return builder.method(column_name, value); \ + } + +DEFINE_TAG_FILTER_FACTORY(eq, eq) +DEFINE_TAG_FILTER_FACTORY(neq, neq) +DEFINE_TAG_FILTER_FACTORY(lt, lt) +DEFINE_TAG_FILTER_FACTORY(lteq, lteq) +DEFINE_TAG_FILTER_FACTORY(gt, gt) +DEFINE_TAG_FILTER_FACTORY(gteq, gteq) + +#undef DEFINE_TAG_FILTER_FACTORY TagFilterHandle tsfile_tag_filter_create(TsFileReader reader, const char* table_name, const char* column_name, const char* value, TagFilterOp op, ERRNO* err_code) { + if (err_code == nullptr) { + return nullptr; + } + if (reader == nullptr || table_name == nullptr || column_name == nullptr || + value == nullptr) { + *err_code = common::E_INVALID_ARG; + return nullptr; + } auto* r = static_cast(reader); auto schema = r->get_table_schema(table_name); if (!schema) { @@ -1656,25 +1740,30 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right) { - return static_cast(storage::TagFilterBuilder::and_filter( + if (!left || !right) return nullptr; + return storage::TagFilterBuilder::and_filter( static_cast(left), - static_cast(right))); + static_cast(right)); } TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right) { - return static_cast(storage::TagFilterBuilder::or_filter( + if (!left || !right) return nullptr; + return storage::TagFilterBuilder::or_filter( static_cast(left), - static_cast(right))); + static_cast(right)); } TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter) { - return static_cast(storage::TagFilterBuilder::not_filter( - static_cast(filter))); + if (!filter) return nullptr; + return storage::TagFilterBuilder::not_filter( + static_cast(filter)); } void tsfile_tag_filter_free(TagFilterHandle filter) { - delete static_cast(filter); + if (filter) { + delete static_cast(filter); + } } ResultSet tsfile_query_table_with_tag_filter( diff --git a/cpp/src/cwrapper/tsfile_cwrapper.h b/cpp/src/cwrapper/tsfile_cwrapper.h index ae3e28eed..3b3b13c36 100644 --- a/cpp/src/cwrapper/tsfile_cwrapper.h +++ b/cpp/src/cwrapper/tsfile_cwrapper.h @@ -905,32 +905,68 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader, bool is_not, ERRNO* err_code); /** - * @brief Combine two tag filters with AND. + * @brief Create a tag equality filter: column == value. + * + * @param reader [in] Valid TsFileReader handle (used to resolve column index). + * @param table_name [in] Target table name. + * @param column_name [in] Tag column name. + * @param value [in] Value to compare against. + * @return TagFilterHandle on success, NULL on failure. + */ +TagFilterHandle tsfile_tag_filter_eq(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +TagFilterHandle tsfile_tag_filter_neq(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +TagFilterHandle tsfile_tag_filter_lt(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +TagFilterHandle tsfile_tag_filter_lteq(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +TagFilterHandle tsfile_tag_filter_gt(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +TagFilterHandle tsfile_tag_filter_gteq(TsFileReader reader, + const char* table_name, + const char* column_name, + const char* value); + +/** + * @brief Logical AND of two tag filters. Takes ownership of left and right. */ TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left, TagFilterHandle right); /** - * @brief Combine two tag filters with OR. + * @brief Logical OR of two tag filters. Takes ownership of left and right. */ TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left, TagFilterHandle right); /** - * @brief Negate a tag filter. + * @brief Logical NOT of a tag filter. Takes ownership of filter. */ TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter); /** - * @brief Free a tag filter and all its children. + * @brief Free a tag filter handle. */ void tsfile_tag_filter_free(TagFilterHandle filter); /** - * @brief Query table with tag filter. - * - * @param batch_size <= 0 means row-by-row return mode, - * > 0 means return TsBlock with the specified block size. + * @brief Batch query with tag filter support. */ ResultSet tsfile_query_table_with_tag_filter( TsFileReader reader, const char* table_name, char** columns, diff --git a/cpp/src/encoding/decoder.h b/cpp/src/encoding/decoder.h index c290b5791..24455ca01 100644 --- a/cpp/src/encoding/decoder.h +++ b/cpp/src/encoding/decoder.h @@ -21,6 +21,7 @@ #define ENCODING_DECODER_H #include "common/allocator/byte_stream.h" +#include "common/db_common.h" namespace storage { @@ -37,6 +38,140 @@ class Decoder { virtual int read_double(double& ret_value, common::ByteStream& in) = 0; virtual int read_String(common::String& ret_value, common::PageArena& pa, common::ByteStream& in) = 0; + + virtual int read_batch_int32(int32_t* out, int capacity, int& actual, + common::ByteStream& in) { + actual = 0; + int ret = common::E_OK; + int32_t val; + while (actual < capacity && has_remaining(in)) { + ret = read_int32(val, in); + if (ret != common::E_OK) { + return ret; + } + out[actual++] = val; + } + return common::E_OK; + } + + virtual int read_batch_int64(int64_t* out, int capacity, int& actual, + common::ByteStream& in) { + actual = 0; + int ret = common::E_OK; + int64_t val; + while (actual < capacity && has_remaining(in)) { + ret = read_int64(val, in); + if (ret != common::E_OK) { + return ret; + } + out[actual++] = val; + } + return common::E_OK; + } + + virtual int read_batch_float(float* out, int capacity, int& actual, + common::ByteStream& in) { + actual = 0; + int ret = common::E_OK; + float val; + while (actual < capacity && has_remaining(in)) { + ret = read_float(val, in); + if (ret != common::E_OK) { + return ret; + } + out[actual++] = val; + } + return common::E_OK; + } + + virtual int read_batch_double(double* out, int capacity, int& actual, + common::ByteStream& in) { + actual = 0; + int ret = common::E_OK; + double val; + while (actual < capacity && has_remaining(in)) { + ret = read_double(val, in); + if (ret != common::E_OK) { + return ret; + } + out[actual++] = val; + } + return common::E_OK; + } + + virtual int skip_int32(int count, int& skipped, common::ByteStream& in) { + skipped = 0; + int ret = common::E_OK; + int32_t dummy; + while (skipped < count && has_remaining(in)) { + ret = read_int32(dummy, in); + if (ret != common::E_OK) { + return ret; + } + ++skipped; + } + return common::E_OK; + } + + virtual int skip_int64(int count, int& skipped, common::ByteStream& in) { + skipped = 0; + int ret = common::E_OK; + int64_t dummy; + while (skipped < count && has_remaining(in)) { + ret = read_int64(dummy, in); + if (ret != common::E_OK) { + return ret; + } + ++skipped; + } + return common::E_OK; + } + + virtual int skip_float(int count, int& skipped, common::ByteStream& in) { + skipped = 0; + int ret = common::E_OK; + float dummy; + while (skipped < count && has_remaining(in)) { + ret = read_float(dummy, in); + if (ret != common::E_OK) { + return ret; + } + ++skipped; + } + return common::E_OK; + } + + virtual int skip_double(int count, int& skipped, common::ByteStream& in) { + skipped = 0; + int ret = common::E_OK; + double dummy; + while (skipped < count && has_remaining(in)) { + ret = read_double(dummy, in); + if (ret != common::E_OK) { + return ret; + } + ++skipped; + } + return common::E_OK; + } + + // Block-level filter check: peek the next block header and compute + // the value range [block_min, block_max] without decoding. + // Returns true if a block was peeked; false if not supported or no data. + // After peeking, caller must either: + // - Call skip_peeked_block_int64() to skip the block + // - Call read_batch_int64() which will use the peeked header + virtual bool peek_next_block_range_int64(common::ByteStream& in, + int64_t& block_min, + int64_t& block_max, + int& block_count) { + return false; + } + + // Skip the block whose header was already consumed by peek. + virtual int skip_peeked_block_int64(common::ByteStream& in, int& skipped) { + return common::E_NOT_SUPPORT; + } }; } // end namespace storage diff --git a/cpp/src/encoding/dictionary_encoder.h b/cpp/src/encoding/dictionary_encoder.h index be5f78a09..8f7c495c4 100644 --- a/cpp/src/encoding/dictionary_encoder.h +++ b/cpp/src/encoding/dictionary_encoder.h @@ -83,7 +83,12 @@ class DictionaryEncoder : public Encoder { if (entry_index_.count(value) == 0) { index_entry_.push_back(value); map_size_ = map_size_ + value.length(); - entry_index_[value] = static_cast(index_entry_.size()) - 1; + // Compute the index before the insert: LHS/RHS evaluation order of + // `m[k] = m.size()` is unspecified before C++17, so a compiler + // that evaluates the LHS first would store size()+1 and corrupt + // the dictionary. + const int new_idx = static_cast(index_entry_.size()) - 1; + entry_index_[value] = new_idx; } values_encoder_.encode(entry_index_[value], out); return common::E_OK; diff --git a/cpp/src/encoding/encoder.h b/cpp/src/encoding/encoder.h index 921686446..386129f6e 100644 --- a/cpp/src/encoding/encoder.h +++ b/cpp/src/encoding/encoder.h @@ -48,6 +48,81 @@ class Encoder { * @return the maximal size of possible memory occupied by current encoder */ virtual int get_max_byte_size() = 0; + + /* + * Batch encoding interfaces. + * Default implementations fall back to per-value encode(). + * Subclasses may override for better performance. + */ + virtual int encode_batch(const bool* values, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + if (RET_FAIL(encode(values[i], out_stream))) { + return ret; + } + } + return ret; + } + virtual int encode_batch(const int32_t* values, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + if (RET_FAIL(encode(values[i], out_stream))) { + return ret; + } + } + return ret; + } + virtual int encode_batch(const int64_t* values, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + if (RET_FAIL(encode(values[i], out_stream))) { + return ret; + } + } + return ret; + } + virtual int encode_batch(const float* values, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + if (RET_FAIL(encode(values[i], out_stream))) { + return ret; + } + } + return ret; + } + virtual int encode_batch(const double* values, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + if (RET_FAIL(encode(values[i], out_stream))) { + return ret; + } + } + return ret; + } + + // Batch encode strings from a contiguous buffer with offset array + // (Arrow-style layout from Tablet::StringColumn). + // string[i] = buffer + offsets[start_idx + i], length = offsets[start_idx + + // i + 1] - offsets[start_idx + i]. + virtual int encode_string_batch(const char* buffer, const uint32_t* offsets, + uint32_t start_idx, uint32_t count, + common::ByteStream& out_stream) { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = start_idx + i; + uint32_t len = offsets[idx + 1] - offsets[idx]; + common::String val(buffer + offsets[idx], len); + if (RET_FAIL(encode(val, out_stream))) { + return ret; + } + } + return ret; + } }; } // end namespace storage diff --git a/cpp/src/encoding/gorilla_decoder.h b/cpp/src/encoding/gorilla_decoder.h index 5684561aa..e1e490105 100644 --- a/cpp/src/encoding/gorilla_decoder.h +++ b/cpp/src/encoding/gorilla_decoder.h @@ -30,6 +30,163 @@ namespace storage { +// ── Raw-pointer bit reader ──────────────────────────────────────────────── +// Operates directly on a contiguous byte array, bypassing ByteStream's +// per-byte read_buf() overhead (atomic loads, page boundary checks, memcpy). + +struct GorillaBitReader { + const uint8_t* data; + uint32_t pos; // next byte index to load + uint32_t data_len; // total bytes + int bits; // remaining bits in cur_byte (0..8) + uint8_t cur_byte; + // Set once a load was attempted on an empty input, or once read_bit / + // read_long ran out of bits mid-value. Without this, a truncated page + // would spin read_long() forever (bits stays 0, n -= 0 makes no + // progress) and read_bit() would execute a negative shift via + // (cur_byte >> (bits - 1)). + bool exhausted = false; + + FORCE_INLINE void load_byte_if_empty() { + if (bits == 0) { + if (pos < data_len) { + cur_byte = data[pos++]; + bits = 8; + } else { + exhausted = true; + } + } + } + + FORCE_INLINE bool read_bit() { + if (UNLIKELY(bits == 0)) { + exhausted = true; + return false; + } + bool bit = ((cur_byte >> (bits - 1)) & 1) == 1; + bits--; + load_byte_if_empty(); + return bit; + } + + FORCE_INLINE int64_t read_long(int n) { + int64_t value = 0; + while (n > 0) { + if (UNLIKELY(bits == 0)) { + // Input drained mid-value; bail so the outer loop in + // read_control_bits / batch_decode_raw doesn't spin. + exhausted = true; + return value; + } + if (n > bits || n == 8) { + value = (value << bits) + (cur_byte & ((1 << bits) - 1)); + n -= bits; + bits = 0; + } else { + value = + (value << n) + ((cur_byte >> (bits - n)) & ((1 << n) - 1)); + bits -= n; + n = 0; + } + load_byte_if_empty(); + } + return value; + } + + FORCE_INLINE uint8_t read_control_bits(int max_bits) { + uint8_t value = 0x00; + for (int i = 0; i < max_bits; i++) { + value <<= 1; + if (exhausted) break; + if (read_bit()) { + value |= 0x01; + } else { + break; + } + } + return value; + } +}; + +// ── Templated raw-pointer decode helpers ────────────────────────────────── + +template +struct GorillaRawOps { + static FORCE_INLINE T read_next(GorillaBitReader& r, T& stored_value, + int& stored_leading_zeros, + int& stored_trailing_zeros); +}; + +template <> +struct GorillaRawOps { + static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_32BIT; + + static FORCE_INLINE int32_t read_next(GorillaBitReader& r, + int32_t& stored_value, + int& stored_leading_zeros, + int& stored_trailing_zeros) { + uint8_t ctrl = r.read_control_bits(2); + switch (ctrl) { + case 3: { + stored_leading_zeros = + (int)r.read_long(LEADING_ZERO_BITS_LENGTH_32BIT); + uint8_t sig = + (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_32BIT); + sig++; + stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros; + } + // fallthrough + case 2: { + int32_t xor_value = (int32_t)r.read_long( + VALUE_BITS - stored_leading_zeros - stored_trailing_zeros); + xor_value = static_cast(xor_value) + << stored_trailing_zeros; + stored_value ^= xor_value; + } + // fallthrough + default: + return stored_value; + } + return stored_value; + } +}; + +template <> +struct GorillaRawOps { + static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_64BIT; + + static FORCE_INLINE int64_t read_next(GorillaBitReader& r, + int64_t& stored_value, + int& stored_leading_zeros, + int& stored_trailing_zeros) { + uint8_t ctrl = r.read_control_bits(2); + switch (ctrl) { + case 3: { + stored_leading_zeros = + (int)r.read_long(LEADING_ZERO_BITS_LENGTH_64BIT); + uint8_t sig = + (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_64BIT); + sig++; + stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros; + } + // fallthrough + case 2: { + int64_t xor_value = r.read_long( + VALUE_BITS - stored_leading_zeros - stored_trailing_zeros); + xor_value = static_cast(xor_value) + << stored_trailing_zeros; + stored_value ^= xor_value; + } + // fallthrough + default: + return stored_value; + } + return stored_value; + } +}; + +// ────────────────────────────────────────────────────────────────────────── + template class GorillaDecoder : public Decoder { public: @@ -127,6 +284,197 @@ class GorillaDecoder : public Decoder { int read_String(common::String& ret_value, common::PageArena& pa, common::ByteStream& in) override; + // Batch overrides — declared here, defined after template specializations + int read_batch_int32(int32_t* out, int capacity, int& actual, + common::ByteStream& in) override; + int read_batch_int64(int64_t* out, int capacity, int& actual, + common::ByteStream& in) override; + int skip_int32(int count, int& skipped, common::ByteStream& in) override; + int skip_int64(int count, int& skipped, common::ByteStream& in) override; + + protected: + // ── Batch decode using raw pointer (bypasses ByteStream) ───────────── + // The decode() contract: + // stored_value_ holds the "next" value to be returned. + // decode() returns stored_value_, then advances via cache_next(). + // has_next_==false means the ending sentinel was hit. + // + // batch_decode_raw replicates this logic using GorillaBitReader on the + // wrapped contiguous buffer, then syncs state back to ByteStream. + int batch_decode_raw(T* out, int capacity, int& actual, T ending, + common::ByteStream& in) { + int ret = common::E_OK; + actual = 0; + // Bootstrap below would unconditionally write out[0]; guard the + // zero-capacity edge case so callers can probe without writing. + if (capacity <= 0) { + return common::E_OK; + } + if (!in.is_wrapped()) { + return batch_decode_fallback(out, capacity, actual, ending, in); + } + + const uint8_t* base = + (const uint8_t*)in.get_wrapped_buf() + in.read_pos(); + // Gorilla pages are bounded by the page-writer cap (well below 4 GiB), + // so saturating to uint32_t is safe and matches GorillaBitReader's + // 32-bit cursor. + uint32_t remain = static_cast( + std::min(in.remaining_size(), UINT32_MAX)); + + GorillaBitReader r; + r.data = base; + r.pos = 0; + r.data_len = remain; + r.bits = bits_left_; + r.cur_byte = buffer_; + + // Bootstrap first value if needed (mirrors decode()'s first-call path) + if (UNLIKELY(!first_value_was_read_)) { + if (r.bits == 0 && r.pos >= r.data_len) goto done; + r.load_byte_if_empty(); + stored_value_ = (T)r.read_long(GorillaRawOps::VALUE_BITS); + if (UNLIKELY(r.exhausted)) { + // Page truncated before the first value finished; refuse to + // emit a partially-decoded sentinel. + first_value_was_read_ = false; + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + first_value_was_read_ = true; + // Save the first value before cache_next mutates stored_value_ + T first_value = stored_value_; + // cache_next: read_next then check ending + GorillaRawOps::read_next(r, stored_value_, stored_leading_zeros_, + stored_trailing_zeros_); + if (UNLIKELY(r.exhausted)) { + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + if (stored_value_ == ending) { + has_next_ = false; + } else { + has_next_ = true; + } + // Output the first value + out[actual++] = first_value; + if (!has_next_ || actual >= capacity) goto done; + } + + // Main batch loop + while (actual < capacity && has_next_) { + out[actual++] = stored_value_; + GorillaRawOps::read_next(r, stored_value_, stored_leading_zeros_, + stored_trailing_zeros_); + if (UNLIKELY(r.exhausted)) { + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + if (stored_value_ == ending) { + has_next_ = false; + } + } + + done: + // Sync bit-reader state back + buffer_ = r.cur_byte; + bits_left_ = r.bits; + in.wrapped_buf_advance_read_pos(r.pos); + return ret; + } + + int batch_skip_raw(int count, int& skipped, T ending, + common::ByteStream& in) { + int ret = common::E_OK; + skipped = 0; + // Bootstrap below would consume first_value_ even when count == 0, + // advancing the stream past data the caller didn't ask to skip. + if (count <= 0) { + return common::E_OK; + } + if (!in.is_wrapped()) { + return batch_skip_fallback(count, skipped, ending, in); + } + + const uint8_t* base = + (const uint8_t*)in.get_wrapped_buf() + in.read_pos(); + // Same saturation as batch_decode_raw: GorillaBitReader is 32-bit + // internally; pages are well under 4 GiB. + uint32_t remain = static_cast( + std::min(in.remaining_size(), UINT32_MAX)); + + GorillaBitReader r; + r.data = base; + r.pos = 0; + r.data_len = remain; + r.bits = bits_left_; + r.cur_byte = buffer_; + + if (UNLIKELY(!first_value_was_read_)) { + if (r.bits == 0 && r.pos >= r.data_len) goto done; + r.load_byte_if_empty(); + stored_value_ = (T)r.read_long(GorillaRawOps::VALUE_BITS); + if (UNLIKELY(r.exhausted)) { + first_value_was_read_ = false; + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + first_value_was_read_ = true; + GorillaRawOps::read_next(r, stored_value_, stored_leading_zeros_, + stored_trailing_zeros_); + if (UNLIKELY(r.exhausted)) { + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + if (stored_value_ == ending) { + has_next_ = false; + } else { + has_next_ = true; + } + // The first value counts as one skip + skipped++; + if (!has_next_ || skipped >= count) goto done; + } + + while (skipped < count && has_next_) { + skipped++; + GorillaRawOps::read_next(r, stored_value_, stored_leading_zeros_, + stored_trailing_zeros_); + if (UNLIKELY(r.exhausted)) { + ret = common::E_BUF_NOT_ENOUGH; + goto done; + } + if (stored_value_ == ending) { + has_next_ = false; + } + } + + done: + buffer_ = r.cur_byte; + bits_left_ = r.bits; + in.wrapped_buf_advance_read_pos(r.pos); + return ret; + } + + int batch_decode_fallback(T* out, int capacity, int& actual, T ending, + common::ByteStream& in) { + actual = 0; + while (actual < capacity && has_remaining(in)) { + out[actual++] = decode(in); + } + return common::E_OK; + } + + int batch_skip_fallback(int count, int& skipped, T ending, + common::ByteStream& in) { + skipped = 0; + while (skipped < count && has_remaining(in)) { + decode(in); + skipped++; + } + return common::E_OK; + } + public: common::TSEncoding type_; T stored_value_; @@ -254,18 +602,18 @@ FORCE_INLINE int64_t GorillaDecoder::decode(common::ByteStream& in) { class FloatGorillaDecoder : public GorillaDecoder { public: - int read_boolean(bool& ret_value, common::ByteStream& in); - int read_int32(int32_t& ret_value, common::ByteStream& in); - int read_int64(int64_t& ret_value, common::ByteStream& in); - int read_float(float& ret_value, common::ByteStream& in); - int read_double(double& ret_value, common::ByteStream& in); + int read_boolean(bool& ret_value, common::ByteStream& in) override; + int read_int32(int32_t& ret_value, common::ByteStream& in) override; + int read_int64(int64_t& ret_value, common::ByteStream& in) override; + int read_float(float& ret_value, common::ByteStream& in) override; + int read_double(double& ret_value, common::ByteStream& in) override; float decode(common::ByteStream& in) { int32_t value_int = GorillaDecoder::decode(in); return common::int_to_float(value_int); } - int32_t cache_next(common::ByteStream& in) { + int32_t cache_next(common::ByteStream& in) override { read_next(in); if (stored_value_ == common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT)) { @@ -273,22 +621,46 @@ class FloatGorillaDecoder : public GorillaDecoder { } return stored_value_; } + + int read_batch_float(float* out, int capacity, int& actual, + common::ByteStream& in) override { + int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT); + actual = 0; + while (actual < capacity && has_remaining(in)) { + int32_t buf[129]; + int batch = std::min(129, capacity - actual); + int buf_actual = 0; + int ret = batch_decode_raw(buf, batch, buf_actual, ending, in); + if (ret != common::E_OK) return ret; + if (buf_actual == 0) break; + for (int i = 0; i < buf_actual; i++) { + out[actual + i] = common::int_to_float(buf[i]); + } + actual += buf_actual; + } + return common::E_OK; + } + + int skip_float(int count, int& skipped, common::ByteStream& in) override { + int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT); + return batch_skip_raw(count, skipped, ending, in); + } }; class DoubleGorillaDecoder : public GorillaDecoder { public: - int read_boolean(bool& ret_value, common::ByteStream& in); - int read_int32(int32_t& ret_value, common::ByteStream& in); - int read_int64(int64_t& ret_value, common::ByteStream& in); - int read_float(float& ret_value, common::ByteStream& in); - int read_double(double& ret_value, common::ByteStream& in); + int read_boolean(bool& ret_value, common::ByteStream& in) override; + int read_int32(int32_t& ret_value, common::ByteStream& in) override; + int read_int64(int64_t& ret_value, common::ByteStream& in) override; + int read_float(float& ret_value, common::ByteStream& in) override; + int read_double(double& ret_value, common::ByteStream& in) override; double decode(common::ByteStream& in) { int64_t value_long = GorillaDecoder::decode(in); return common::long_to_double(value_long); } - int64_t cache_next(common::ByteStream& in) { + int64_t cache_next(common::ByteStream& in) override { read_next(in); if (stored_value_ == common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE)) { @@ -296,12 +668,88 @@ class DoubleGorillaDecoder : public GorillaDecoder { } return stored_value_; } + + int read_batch_double(double* out, int capacity, int& actual, + common::ByteStream& in) override { + int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE); + actual = 0; + while (actual < capacity && has_remaining(in)) { + int64_t buf[129]; + int batch = std::min(129, capacity - actual); + int buf_actual = 0; + int ret = batch_decode_raw(buf, batch, buf_actual, ending, in); + if (ret != common::E_OK) return ret; + if (buf_actual == 0) break; + for (int i = 0; i < buf_actual; i++) { + out[actual + i] = common::long_to_double(buf[i]); + } + actual += buf_actual; + } + return common::E_OK; + } + + int skip_double(int count, int& skipped, common::ByteStream& in) override { + int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE); + return batch_skip_raw(count, skipped, ending, in); + } }; typedef GorillaDecoder IntGorillaDecoder; typedef GorillaDecoder LongGorillaDecoder; -// wrap as Decoder interface +// ── IntGorillaDecoder batch/skip overrides ───────────────────────────────── +template <> +inline int GorillaDecoder::read_batch_int32(int32_t* out, int capacity, + int& actual, + common::ByteStream& in) { + return batch_decode_raw(out, capacity, actual, + GORILLA_ENCODING_ENDING_INTEGER, in); +} +template <> +inline int GorillaDecoder::read_batch_int64(int64_t*, int, int& actual, + common::ByteStream&) { + actual = 0; + return common::E_NOT_SUPPORT; +} +template <> +inline int GorillaDecoder::skip_int32(int count, int& skipped, + common::ByteStream& in) { + return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_INTEGER, in); +} +template <> +inline int GorillaDecoder::skip_int64(int, int& skipped, + common::ByteStream&) { + skipped = 0; + return common::E_NOT_SUPPORT; +} + +// ── LongGorillaDecoder batch/skip overrides ─────────────────────────────── +template <> +inline int GorillaDecoder::read_batch_int32(int32_t*, int, int& actual, + common::ByteStream&) { + actual = 0; + return common::E_NOT_SUPPORT; +} +template <> +inline int GorillaDecoder::read_batch_int64(int64_t* out, int capacity, + int& actual, + common::ByteStream& in) { + return batch_decode_raw(out, capacity, actual, GORILLA_ENCODING_ENDING_LONG, + in); +} +template <> +inline int GorillaDecoder::skip_int32(int, int& skipped, + common::ByteStream&) { + skipped = 0; + return common::E_NOT_SUPPORT; +} +template <> +inline int GorillaDecoder::skip_int64(int count, int& skipped, + common::ByteStream& in) { + return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_LONG, in); +} + +// ── Scalar Decoder interface wrappers (unchanged) ───────────────────────── template <> FORCE_INLINE int IntGorillaDecoder::read_boolean(bool& ret_value, common::ByteStream& in) { diff --git a/cpp/src/encoding/plain_decoder.h b/cpp/src/encoding/plain_decoder.h index c2627f71d..3e83cfc76 100644 --- a/cpp/src/encoding/plain_decoder.h +++ b/cpp/src/encoding/plain_decoder.h @@ -20,10 +20,47 @@ #ifndef ENCODING_PLAIN_DECODER_H #define ENCODING_PLAIN_DECODER_H +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#endif + #include "encoding/decoder.h" namespace storage { +FORCE_INLINE uint32_t plain_bswap32(uint32_t v) { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) + return _byteswap_ulong(v); +#else + return ((v & 0x000000FFu) << 24) | ((v & 0x0000FF00u) << 8) | + ((v & 0x00FF0000u) >> 8) | ((v & 0xFF000000u) >> 24); +#endif +} + +FORCE_INLINE uint64_t plain_bswap64(uint64_t v) { +#if defined(__GNUC__) || defined(__clang__) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) + return _byteswap_uint64(v); +#else + return ((v & 0x00000000000000FFull) << 56) | + ((v & 0x000000000000FF00ull) << 40) | + ((v & 0x0000000000FF0000ull) << 24) | + ((v & 0x00000000FF000000ull) << 8) | + ((v & 0x000000FF00000000ull) >> 8) | + ((v & 0x0000FF0000000000ull) >> 24) | + ((v & 0x00FF000000000000ull) >> 40) | + ((v & 0xFF00000000000000ull) >> 56); +#endif +} + class PlainDecoder : public Decoder { public: ~PlainDecoder() override = default; @@ -62,6 +99,113 @@ class PlainDecoder : public Decoder { common::ByteStream& in) override { return common::SerializationUtil::read_mystring(ret_String, &pa, in); } + + // ── Batch overrides ────────────────────────────────────────────────────── + // + // INT32: PLAIN encoding uses varint (variable stride). Override to avoid + // virtual dispatch per element; actual decode is still per-value. + int read_batch_int32(int32_t* out, int capacity, int& actual, + common::ByteStream& in) override { + actual = 0; + while (actual < capacity && in.has_remaining()) { + int ret = common::SerializationUtil::read_var_int(out[actual], in); + if (ret != common::E_OK) return ret; + ++actual; + } + return common::E_OK; + } + + int skip_int32(int count, int& skipped, common::ByteStream& in) override { + skipped = 0; + int32_t dummy; + while (skipped < count && in.has_remaining()) { + int ret = common::SerializationUtil::read_var_int(dummy, in); + if (ret != common::E_OK) { + return ret; + } + ++skipped; + } + return common::E_OK; + } + + // Fixed-stride INT64 / FLOAT / DOUBLE share the same shape: when the + // ByteStream is wrapped (contiguous buf), advance the read pointer in one + // step and byte-swap in place; otherwise fall back to per-value reads. + // The macros below expand into one override per type. +#define PLAIN_SKIP_FIXED(NAME, T, STRIDE, READ_ONE) \ + int NAME(int count, int& skipped, common::ByteStream& in) override { \ + skipped = 0; \ + if (!in.is_wrapped()) { \ + T dummy; \ + while (skipped < count && in.has_remaining()) { \ + int ret = READ_ONE(dummy, in); \ + if (ret != common::E_OK) { \ + return ret; \ + } \ + ++skipped; \ + } \ + return common::E_OK; \ + } \ + skipped = static_cast(std::min( \ + in.remaining_size() / (STRIDE), static_cast(count))); \ + if (skipped <= 0) { \ + skipped = 0; \ + return common::E_OK; \ + } \ + in.wrapped_buf_advance_read_pos(static_cast(skipped) * \ + (STRIDE)); \ + return common::E_OK; \ + } + +#define PLAIN_READ_BATCH_FIXED(NAME, T, U, STRIDE, READ_ONE, BSWAP) \ + int NAME(T* out, int capacity, int& actual, common::ByteStream& in) \ + override { \ + actual = 0; \ + if (!in.is_wrapped()) { \ + while (actual < capacity && in.has_remaining()) { \ + int ret = READ_ONE(out[actual], in); \ + if (ret != common::E_OK) { \ + return ret; \ + } \ + ++actual; \ + } \ + return common::E_OK; \ + } \ + int n = static_cast(std::min( \ + in.remaining_size() / (STRIDE), static_cast(capacity))); \ + if (n <= 0) { \ + return common::E_OK; \ + } \ + const uint8_t* src = \ + (const uint8_t*)in.get_wrapped_buf() + in.read_pos(); \ + in.wrapped_buf_advance_read_pos(static_cast(n) * (STRIDE)); \ + actual = n; \ + for (int i = 0; i < n; ++i) { \ + U v; \ + memcpy(&v, src + i * (STRIDE), (STRIDE)); \ + v = BSWAP(v); \ + memcpy(&out[i], &v, (STRIDE)); \ + } \ + return common::E_OK; \ + } + + PLAIN_SKIP_FIXED(skip_int64, int64_t, 8, + common::SerializationUtil::read_i64) + PLAIN_SKIP_FIXED(skip_float, float, 4, + common::SerializationUtil::read_float) + PLAIN_SKIP_FIXED(skip_double, double, 8, + common::SerializationUtil::read_double) + + PLAIN_READ_BATCH_FIXED(read_batch_int64, int64_t, uint64_t, 8, + common::SerializationUtil::read_i64, plain_bswap64) + PLAIN_READ_BATCH_FIXED(read_batch_float, float, uint32_t, 4, + common::SerializationUtil::read_float, plain_bswap32) + PLAIN_READ_BATCH_FIXED(read_batch_double, double, uint64_t, 8, + common::SerializationUtil::read_double, + plain_bswap64) + +#undef PLAIN_SKIP_FIXED +#undef PLAIN_READ_BATCH_FIXED }; } // end namespace storage diff --git a/cpp/src/encoding/plain_encoder.h b/cpp/src/encoding/plain_encoder.h index b768c9bf0..84ebee238 100644 --- a/cpp/src/encoding/plain_encoder.h +++ b/cpp/src/encoding/plain_encoder.h @@ -20,50 +20,221 @@ #ifndef ENCODING_PLAIN_ENCODER_H #define ENCODING_PLAIN_ENCODER_H +#include + #include "encoder.h" +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#include +#define TSFILE_HAS_NEON 1 +#endif + namespace storage { class PlainEncoder : public Encoder { public: PlainEncoder() {} ~PlainEncoder() { destroy(); } - void destroy() { /* do nothing for PlainEncoder */ + void destroy() override { /* do nothing for PlainEncoder */ } - void reset() { /* do thing for PlainEncoder */ + void reset() override { /* do thing for PlainEncoder */ } - FORCE_INLINE int encode(bool value, common::ByteStream& out_stream) { + FORCE_INLINE int encode(bool value, + common::ByteStream& out_stream) override { return common::SerializationUtil::write_i8(value ? 1 : 0, out_stream); } - FORCE_INLINE int encode(int32_t value, common::ByteStream& out_stream) { + FORCE_INLINE int encode(int32_t value, + common::ByteStream& out_stream) override { return common::SerializationUtil::write_var_int(value, out_stream); } - FORCE_INLINE int encode(int64_t value, common::ByteStream& out_stream) { + FORCE_INLINE int encode(int64_t value, + common::ByteStream& out_stream) override { return common::SerializationUtil::write_i64(value, out_stream); } - FORCE_INLINE int encode(float value, common::ByteStream& out_stream) { + FORCE_INLINE int encode(float value, + common::ByteStream& out_stream) override { return common::SerializationUtil::write_float(value, out_stream); } - FORCE_INLINE int encode(double value, common::ByteStream& out_stream) { + FORCE_INLINE int encode(double value, + common::ByteStream& out_stream) override { return common::SerializationUtil::write_double(value, out_stream); } FORCE_INLINE int encode(common::String value, - common::ByteStream& out_stream) { + common::ByteStream& out_stream) override { return common::SerializationUtil::write_mystring(value, out_stream); } - int flush(common::ByteStream& out_stream) { + int flush(common::ByteStream& out_stream) override { // do nothing for PlainEncoder return common::E_OK; } - int get_max_byte_size() { return 0; } + int get_max_byte_size() override { return 0; } + + // Optimized batch encoding: directly byte-swap into ByteStream page buffer. + // Avoids per-value write_buf overhead entirely — only calls acquire_buf() + // once per page boundary crossing. + int encode_batch(const int64_t* values, uint32_t count, + common::ByteStream& out_stream) override { + if (count == 0) return common::E_OK; + uint32_t offset = 0; + while (offset < count) { + common::ByteStream::Buffer buf = out_stream.acquire_buf(); + if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM; + // How many int64 values fit in the remaining page space? + uint32_t capacity = buf.len_ / 8; + if (capacity == 0) { + // Page has < 8 bytes left, fall back to write_buf for this one + return Encoder::encode_batch(values + offset, count - offset, + out_stream); + } + uint32_t batch = std::min(count - offset, capacity); + uint8_t* dst = (uint8_t*)buf.buf_; + const int64_t* src = values + offset; + uint32_t i = 0; +#if TSFILE_HAS_NEON + // NEON: byte-reverse 2 x int64 per iteration + for (; i + 2 <= batch; i += 2) { + uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]); + v = vrev64q_u8(v); + vst1q_u8(dst, v); + dst += 16; + } +#endif + // Scalar tail + for (; i < batch; i++) { + uint64_t v = (uint64_t)src[i]; + dst[0] = (uint8_t)(v >> 56); + dst[1] = (uint8_t)(v >> 48); + dst[2] = (uint8_t)(v >> 40); + dst[3] = (uint8_t)(v >> 32); + dst[4] = (uint8_t)(v >> 24); + dst[5] = (uint8_t)(v >> 16); + dst[6] = (uint8_t)(v >> 8); + dst[7] = (uint8_t)(v); + dst += 8; + } + out_stream.buffer_used(batch * 8); + offset += batch; + } + return common::E_OK; + } + + int encode_batch(const double* values, uint32_t count, + common::ByteStream& out_stream) override { + if (count == 0) return common::E_OK; + uint32_t offset = 0; + while (offset < count) { + common::ByteStream::Buffer buf = out_stream.acquire_buf(); + if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM; + uint32_t capacity = buf.len_ / 8; + if (capacity == 0) { + return Encoder::encode_batch(values + offset, count - offset, + out_stream); + } + uint32_t batch = std::min(count - offset, capacity); + uint8_t* dst = (uint8_t*)buf.buf_; + const double* src = values + offset; + uint32_t i = 0; +#if TSFILE_HAS_NEON + // NEON byte-reverse of raw bytes works for double bits too. + for (; i + 2 <= batch; i += 2) { + uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]); + v = vrev64q_u8(v); + vst1q_u8(dst, v); + dst += 16; + } +#endif + // Scalar tail: round-trip the bits via memcpy to avoid the + // strict-aliasing violation of reading a double through an + // int64_t* (the old reinterpret_cast dispatch). + for (; i < batch; i++) { + uint64_t v; + memcpy(&v, &src[i], sizeof(double)); + dst[0] = (uint8_t)(v >> 56); + dst[1] = (uint8_t)(v >> 48); + dst[2] = (uint8_t)(v >> 40); + dst[3] = (uint8_t)(v >> 32); + dst[4] = (uint8_t)(v >> 24); + dst[5] = (uint8_t)(v >> 16); + dst[6] = (uint8_t)(v >> 8); + dst[7] = (uint8_t)(v); + dst += 8; + } + out_stream.buffer_used(batch * 8); + offset += batch; + } + return common::E_OK; + } + + int encode_batch(const float* values, uint32_t count, + common::ByteStream& out_stream) override { + if (count == 0) return common::E_OK; + uint32_t offset = 0; + while (offset < count) { + common::ByteStream::Buffer buf = out_stream.acquire_buf(); + if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM; + uint32_t capacity = buf.len_ / 4; + if (capacity == 0) { + return Encoder::encode_batch(values + offset, count - offset, + out_stream); + } + uint32_t batch = std::min(count - offset, capacity); + uint8_t* dst = (uint8_t*)buf.buf_; + const float* src = values + offset; + uint32_t i = 0; +#if TSFILE_HAS_NEON + // NEON: byte-reverse 4 x float (32-bit) per iteration + for (; i + 4 <= batch; i += 4) { + uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]); + v = vrev32q_u8(v); + vst1q_u8(dst, v); + dst += 16; + } +#endif + for (; i < batch; i++) { + uint32_t v; + memcpy(&v, &src[i], sizeof(float)); + dst[0] = (uint8_t)(v >> 24); + dst[1] = (uint8_t)(v >> 16); + dst[2] = (uint8_t)(v >> 8); + dst[3] = (uint8_t)(v); + dst += 4; + } + out_stream.buffer_used(batch * 4); + offset += batch; + } + return common::E_OK; + } + + // Batch encode strings from Arrow-style offset+buffer layout. + // Each string is serialized as: var_int(len) + raw bytes. + int encode_string_batch(const char* buffer, const uint32_t* offsets, + uint32_t start_idx, uint32_t count, + common::ByteStream& out_stream) override { + int ret = common::E_OK; + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = start_idx + i; + uint32_t len = offsets[idx + 1] - offsets[idx]; + if (RET_FAIL(common::SerializationUtil::write_var_int( + (int32_t)len, out_stream))) { + return ret; + } + if (len > 0) { + if (RET_FAIL( + out_stream.write_buf(buffer + offsets[idx], len))) { + return ret; + } + } + } + return ret; + } }; } // end namespace storage diff --git a/cpp/src/encoding/ts2diff_decoder.h b/cpp/src/encoding/ts2diff_decoder.h index f37001003..bc6e89613 100644 --- a/cpp/src/encoding/ts2diff_decoder.h +++ b/cpp/src/encoding/ts2diff_decoder.h @@ -24,6 +24,7 @@ #include #include +#include #include #include "common/allocator/alloc_base.h" @@ -31,8 +32,174 @@ #include "decoder.h" #include "utils/util_define.h" +#ifdef ENABLE_SIMD +#include "simde/x86/avx2.h" +#endif + namespace storage { +// ============================================================================ +// SIMD batch decode helpers (INT32) +// ============================================================================ +#ifdef ENABLE_SIMD + +// Decode 4 INT32 values from bit-packed data using SIMD gather + shift. +// @in: pointer to the start of packed bit data for the block +// @bit_width: bits per delta value +// @delta_min: minimum delta offset for this block +// @index: current position within the block (0-based, among write_index_ +// deltas) +// @base: the previous reconstructed value (for prefix-sum) +// @out: output array (4 values written) +// Returns: the last reconstructed value (new base for next group) +static inline int32_t simd_decode_4_i32(const uint8_t* in, int32_t bit_width, + int32_t delta_min, int32_t index, + int32_t base, int32_t out[4]) { + static const simde__m128i SHUF_REV4 = simde_mm_setr_epi8( + 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); + + const simde__m128i VMIN4 = simde_mm_set1_epi32(delta_min); + + int32_t pos0 = index * bit_width; + int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width, + pos0 + 3 * bit_width}; + int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3}; + int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7}; + + simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]); + simde__m128i OFF = simde_mm_setr_epi32(off[0], off[1], off[2], off[3]); + + simde__m128i V4; + + if (bit_width <= 16) { + int rshift = 32 - bit_width; + simde__m128i w32_le = simde_mm_i32gather_epi32((const int*)in, IDX, 1); + simde__m128i w32_be = simde_mm_shuffle_epi8(w32_le, SHUF_REV4); + simde__m128i U32 = simde_mm_sllv_epi32(w32_be, OFF); + simde__m128i RS32 = simde_mm_set1_epi32(rshift); + V4 = simde_mm_srlv_epi32(U32, RS32); + } else { + static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, + 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + int rshift = 64 - bit_width; + simde__m256i w64_le = + simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1); + simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8); + simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(OFF); + simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64); + simde__m256i V64 = + simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift)); + simde__m256i perm = simde_mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0); + simde__m256i comp = simde_mm256_permutevar8x32_epi32(V64, perm); + V4 = simde_mm256_castsi256_si128(comp); + } + + // Add delta_min + V4 = simde_mm_add_epi32(V4, VMIN4); + + // Prefix sum to reconstruct absolute values + simde__m128i t; + t = simde_mm_slli_si128(V4, 4); + V4 = simde_mm_add_epi32(V4, t); + t = simde_mm_slli_si128(V4, 8); + V4 = simde_mm_add_epi32(V4, t); + + // Add base + simde__m128i C4 = simde_mm_set1_epi32(base); + V4 = simde_mm_add_epi32(V4, C4); + + simde_mm_storeu_si128((simde__m128i*)out, V4); + return out[3]; +} + +// Decode 4 INT64 values from bit-packed data using SIMD. +static inline int64_t simd_decode_4_i64(const uint8_t* in, int32_t bit_width, + int64_t delta_min, int32_t index, + int64_t base, int64_t out[4]) { + static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8( + 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, + 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); + + const simde__m256i VMIN4 = simde_mm256_set1_epi64x(delta_min); + + int32_t pos0 = index * bit_width; + int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width, + pos0 + 3 * bit_width}; + int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3}; + int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7}; + + simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]); + + int rshift = 64 - bit_width; + simde__m256i w64_le = + simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1); + simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8); + simde__m256i OFF64 = simde_mm256_cvtepu32_epi64( + simde_mm_setr_epi32(off[0], off[1], off[2], off[3])); + simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64); + simde__m256i V64 = + simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift)); + + // Add delta_min + V64 = simde_mm256_add_epi64(V64, VMIN4); + + // Prefix sum (64-bit, 4 lanes) + simde__m256i t; + // shift by 8 bytes = 1 lane + t = simde_mm256_slli_si256(V64, 8); + V64 = simde_mm256_add_epi64(V64, t); + // cross-lane: add lane[1] to lane[2] and lane[3] + // Extract high 128 bits, add broadcast of element[1] to both elements + int64_t tmp_buf[4]; + simde_mm256_storeu_si256((simde__m256i*)tmp_buf, V64); + tmp_buf[2] += tmp_buf[1]; + tmp_buf[3] += tmp_buf[1]; + V64 = simde_mm256_loadu_si256((const simde__m256i*)tmp_buf); + + // Add base + simde__m256i C4 = simde_mm256_set1_epi64x(base); + V64 = simde_mm256_add_epi64(V64, C4); + + simde_mm256_storeu_si256((simde__m256i*)out, V64); + return out[3]; +} + +#endif // ENABLE_SIMD + +// ============================================================================ +// Scalar batch decode helpers +// ============================================================================ + +// Scalar: extract one value from bit-packed data. +// @data: pointer to packed bits (NOT advanced; caller handles position) +// @bit_pos: bit offset from start of data +// @bit_width: bits per value +static inline int64_t scalar_read_bits(const uint8_t* data, int32_t bit_pos, + int32_t bit_width) { + int64_t value = 0; + int bits = bit_width; + int byte_idx = bit_pos >> 3; + int bit_offset = bit_pos & 7; + int bits_avail = 8 - bit_offset; + + while (bits > 0) { + if (bits >= bits_avail) { + uint8_t d = data[byte_idx] & ((1 << bits_avail) - 1); + value = (value << bits_avail) | d; + bits -= bits_avail; + byte_idx++; + bits_avail = 8; + } else { + uint8_t d = + (data[byte_idx] >> (bits_avail - bits)) & ((1 << bits) - 1); + value = (value << bits) | d; + bits = 0; + } + } + return value; +} + namespace ts2diff_java_detail { // Java float/double TS_2DIFF overflow page markers. @@ -54,7 +221,7 @@ inline bool bitmap_marked(const std::vector& bm, int idx) { inline bool looks_like_ts2diff_header(common::ByteStream& in) { int ret = common::E_OK; - uint32_t probe_mark = in.read_pos(); + uint64_t probe_mark = in.read_pos(); int32_t write_index = 0; int32_t bit_width = 0; if (RET_FAIL(common::SerializationUtil::read_i32(write_index, in)) || @@ -82,7 +249,7 @@ inline int consume_float_double_ts2diff_prefix( underflow_bm.clear(); overflow_bm.clear(); segment_size = 0; - uint32_t mark = in.read_pos(); + uint64_t mark = in.read_pos(); uint32_t tag = 0; if (RET_FAIL(common::SerializationUtil::read_var_uint(tag, in))) { return ret; @@ -132,6 +299,9 @@ inline int consume_float_double_ts2diff_prefix( } // namespace ts2diff_java_detail +// ============================================================================ +// TS2DIFFDecoder template +// ============================================================================ template class TS2DIFFDecoder : public Decoder { public: @@ -148,12 +318,14 @@ class TS2DIFFDecoder : public Decoder { previous_value_ = 0; bit_width_ = 0; current_index_ = 0; + header_peeked_ = false; } FORCE_INLINE bool has_remaining(const common::ByteStream& buffer) override { if (buffer.has_remaining()) return true; - return bits_left_ != 0 || (current_index_ <= write_index_ && - write_index_ != -1 && current_index_ != 0); + return header_peeked_ || bits_left_ != 0 || + (current_index_ <= write_index_ && write_index_ != -1 && + current_index_ != 0); } void read_header(common::ByteStream& in) { @@ -208,6 +380,18 @@ class TS2DIFFDecoder : public Decoder { int read_String(common::String& ret_value, common::PageArena& pa, common::ByteStream& in) override; + int read_batch_int32(int32_t* out, int capacity, int& actual, + common::ByteStream& in) override; + int read_batch_int64(int64_t* out, int capacity, int& actual, + common::ByteStream& in) override; + int skip_int32(int count, int& skipped, common::ByteStream& in) override; + int skip_int64(int count, int& skipped, common::ByteStream& in) override; + + bool peek_next_block_range_int64(common::ByteStream& in, int64_t& block_min, + int64_t& block_max, + int& block_count) override; + int skip_peeked_block_int64(common::ByteStream& in, int& skipped) override; + public: T first_value_; T previous_value_; @@ -218,8 +402,13 @@ class TS2DIFFDecoder : public Decoder { int bit_width_; int write_index_; int current_index_; + bool header_peeked_; }; +// ============================================================================ +// Per-value decode (unchanged) +// ============================================================================ + template <> inline int32_t TS2DIFFDecoder::decode(common::ByteStream& in) { int32_t ret_value = stored_value_; @@ -274,6 +463,436 @@ inline int64_t TS2DIFFDecoder::decode(common::ByteStream& in) { return ret_value; } +// ============================================================================ +// Batch decode: INT32 +// Decodes one full block (up to 129 values) per call using SIMD when enabled. +// ============================================================================ + +template <> +inline int TS2DIFFDecoder::read_batch_int32(int32_t* out, int capacity, + int& actual, + common::ByteStream& in) { + actual = 0; + + while (actual < capacity && has_remaining(in)) { + // If we are mid-block (current_index_ != 0), finish it per-value. + if (current_index_ != 0) { + while (actual < capacity && current_index_ != 0 && + has_remaining(in)) { + out[actual++] = decode(in); + } + continue; + } + + // Start of a new block — read header + read_header(in); + common::SerializationUtil::read_i32(delta_min_, in); + common::SerializationUtil::read_i32(first_value_, in); + bits_left_ = 0; + buffer_ = 0; + + // Output first_value + if (actual >= capacity) { + // Must consume first_value next time; set state for per-value path + current_index_ = 0; + // We already consumed the header; push first_value as stored + // and let the next call to decode() handle it. + // Actually, we need to handle this: rewind is not possible. + // So we output first_value and accept going 1 over capacity. + } + out[actual++] = first_value_; + + if (write_index_ == 0) { + // Block has only first_value, no deltas + current_index_ = 0; + continue; + } + + int32_t remaining = write_index_; + if (actual + remaining > capacity) { + // Block won't fit in output. Fall back to per-value decode. + // Stream is at packed data start; bits_left_/buffer_ are reset. + current_index_ = 1; + continue; + } + if (!in.is_wrapped()) { + // SIMD/scalar block decode below requires a contiguous wrapped + // buffer. For a paged ByteStream, drop down to per-value + // decode the same way the doesn't-fit branch does. + current_index_ = 1; + continue; + } + + // Full block decode. Validate against corrupt headers before + // advancing the read position — a bogus bit_width_ or write_index_ + // could compute a block_bytes that overflows the int32_t multiply + // or runs past the wrapped buffer. + if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 32)) { + return common::E_TSFILE_CORRUPTED; + } + int64_t block_bytes_64 = + (static_cast(write_index_) * bit_width_ + 7) / 8; + if (UNLIKELY(block_bytes_64 > in.remaining_size())) { + return common::E_TSFILE_CORRUPTED; + } + int32_t block_bytes = static_cast(block_bytes_64); + const uint8_t* blk_ptr = + (const uint8_t*)in.get_wrapped_buf() + in.read_pos(); + in.wrapped_buf_advance_read_pos(static_cast(block_bytes)); + + int32_t prev = first_value_; + int32_t i = 0; + +#ifdef ENABLE_SIMD + // SIMD path: decode 8 values at a time (2 groups of 4) + for (; i + 7 < remaining; i += 8) { + int32_t need_bytes = ((i + 7) * bit_width_ + bit_width_ + 7) / 8 + + (bit_width_ > 16 ? 8 : 4); + if (need_bytes > block_bytes) break; + + int32_t grp_out[8]; + prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i, prev, + grp_out); + prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i + 4, + prev, grp_out + 4); + + memcpy(out + actual, grp_out, 8 * sizeof(int32_t)); + actual += 8; + } +#endif + + // Scalar tail + int32_t bit_pos = i * bit_width_; + for (; i < remaining; ++i) { + int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_); + bit_pos += bit_width_; + int32_t val = (int32_t)delta + prev + delta_min_; + prev = val; + out[actual++] = val; + } + + // Block done, reset state + first_value_ = prev; + current_index_ = 0; + } + + return common::E_OK; +} + +// ============================================================================ +// Batch decode: INT64 +// ============================================================================ + +template <> +inline int TS2DIFFDecoder::read_batch_int64(int64_t* out, int capacity, + int& actual, + common::ByteStream& in) { + actual = 0; + + while (actual < capacity && has_remaining(in)) { + // If mid-block, finish per-value + if (current_index_ != 0) { + while (actual < capacity && current_index_ != 0 && + has_remaining(in)) { + out[actual++] = decode(in); + } + continue; + } + + // Start of a new block + if (!header_peeked_) { + read_header(in); + common::SerializationUtil::read_i64(delta_min_, in); + common::SerializationUtil::read_i64(first_value_, in); + bits_left_ = 0; + buffer_ = 0; + } + header_peeked_ = false; + + out[actual++] = first_value_; + + if (write_index_ == 0) { + current_index_ = 0; + continue; + } + + int32_t remaining = write_index_; + if (actual + remaining > capacity) { + // Block won't fit in output. Fall back to per-value decode. + // Stream is at packed data start; bits_left_/buffer_ are reset. + current_index_ = 1; + continue; + } + if (!in.is_wrapped()) { + // SIMD/scalar block decode below requires a contiguous wrapped + // buffer. Page-backed ByteStreams must use the per-value path. + current_index_ = 1; + continue; + } + + // Validate against corrupt headers (see int32 path). + if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 64)) { + return common::E_TSFILE_CORRUPTED; + } + int64_t block_bytes_64 = + (static_cast(write_index_) * bit_width_ + 7) / 8; + if (UNLIKELY(block_bytes_64 > in.remaining_size())) { + return common::E_TSFILE_CORRUPTED; + } + int32_t block_bytes = static_cast(block_bytes_64); + // Direct pointer into the wrapped ByteStream buffer. + const uint8_t* blk_ptr = + (const uint8_t*)in.get_wrapped_buf() + in.read_pos(); + in.wrapped_buf_advance_read_pos(static_cast(block_bytes)); + + int64_t prev = first_value_; + int32_t i = 0; + +#ifdef ENABLE_SIMD + // SIMD path: decode 4 INT64 values at a time + for (; i + 3 < remaining; i += 4) { + int32_t need_bytes = + ((i + 3) * bit_width_ + bit_width_ + 7) / 8 + 8; + if (need_bytes > block_bytes) break; + + int64_t grp_out[4]; + prev = simd_decode_4_i64(blk_ptr, bit_width_, delta_min_, i, prev, + grp_out); + memcpy(out + actual, grp_out, 4 * sizeof(int64_t)); + actual += 4; + } +#endif + + // Scalar tail + int32_t bit_pos = i * bit_width_; + for (; i < remaining; ++i) { + int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_); + bit_pos += bit_width_; + int64_t val = delta + prev + delta_min_; + prev = val; + out[actual++] = val; + } + + first_value_ = prev; + current_index_ = 0; + } + + return common::E_OK; +} + +// ============================================================================ +// Skip: INT32 — read header only, jump over packed data +// ============================================================================ + +template <> +inline int TS2DIFFDecoder::skip_int32(int count, int& skipped, + common::ByteStream& in) { + skipped = 0; + + // If mid-block, finish current block per-value + while (skipped < count && current_index_ != 0 && has_remaining(in)) { + decode(in); + ++skipped; + } + + while (skipped < count && has_remaining(in)) { + int32_t wi, bw, dm, fv; + common::SerializationUtil::read_i32(wi, in); + common::SerializationUtil::read_i32(bw, in); + common::SerializationUtil::read_i32(dm, in); + common::SerializationUtil::read_i32(fv, in); + + int32_t block_vals = wi + 1; + bits_left_ = 0; + buffer_ = 0; + + if (count - skipped >= block_vals) { + // Whole-block fast path: jump over packed body. + int32_t skip_bytes = (wi * bw + 7) / 8; + in.wrapped_buf_advance_read_pos(skip_bytes); + skipped += block_vals; + current_index_ = 0; + write_index_ = -1; + } else { + // Partial block: reinstate decoder state as if we'd just + // emitted first_value_ from decode(), bump skipped by 1, + // then per-value decode the remaining count, leaving the + // rest of the block intact for the next decode() call. + write_index_ = wi; + bit_width_ = bw; + delta_min_ = dm; + first_value_ = fv; + current_index_ = (wi == 0) ? 0 : 1; + ++skipped; + while (skipped < count && current_index_ != 0 && + has_remaining(in)) { + decode(in); + ++skipped; + } + } + } + + return common::E_OK; +} + +// ============================================================================ +// Skip: INT64 +// ============================================================================ + +template <> +inline int TS2DIFFDecoder::skip_int64(int count, int& skipped, + common::ByteStream& in) { + skipped = 0; + + while (skipped < count && current_index_ != 0 && has_remaining(in)) { + decode(in); + ++skipped; + } + + while (skipped < count && has_remaining(in)) { + int32_t wi, bw; + int64_t dm, fv; + common::SerializationUtil::read_i32(wi, in); + common::SerializationUtil::read_i32(bw, in); + common::SerializationUtil::read_i64(dm, in); + common::SerializationUtil::read_i64(fv, in); + + int32_t block_vals = wi + 1; + bits_left_ = 0; + buffer_ = 0; + + if (count - skipped >= block_vals) { + int32_t skip_bytes = (wi * bw + 7) / 8; + in.wrapped_buf_advance_read_pos(skip_bytes); + skipped += block_vals; + current_index_ = 0; + write_index_ = -1; + } else { + write_index_ = wi; + bit_width_ = bw; + delta_min_ = dm; + first_value_ = fv; + current_index_ = (wi == 0) ? 0 : 1; + ++skipped; + while (skipped < count && current_index_ != 0 && + has_remaining(in)) { + decode(in); + ++skipped; + } + } + } + + return common::E_OK; +} + +// ============================================================================ +// Block-level filter check: peek header and compute value range +// ============================================================================ + +template <> +inline bool TS2DIFFDecoder::peek_next_block_range_int64( + common::ByteStream& in, int64_t& block_min, int64_t& block_max, + int& block_count) { + if (current_index_ != 0 || !has_remaining(in)) return false; + + read_header(in); + common::SerializationUtil::read_i64(delta_min_, in); + common::SerializationUtil::read_i64(first_value_, in); + bits_left_ = 0; + buffer_ = 0; + + block_min = first_value_; + block_count = write_index_ + 1; + + // Look-ahead: since timestamps are monotonically increasing, the true + // block_max is the last timestamp, which equals next block's first_value_. + // The next block header starts at read_pos + packed_bytes. first_value_ is + // at offset 16 within the header + // (write_index_(4)+bit_width_(4)+delta_min_(8)). We read it via raw pointer + // so the stream position is not consumed. + int32_t packed_bytes = (write_index_ * bit_width_ + 7) / 8; + if (in.remaining_size() >= (uint32_t)packed_bytes + 24) { + char* next_fv_ptr = + in.get_wrapped_buf() + in.read_pos() + packed_bytes + 16; + block_max = (int64_t)common::SerializationUtil::read_ui64(next_fv_ptr); + } else { + // Last block in page: fall back to conservative estimate. + if (write_index_ == 0 || bit_width_ == 0) { + block_max = first_value_ + (int64_t)write_index_ * delta_min_; + } else if (bit_width_ >= 63) { + block_max = INT64_MAX; + } else { + int64_t max_delta = delta_min_ + ((1LL << bit_width_) - 1); + block_max = first_value_ + (int64_t)write_index_ * max_delta; + } + } + + header_peeked_ = true; + return true; +} + +template <> +inline int TS2DIFFDecoder::skip_peeked_block_int64( + common::ByteStream& in, int& skipped) { + skipped = write_index_ + 1; + int32_t skip_bytes = (write_index_ * bit_width_ + 7) / 8; + in.wrapped_buf_advance_read_pos(skip_bytes); + header_peeked_ = false; + bits_left_ = 0; + buffer_ = 0; + current_index_ = 0; + write_index_ = -1; + return common::E_OK; +} + +// INT32 specialization: not applicable (timestamps are always INT64) +template <> +inline bool TS2DIFFDecoder::peek_next_block_range_int64( + common::ByteStream& in, int64_t& block_min, int64_t& block_max, + int& block_count) { + return false; +} + +template <> +inline int TS2DIFFDecoder::skip_peeked_block_int64( + common::ByteStream& in, int& skipped) { + return common::E_NOT_SUPPORT; +} + +// ============================================================================ +// Default (unsupported type) batch/skip — fall back to base class +// ============================================================================ + +template <> +inline int TS2DIFFDecoder::read_batch_int64(int64_t* out, int capacity, + int& actual, + common::ByteStream& in) { + return Decoder::read_batch_int64(out, capacity, actual, in); +} + +template <> +inline int TS2DIFFDecoder::skip_int64(int count, int& skipped, + common::ByteStream& in) { + return Decoder::skip_int64(count, skipped, in); +} + +template <> +inline int TS2DIFFDecoder::read_batch_int32(int32_t* out, int capacity, + int& actual, + common::ByteStream& in) { + return Decoder::read_batch_int32(out, capacity, actual, in); +} + +template <> +inline int TS2DIFFDecoder::skip_int32(int count, int& skipped, + common::ByteStream& in) { + return Decoder::skip_int32(count, skipped, in); +} + +// ============================================================================ +// Float / Double wrapper decoders (unchanged) +// ============================================================================ + class FloatTS2DIFFDecoder : public TS2DIFFDecoder { public: FloatTS2DIFFDecoder() = default; @@ -282,11 +901,24 @@ class FloatTS2DIFFDecoder : public TS2DIFFDecoder { return common::int_to_float(value_int); } - int read_boolean(bool& ret_value, common::ByteStream& in); - int read_int32(int32_t& ret_value, common::ByteStream& in); - int read_int64(int64_t& ret_value, common::ByteStream& in); - int read_float(float& ret_value, common::ByteStream& in); - int read_double(double& ret_value, common::ByteStream& in); + int read_boolean(bool& ret_value, common::ByteStream& in) override; + int read_int32(int32_t& ret_value, common::ByteStream& in) override; + int read_int64(int64_t& ret_value, common::ByteStream& in) override; + int read_float(float& ret_value, common::ByteStream& in) override; + int read_double(double& ret_value, common::ByteStream& in) override; + + int read_batch_float(float* out, int capacity, int& actual, + common::ByteStream& in) override { + // Reuse SIMD batch decode for int32, then bit-cast to float + int32_t* buf = reinterpret_cast(out); + int ret = TS2DIFFDecoder::read_batch_int32(buf, capacity, + actual, in); + if (ret != common::E_OK) return ret; + for (int i = 0; i < actual; ++i) { + out[i] = common::int_to_float(buf[i]); + } + return common::E_OK; + } private: bool is_legacy_raw_{false}; @@ -306,11 +938,24 @@ class DoubleTS2DIFFDecoder : public TS2DIFFDecoder { return common::long_to_double(value_long); } - int read_boolean(bool& ret_value, common::ByteStream& in); - int read_int32(int32_t& ret_value, common::ByteStream& in); - int read_int64(int64_t& ret_value, common::ByteStream& in); - int read_float(float& ret_value, common::ByteStream& in); - int read_double(double& ret_value, common::ByteStream& in); + int read_boolean(bool& ret_value, common::ByteStream& in) override; + int read_int32(int32_t& ret_value, common::ByteStream& in) override; + int read_int64(int64_t& ret_value, common::ByteStream& in) override; + int read_float(float& ret_value, common::ByteStream& in) override; + int read_double(double& ret_value, common::ByteStream& in) override; + + int read_batch_double(double* out, int capacity, int& actual, + common::ByteStream& in) override { + // Reuse SIMD batch decode for int64, then bit-cast to double + int64_t* buf = reinterpret_cast(out); + int ret = TS2DIFFDecoder::read_batch_int64(buf, capacity, + actual, in); + if (ret != common::E_OK) return ret; + for (int i = 0; i < actual; ++i) { + out[i] = common::long_to_double(buf[i]); + } + return common::E_OK; + } private: bool is_legacy_raw_{false}; diff --git a/cpp/src/encoding/ts2diff_encoder.h b/cpp/src/encoding/ts2diff_encoder.h index d1ab43bfd..fc494581a 100644 --- a/cpp/src/encoding/ts2diff_encoder.h +++ b/cpp/src/encoding/ts2diff_encoder.h @@ -29,12 +29,9 @@ #include "common/allocator/alloc_base.h" #include "common/allocator/byte_stream.h" #include "encoder.h" -#if defined(__SSE4_2__) -#include -#define USE_SSE 1 -#elif defined(__AVX2__) -#include -#define USE_AVX2 1 + +#ifdef ENABLE_SIMD +#include "simde/x86/avx2.h" #endif namespace storage { @@ -44,15 +41,16 @@ struct SIMDOps; template <> struct SIMDOps { -#ifdef USE_SSE +#ifdef ENABLE_SIMD static void rebase(int32_t* arr, int32_t min_val, size_t size) { - const __m128i min_vec = _mm_set1_epi32(min_val); + const simde__m128i min_vec = simde_mm_set1_epi32(min_val); size_t i = 0; for (; i + 3 < size; i += 4) { - __m128i vec = - _mm_loadu_si128(reinterpret_cast(arr + i)); - vec = _mm_sub_epi32(vec, min_vec); - _mm_storeu_si128(reinterpret_cast<__m128i*>(arr + i), vec); + simde__m128i vec = simde_mm_loadu_si128( + reinterpret_cast(arr + i)); + vec = simde_mm_sub_epi32(vec, min_vec); + simde_mm_storeu_si128(reinterpret_cast(arr + i), + vec); } for (; i < size; ++i) { arr[i] -= min_val; @@ -69,15 +67,16 @@ struct SIMDOps { template <> struct SIMDOps { -#ifdef USE_AVX2 +#ifdef ENABLE_SIMD static void rebase(int64_t* arr, int64_t min_val, size_t size) { - const __m256i min_vec = _mm256_set1_epi64x(min_val); + const simde__m256i min_vec = simde_mm256_set1_epi64x(min_val); size_t i = 0; for (; i + 3 < size; i += 4) { - __m256i vec = - _mm256_loadu_si256(reinterpret_cast(arr + i)); - vec = _mm256_sub_epi64(vec, min_vec); - _mm256_storeu_si256(reinterpret_cast<__m256i*>(arr + i), vec); + simde__m256i vec = simde_mm256_loadu_si256( + reinterpret_cast(arr + i)); + vec = simde_mm256_sub_epi64(vec, min_vec); + simde_mm256_storeu_si256(reinterpret_cast(arr + i), + vec); } for (; i < size; ++i) { arr[i] -= min_val; @@ -99,7 +98,7 @@ class TS2DIFFEncoder : public Encoder { ~TS2DIFFEncoder() { destroy(); } - void reset() { write_index_ = -1; } + void reset() override { write_index_ = -1; } void init() { block_size_ = 128; @@ -115,7 +114,7 @@ class TS2DIFFEncoder : public Encoder { previous_value_ = 0; } - void destroy() { + void destroy() override { if (delta_arr_ != nullptr) { common::mem_free(delta_arr_); delta_arr_ = nullptr; @@ -167,17 +166,71 @@ class TS2DIFFEncoder : public Encoder { return bit_width; } + // Batch bit-pack `count` values (each `bit_width` bits, MSB-first within + // byte) into a single contiguous buffer and write it to out_stream in one + // call. Avoids the per-byte write_buf overhead of the scalar write_bits + // loop. + // + // Result codes: + // E_OK → written successfully. + // -1 → caller must fall back to write_bits + flush_remaining because + // bit_width exceeds the safe accumulator width. + // any other non-zero value → real write_buf error; the caller must + // propagate it instead of treating the flush as successful. + template + static int pack_bits_msb(const U* values, int count, int bit_width, + common::ByteStream& out_stream) { + if (count <= 0 || bit_width <= 0) return common::E_OK; + if (bit_width > 56) return -1; // fall back + + size_t total_bytes = ((size_t)count * (size_t)bit_width + 7) / 8; + std::vector buf(total_bytes, 0); + + uint64_t accum = 0; + int bits_in_accum = 0; + size_t pos = 0; + const uint64_t mask = (1ULL << bit_width) - 1; + + for (int i = 0; i < count; i++) { + uint64_t v = static_cast(values[i]) & mask; + accum = (accum << bit_width) | v; + bits_in_accum += bit_width; + while (bits_in_accum >= 8) { + buf[pos++] = static_cast(accum >> (bits_in_accum - 8)); + bits_in_accum -= 8; + } + if (bits_in_accum > 0) { + accum &= ((1ULL << bits_in_accum) - 1); + } else { + accum = 0; + } + } + if (bits_in_accum > 0) { + buf[pos++] = static_cast(accum << (8 - bits_in_accum)); + } + // Surface write failures. Previously the return code was dropped on + // the floor and flush() returned E_OK, then reset() wiped the + // encoder state — the on-disk page ended up missing its delta block + // but the caller thought the data was safe. + return out_stream.write_buf(buf.data(), pos); + } + int do_encode(T value, common::ByteStream& out_stream); - int encode(bool value, common::ByteStream& out_stream); - int encode(int32_t value, common::ByteStream& out_stream); - int encode(int64_t value, common::ByteStream& out_stream); - int encode(float value, common::ByteStream& out_stream); - int encode(double value, common::ByteStream& out_stream); - int encode(common::String value, common::ByteStream& out_stream); + int encode(bool value, common::ByteStream& out_stream) override; + int encode(int32_t value, common::ByteStream& out_stream) override; + int encode(int64_t value, common::ByteStream& out_stream) override; + int encode(float value, common::ByteStream& out_stream) override; + int encode(double value, common::ByteStream& out_stream) override; + int encode(common::String value, common::ByteStream& out_stream) override; + + int encode_batch(const int32_t* values, uint32_t count, + common::ByteStream& out_stream) override; + int encode_batch(const int64_t* values, uint32_t count, + common::ByteStream& out_stream) override; - int flush(common::ByteStream& out_stream); + int flush(common::ByteStream& out_stream) override; - int get_max_byte_size() { + int get_max_byte_size() override { // The meaning of 24 is: index(4)+width(4)+minDeltaBase(8)+firstValue(8) return 24 + write_index_ * 8; } @@ -235,16 +288,39 @@ inline int TS2DIFFEncoder::flush(common::ByteStream& out_stream) { SIMDOps::rebase(delta_arr_, delta_arr_min_, write_index_); // Calculate the bit length of each value to writer int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_); - // writer header - common::SerializationUtil::write_ui32(write_index_, out_stream); - common::SerializationUtil::write_ui32(bit_width, out_stream); - common::SerializationUtil::write_ui32(delta_arr_min_, out_stream); - common::SerializationUtil::write_ui32(first_value_, out_stream); - // writer data - for (int i = 0; i < write_index_; i++) { - write_bits(delta_arr_[i], bit_width, out_stream); + // Header writes can fail too (back-pressure / OOM on the underlying + // stream); a half-written header followed by reset() leaves the page + // corrupted but the caller thinking the data was flushed. + if (RET_FAIL( + common::SerializationUtil::write_ui32(write_index_, out_stream))) { + return ret; + } + if (RET_FAIL( + common::SerializationUtil::write_ui32(bit_width, out_stream))) { + return ret; + } + if (RET_FAIL(common::SerializationUtil::write_ui32(delta_arr_min_, + out_stream))) { + return ret; + } + if (RET_FAIL( + common::SerializationUtil::write_ui32(first_value_, out_stream))) { + return ret; + } + // writer data — batched bit-pack + single write_buf for the common case; + // fall back to per-bit path for the rare wide bit_width. + const int pack_ret = + pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream); + if (pack_ret == -1) { + for (int i = 0; i < write_index_; i++) { + write_bits(delta_arr_[i], bit_width, out_stream); + } + flush_remaining(out_stream); + } else if (pack_ret != common::E_OK) { + // Real write failure — don't clear encoder state so the higher + // layer can detect the page is poisoned. + return pack_ret; } - flush_remaining(out_stream); reset(); return ret; } @@ -259,20 +335,222 @@ inline int TS2DIFFEncoder::flush(common::ByteStream& out_stream) { SIMDOps::rebase(delta_arr_, delta_arr_min_, write_index_); // Calculate the bit length of each value to writer int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_); - // writer header - common::SerializationUtil::write_i32(write_index_, out_stream); - common::SerializationUtil::write_i32(bit_width, out_stream); - common::SerializationUtil::write_i64(delta_arr_min_, out_stream); - common::SerializationUtil::write_i64(first_value_, out_stream); - // writer data - for (int i = 0; i < write_index_; i++) { - write_bits(delta_arr_[i], bit_width, out_stream); + // Header writes can fail too — see int32 specialization for rationale. + if (RET_FAIL( + common::SerializationUtil::write_i32(write_index_, out_stream))) { + return ret; + } + if (RET_FAIL(common::SerializationUtil::write_i32(bit_width, out_stream))) { + return ret; + } + if (RET_FAIL( + common::SerializationUtil::write_i64(delta_arr_min_, out_stream))) { + return ret; + } + if (RET_FAIL( + common::SerializationUtil::write_i64(first_value_, out_stream))) { + return ret; + } + // writer data — batched bit-pack + single write_buf for the common case; + // fall back to per-bit path for the rare wide bit_width (>56). + const int pack_ret = + pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream); + if (pack_ret == -1) { + for (int i = 0; i < write_index_; i++) { + write_bits(delta_arr_[i], bit_width, out_stream); + } + flush_remaining(out_stream); + } else if (pack_ret != common::E_OK) { + return pack_ret; } - flush_remaining(out_stream); reset(); // 语义,writeIndex=-1; return ret; } +// ============================================================================ +// Batch encode: INT32 +// Adjacent-difference removes sequential dependency; SIMD for delta + min/max. +// ============================================================================ + +template <> +inline int TS2DIFFEncoder::encode_batch( + const int32_t* values, uint32_t count, common::ByteStream& out_stream) { + int ret = common::E_OK; + uint32_t offset = 0; + + while (offset < count) { + // Start of new block: store first_value + if (write_index_ == -1) { + first_value_ = values[offset]; + previous_value_ = first_value_; + write_index_ = 0; + offset++; + continue; + } + + // How many deltas fit in current block + uint32_t space = static_cast(block_size_) - write_index_; + uint32_t batch = std::min(count - offset, space); + + // ── Adjacent difference: delta[i] = values[i] - values[i-1] ── + // First delta uses previous_value_ + delta_arr_[write_index_] = values[offset] - previous_value_; + + uint32_t i = 1; +#ifdef ENABLE_SIMD + // SIMD: 4 adjacent differences at a time + for (; i + 3 < batch; i += 4) { + simde__m128i cur = simde_mm_loadu_si128( + reinterpret_cast(values + offset + i)); + simde__m128i prv = simde_mm_loadu_si128( + reinterpret_cast(values + offset + i - 1)); + simde__m128i diff = simde_mm_sub_epi32(cur, prv); + simde_mm_storeu_si128( + reinterpret_cast(delta_arr_ + write_index_ + i), + diff); + } +#endif + for (; i < batch; i++) { + delta_arr_[write_index_ + i] = + values[offset + i] - values[offset + i - 1]; + } + previous_value_ = values[offset + batch - 1]; + + // ── Min/max of new deltas ── + int32_t local_min = delta_arr_[write_index_]; + int32_t local_max = delta_arr_[write_index_]; + + uint32_t j = 1; +#ifdef ENABLE_SIMD + if (batch >= 5) { + simde__m128i vmin = simde_mm_set1_epi32(local_min); + simde__m128i vmax = vmin; + for (; j + 3 < batch; j += 4) { + simde__m128i v = + simde_mm_loadu_si128(reinterpret_cast( + delta_arr_ + write_index_ + j)); + vmin = simde_mm_min_epi32(vmin, v); + vmax = simde_mm_max_epi32(vmax, v); + } + // Horizontal reduce + int32_t tmp[4]; + simde_mm_storeu_si128(reinterpret_cast(tmp), vmin); + for (int k = 0; k < 4; k++) + if (tmp[k] < local_min) local_min = tmp[k]; + simde_mm_storeu_si128(reinterpret_cast(tmp), vmax); + for (int k = 0; k < 4; k++) + if (tmp[k] > local_max) local_max = tmp[k]; + } +#endif + for (; j < batch; j++) { + int32_t d = delta_arr_[write_index_ + j]; + if (d < local_min) local_min = d; + if (d > local_max) local_max = d; + } + + // Merge with block min/max + if (write_index_ == 0) { + delta_arr_min_ = local_min; + delta_arr_max_ = local_max; + } else { + if (local_min < delta_arr_min_) delta_arr_min_ = local_min; + if (local_max > delta_arr_max_) delta_arr_max_ = local_max; + } + + write_index_ += batch; + offset += batch; + + if (write_index_ >= block_size_) { + if (RET_FAIL(flush(out_stream))) return ret; + } + } + return ret; +} + +// ============================================================================ +// Batch encode: INT64 +// ============================================================================ + +template <> +inline int TS2DIFFEncoder::encode_batch( + const int64_t* values, uint32_t count, common::ByteStream& out_stream) { + int ret = common::E_OK; + uint32_t offset = 0; + + while (offset < count) { + if (write_index_ == -1) { + first_value_ = values[offset]; + previous_value_ = first_value_; + write_index_ = 0; + offset++; + continue; + } + + uint32_t space = static_cast(block_size_) - write_index_; + uint32_t batch = std::min(count - offset, space); + + // Adjacent difference + delta_arr_[write_index_] = values[offset] - previous_value_; + + uint32_t i = 1; +#ifdef ENABLE_SIMD + // SIMD: 2 adjacent differences at a time (128-bit, native NEON) + for (; i + 1 < batch; i += 2) { + simde__m128i cur = simde_mm_loadu_si128( + reinterpret_cast(values + offset + i)); + simde__m128i prv = simde_mm_loadu_si128( + reinterpret_cast(values + offset + i - 1)); + simde__m128i diff = simde_mm_sub_epi64(cur, prv); + simde_mm_storeu_si128( + reinterpret_cast(delta_arr_ + write_index_ + i), + diff); + } +#endif + for (; i < batch; i++) { + delta_arr_[write_index_ + i] = + values[offset + i] - values[offset + i - 1]; + } + previous_value_ = values[offset + batch - 1]; + + // Min/max (scalar — no efficient 64-bit SIMD min/max before AVX-512) + int64_t local_min = delta_arr_[write_index_]; + int64_t local_max = delta_arr_[write_index_]; + for (uint32_t j = 1; j < batch; j++) { + int64_t d = delta_arr_[write_index_ + j]; + if (d < local_min) local_min = d; + if (d > local_max) local_max = d; + } + + if (write_index_ == 0) { + delta_arr_min_ = local_min; + delta_arr_max_ = local_max; + } else { + if (local_min < delta_arr_min_) delta_arr_min_ = local_min; + if (local_max > delta_arr_max_) delta_arr_max_ = local_max; + } + + write_index_ += batch; + offset += batch; + + if (write_index_ >= block_size_) { + if (RET_FAIL(flush(out_stream))) return ret; + } + } + return ret; +} + +// Default: unsupported types fall back to base class loop +template +int TS2DIFFEncoder::encode_batch(const int32_t* values, uint32_t count, + common::ByteStream& out) { + return Encoder::encode_batch(values, count, out); +} +template +int TS2DIFFEncoder::encode_batch(const int64_t* values, uint32_t count, + common::ByteStream& out) { + return Encoder::encode_batch(values, count, out); +} + class FloatTS2DIFFEncoder : public TS2DIFFEncoder { public: FloatTS2DIFFEncoder() : max_point_number_(2), max_point_value_(100.0) {} @@ -280,6 +558,14 @@ class FloatTS2DIFFEncoder : public TS2DIFFEncoder { int32_t value_int = convert_float_to_int(value); return TS2DIFFEncoder::do_encode(value_int, out_stream); } + // PageWriter resets the encoder between pages without going through a + // successful flush() (e.g. when the prior page was aborted). The base + // reset() only clears write_index_; underflow_flags_ would otherwise + // leak the prior page's overflow markers into the next page's bitmap. + void reset() override { + TS2DIFFEncoder::reset(); + underflow_flags_.clear(); + } int flush(common::ByteStream& out_stream) override; int encode(bool value, common::ByteStream& out_stream); int encode(int32_t value, common::ByteStream& out_stream); @@ -332,6 +618,12 @@ class DoubleTS2DIFFEncoder : public TS2DIFFEncoder { int64_t value_long = convert_double_to_long(value); return TS2DIFFEncoder::do_encode(value_long, out_stream); } + // See FloatTS2DIFFEncoder::reset for rationale — the prior page's + // overflow markers must not bleed into the next. + void reset() override { + TS2DIFFEncoder::reset(); + underflow_flags_.clear(); + } int flush(common::ByteStream& out_stream) override; int encode(bool value, common::ByteStream& out_stream); int encode(int32_t value, common::ByteStream& out_stream); @@ -518,7 +810,6 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) { write_bits(delta_arr_[i], bit_width, inner); } flush_remaining(inner); - reset(); const bool overflow = has_overflow(); if (overflow) { @@ -564,7 +855,12 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) { if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) { return ret; } + // Defer encoder-state wipe until after every write into out_stream has + // committed. An earlier reset() let a mid-flush failure leave + // write_index_ at -1, so the next flush() short-circuited at the top + // and the data was silently lost. underflow_flags_.clear(); + TS2DIFFEncoder::reset(); return ret; } @@ -597,7 +893,6 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) { write_bits(delta_arr_[i], bit_width, inner); } flush_remaining(inner); - reset(); const bool overflow = has_overflow(); if (overflow) { @@ -643,7 +938,11 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) { if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) { return ret; } + // Same deferred-reset rationale as FloatTS2DIFFEncoder::flush — keeping + // write_index_ live until every committed write succeeds avoids the + // "next flush returns E_OK on lost data" pattern. underflow_flags_.clear(); + TS2DIFFEncoder::reset(); return ret; } diff --git a/cpp/src/file/read_file.cc b/cpp/src/file/read_file.cc index d9902ddb9..c6bfd547a 100644 --- a/cpp/src/file/read_file.cc +++ b/cpp/src/file/read_file.cc @@ -26,6 +26,7 @@ #ifdef _WIN32 #include #include + ssize_t pread(int fd, void* buf, size_t count, uint64_t offset); #else #include diff --git a/cpp/src/file/restorable_tsfile_io_writer.cc b/cpp/src/file/restorable_tsfile_io_writer.cc index 22a3fb500..a1fc53402 100644 --- a/cpp/src/file/restorable_tsfile_io_writer.cc +++ b/cpp/src/file/restorable_tsfile_io_writer.cc @@ -328,12 +328,15 @@ static int recover_chunk_statistic( uint32_t value_buf_size = 0; std::vector time_decode_buf; const std::vector* times = nullptr; - std::vector aligned_value_notnull_bitmap; + // For aligned pages, retain the per-row not-null bitmap so the stat-update + // loop can skip null positions and bind each decoded value to its real + // timestamp. Without this we'd hand non-null values to times[0..N-1] and + // get wrong start/end/first/last stats on sparse columns. + const char* aligned_bitmap = nullptr; uint32_t aligned_num_values = 0; - const bool is_aligned_value_chunk = - (time_batch != nullptr && !time_batch->empty()); + bool is_aligned_page = false; - if (is_aligned_value_chunk) { + if (time_batch != nullptr && !time_batch->empty()) { // Aligned value page: uncompressed layout = uint32(num_values) + bitmap // + value_buf if (uncompressed_size < 4) { @@ -341,7 +344,7 @@ static int recover_chunk_statistic( CompressorFactory::free(compressor); return E_OK; } - aligned_num_values = + uint32_t num_values = (static_cast( static_cast(uncompressed_buf[0])) << 24) | @@ -353,20 +356,19 @@ static int recover_chunk_statistic( << 8) | (static_cast( static_cast(uncompressed_buf[3]))); - uint32_t bitmap_size = (aligned_num_values + 7) / 8; + uint32_t bitmap_size = (num_values + 7) / 8; if (uncompressed_size < 4 + bitmap_size) { compressor->after_uncompress(uncompressed_buf); CompressorFactory::free(compressor); return E_OK; } - aligned_value_notnull_bitmap.resize(bitmap_size); - if (bitmap_size > 0) { - std::memcpy(aligned_value_notnull_bitmap.data(), - uncompressed_buf + 4, bitmap_size); - } value_buf = uncompressed_buf + 4 + bitmap_size; value_buf_size = uncompressed_size - 4 - bitmap_size; times = time_batch; + aligned_bitmap = uncompressed_buf + 4; + aligned_num_values = std::min( + num_values, static_cast(time_batch->size())); + is_aligned_page = true; } else { // Non-aligned value page: var_uint(time_buf_size) + time_buf + // value_buf @@ -419,25 +421,25 @@ static int recover_chunk_statistic( value_decoder->reset(); size_t idx = 0; const size_t num_times = times->size(); - while (idx < num_times) { - int64_t t = (*times)[idx]; - bool has_value = true; - if (is_aligned_value_chunk) { - has_value = false; - const uint32_t byte_idx = static_cast(idx / 8); - const uint32_t bit_shift = static_cast(idx % 8); - if (byte_idx < aligned_value_notnull_bitmap.size()) { - has_value = ((aligned_value_notnull_bitmap[byte_idx] & 0xFF) & - (0x80 >> bit_shift)) != 0; - } - } - if (!has_value) { + // For aligned pages the value stream only stores non-null rows; advance + // `idx` past null bitmap entries so each decoded value pairs with the + // matching timestamp. Non-aligned pages have no bitmap (every row is + // present), so we keep the dense walk. + auto bitmap_is_valid = [&](size_t row) -> bool { + if (!is_aligned_page) return true; + if (row >= aligned_num_values) return false; + // Aligned value-page bitmap: MSB-first within each byte, bit set + // means the row is NOT null. + unsigned char byte = + static_cast(aligned_bitmap[row / 8]); + return (byte & static_cast(0x80 >> (row % 8))) != 0; + }; + while (idx < num_times && value_decoder->has_remaining(value_in)) { + if (!bitmap_is_valid(idx)) { idx++; continue; } - if (!value_decoder->has_remaining(value_in)) { - break; - } + int64_t t = (*times)[idx]; switch (chdr.data_type_) { case common::BOOLEAN: { bool v; @@ -518,6 +520,12 @@ void RestorableTsFileIOWriter::close() { write_file_ = nullptr; write_file_owned_ = false; } + // Run the base writer's cleanup (frees post-recovery appended chunk + // metadata) before tearing down self_check_arena_ that backs the + // recovered ChunkGroupMeta entries. Base destroy() only touches entries + // it allocated itself (tracked in appended_chunk_metas_ / + // appended_chunk_group_metas_), so it never dereferences self_check + // arena memory. TsFileIOWriter::destroy(); for (ChunkGroupMeta* cgm : self_check_recovered_cgm_) { cgm->device_id_.reset(); @@ -842,15 +850,13 @@ int RestorableTsFileIOWriter::self_check(bool truncate_corrupted) { } } - // --- Attach recovered ChunkGroupMeta to writer; record per-CGM prefix - // length so destroy() can free stats appended later. --- - recovery_chunk_meta_prefix_.clear(); + // Attach recovered ChunkGroupMeta entries to the base writer. These + // live in self_check_arena_ and are *not* tracked in + // appended_chunk_group_metas_ — base destroy() leaves them alone, and + // close() resets their device_id_ refs before tearing down the arena. for (ChunkGroupMeta* cgm : recovered_cgm_list) { - recovery_chunk_meta_prefix_[cgm] = - static_cast(cgm->chunk_meta_list_.size()); push_chunk_group_meta(cgm); } - chunk_group_meta_from_recovery_ = true; return E_OK; } diff --git a/cpp/src/file/tsfile_io_reader.cc b/cpp/src/file/tsfile_io_reader.cc index 296556c15..014e78832 100644 --- a/cpp/src/file/tsfile_io_reader.cc +++ b/cpp/src/file/tsfile_io_reader.cc @@ -51,6 +51,8 @@ void TsFileIOReader::reset() { } read_file_ = nullptr; tsfile_meta_page_arena_.destroy(); + device_node_cache_.clear(); + device_node_cache_pa_.destroy(); tsfile_meta_ready_ = false; } } @@ -61,6 +63,9 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr device_id, common::PageArena& pa, Filter* time_filter) { int ret = E_OK; if (RET_FAIL(load_tsfile_meta_if_necessary())) { + } else if (!bloom_filter_contains(device_id->get_device_name(), + measurement_name)) { + return E_NO_MORE_DATA; } else { ssi = new TsFileSeriesScanIterator; ssi->init(device_id, measurement_name, read_file_, time_filter, pa); @@ -80,6 +85,95 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr device_id, return ret; } +int TsFileIOReader::alloc_multi_ssi( + std::shared_ptr device_id, + const std::vector& measurement_names, + TsFileSeriesScanIterator*& ssi, common::PageArena& pa, + Filter* time_filter) { + int ret = E_OK; + if (RET_FAIL(load_tsfile_meta_if_necessary())) return ret; + + ssi = new TsFileSeriesScanIterator; + ssi->init(device_id, measurement_names.empty() ? "" : measurement_names[0], + read_file_, time_filter, pa); + + auto& ssi_pa = ssi->timeseries_index_pa_; + + // Use cached device measurement node (avoids repeated file I/O) + CachedDeviceNode cached; + if (RET_FAIL(get_cached_device_node(device_id, ssi_pa, cached))) { + delete ssi; + ssi = nullptr; + return ret; + } + auto top_node = cached.top_node; + if (!cached.is_aligned) { + delete ssi; + ssi = nullptr; + return E_NOT_SUPPORT; + } + + // Get time column metadata + TimeseriesIndex* time_ts_idx = nullptr; + if (RET_FAIL(get_time_column_metadata(top_node, time_ts_idx, ssi_pa))) { + delete ssi; + ssi = nullptr; + return ret; + } + + // Create MultiAlignedTimeseriesIndex + void* multi_buf = ssi_pa.alloc(sizeof(MultiAlignedTimeseriesIndex)); + if (IS_NULL(multi_buf)) { + delete ssi; + ssi = nullptr; + return E_OOM; + } + auto* multi_idx = new (multi_buf) MultiAlignedTimeseriesIndex; + multi_idx->time_ts_idx_ = time_ts_idx; + + // Load each measurement's TimeseriesIndex + for (const auto& meas_name : measurement_names) { + std::shared_ptr meas_entry; + int64_t meas_end_offset = 0; + if (RET_FAIL(load_measurement_index_entry( + meas_name, top_node, meas_entry, meas_end_offset))) { + // Measurement not found — abort multi path + delete ssi; + ssi = nullptr; + return ret; + } + + ITimeseriesIndex* ts_idx = nullptr; + if (RET_FAIL(do_load_timeseries_index( + meas_name, meas_entry->get_offset(), meas_end_offset, ssi_pa, + ts_idx, /*is_aligned=*/true))) { + delete ssi; + ssi = nullptr; + return ret; + } + + auto* aligned_idx = dynamic_cast(ts_idx); + if (aligned_idx && aligned_idx->value_ts_idx_) { + multi_idx->value_ts_idxs_.push_back(aligned_idx->value_ts_idx_); + } else { + delete ssi; + ssi = nullptr; + return E_NOT_EXIST; + } + } + + ssi->itimeseries_index_ = multi_idx; + + // Skip global statistic filter for multi — per-chunk filtering still works. + + if (RET_FAIL(ssi->init_chunk_reader())) { + ssi->destroy(); + delete ssi; + ssi = nullptr; + } + return ret; +} + void TsFileIOReader::revert_ssi(TsFileSeriesScanIterator* ssi) { if (ssi != nullptr) { ssi->destroy(); @@ -96,61 +190,14 @@ int TsFileIOReader::get_device_timeseries_meta_without_chunk_meta( int64_t end_offset; std::vector, int64_t>> meta_index_entry_list; - std::shared_ptr top_node; - bool is_aligned = false; - TimeseriesIndex* time_timeseries_index = nullptr; if (RET_FAIL(load_device_index_entry( std::make_shared(device_id), meta_index_entry, end_offset))) { - } else { - int64_t start_offset = meta_index_entry->get_offset(); - ASSERT(start_offset < end_offset); - const int32_t read_size = end_offset - start_offset; - int32_t ret_read_len = 0; - char* data_buf = (char*)pa.alloc(read_size); - void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode)); - if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) { - return E_OOM; - } - auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa); - top_node = std::shared_ptr(top_node_ptr, - MetaIndexNode::self_deleter); - if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size, - ret_read_len))) { - } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) { - } else { - is_aligned = is_aligned_device(top_node); - if (is_aligned) { - if (RET_FAIL(get_time_column_metadata( - top_node, time_timeseries_index, pa))) { - return ret; - } - } - } - } - if (RET_FAIL(ret)) { - return ret; - } - if (RET_FAIL(load_all_measurement_index_entry( - meta_index_entry->get_offset(), end_offset, pa, - meta_index_entry_list))) { + } else if (RET_FAIL(load_all_measurement_index_entry( + meta_index_entry->get_offset(), end_offset, pa, + meta_index_entry_list))) { } else if (RET_FAIL(do_load_all_timeseries_index(meta_index_entry_list, pa, timeseries_indexs))) { - } else if (is_aligned && time_timeseries_index != nullptr) { - for (size_t i = 0; i < timeseries_indexs.size(); i++) { - void* buf = pa.alloc(sizeof(AlignedTimeseriesIndex)); - if (IS_NULL(buf)) { - return E_OOM; - } - auto* aligned_ts_idx = new (buf) AlignedTimeseriesIndex; - aligned_ts_idx->time_ts_idx_ = time_timeseries_index; - aligned_ts_idx->value_ts_idx_ = - dynamic_cast(timeseries_indexs[i]); - if (aligned_ts_idx->value_ts_idx_ == nullptr) { - return E_TYPE_NOT_MATCH; - } - timeseries_indexs[i] = aligned_ts_idx; - } } return ret; } @@ -225,6 +272,20 @@ bool TsFileIOReader::filter_stasify(ITimeseriesIndex* ts_index, return time_filter->satisfy(ts_index->get_statistic()); } +bool TsFileIOReader::bloom_filter_contains( + const std::string& device_name, const std::string& measurement_name) { + BloomFilter* bf = tsfile_meta_.bloom_filter_; + if (bf == nullptr || bf->is_empty()) { + return true; // no bloom filter — assume present + } + common::String dev_str, meas_str; + dev_str.buf_ = const_cast(device_name.c_str()); + dev_str.len_ = static_cast(device_name.size()); + meas_str.buf_ = const_cast(measurement_name.c_str()); + meas_str.len_ = static_cast(measurement_name.size()); + return bf->contains(dev_str, meas_str); +} + int TsFileIOReader::load_tsfile_meta_if_necessary() { int ret = E_OK; if (!tsfile_meta_ready_) { @@ -323,44 +384,111 @@ int TsFileIOReader::load_tsfile_meta() { return ret; } -int TsFileIOReader::load_timeseries_index_for_ssi( - std::shared_ptr device_id, const std::string& measurement_name, - TsFileSeriesScanIterator*& ssi) { +int TsFileIOReader::get_cached_device_node(std::shared_ptr device_id, + common::PageArena& pa, + CachedDeviceNode& out) { + std::string dev_name = device_id->get_device_name(); + + { + std::lock_guard lk(device_node_cache_mu_); + auto it = device_node_cache_.find(dev_name); + if (it != device_node_cache_.end()) { + out = it->second; + return E_OK; + } + } + + // Read the device meta index outside the lock — load_device_index_entry() + // and the file read can block on I/O, and we don't want to serialize all + // concurrent first-time lookups behind one slow disk fetch. Two callers + // racing on the same missing device may both do the read; that's wasted + // work but not corruption — the second insert is dropped below. int ret = E_OK; std::shared_ptr device_index_entry; int64_t device_ie_end_offset = 0; - std::shared_ptr measurement_index_entry; - int64_t measurement_ie_end_offset = 0; - // bool is_aligned = false; if (RET_FAIL(load_device_index_entry( std::make_shared(device_id), device_index_entry, device_ie_end_offset))) { return ret; } - auto& pa = ssi->timeseries_index_pa_; int64_t start_offset = device_index_entry->get_offset(), end_offset = device_ie_end_offset; ASSERT(start_offset < end_offset); - const int32_t read_size = end_offset - start_offset; + const int64_t read_size_i64 = end_offset - start_offset; + // read_file_->read() takes int32_t; a meta index node larger than 2 GiB + // is implausible but explicitly reject it instead of silently truncating + // the read length and corrupting the parse. + if (read_size_i64 <= 0 || read_size_i64 > INT32_MAX) { + return E_TSFILE_CORRUPTED; + } + const int32_t read_size = static_cast(read_size_i64); int32_t ret_read_len = 0; - char* data_buf = (char*)pa.alloc(read_size); - void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode)); - if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) { + + // Read into a heap-owned buffer outside the lock. The previous + // implementation allocated data_buf inside device_node_cache_pa_ before + // the read happened — every failed read or parse left that allocation + // pinned forever in the shared arena, and repeated disk errors on the + // same device let a long-lived reader grow it without bound. Using a + // unique_ptr here means the read buffer is released on every failure + // path, and only the small MetaIndexNode allocations inside the lock + // share the arena. + std::unique_ptr data_buf(new (std::nothrow) char[read_size]); + if (data_buf == nullptr) { return E_OOM; } - auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa); - auto top_node = std::shared_ptr(top_node_ptr, - MetaIndexNode::self_deleter); - - if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size, + if (RET_FAIL(read_file_->read(start_offset, data_buf.get(), read_size, ret_read_len))) { return ret; - } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) { + } + + CachedDeviceNode cached; + { + // Allocations into device_node_cache_pa_ and the map insert must be + // serialized — PageArena is not thread-safe, and unordered_map's + // rehash invalidates concurrent lookups. + std::lock_guard lk(device_node_cache_mu_); + // Re-check: another thread may have populated the entry while we + // were doing I/O. + auto it = device_node_cache_.find(dev_name); + if (it != device_node_cache_.end()) { + out = it->second; + return E_OK; + } + + void* m_idx_node_buf = + device_node_cache_pa_.alloc(sizeof(MetaIndexNode)); + if (IS_NULL(m_idx_node_buf)) { + return E_OOM; + } + auto* top_node_ptr = + new (m_idx_node_buf) MetaIndexNode(&device_node_cache_pa_); + auto top_node = std::shared_ptr( + top_node_ptr, MetaIndexNode::self_deleter); + if (RET_FAIL(top_node->deserialize_from(data_buf.get(), read_size))) { + return ret; + } + cached.top_node = top_node; + cached.is_aligned = is_aligned_device(top_node); + device_node_cache_.emplace(std::move(dev_name), cached); + } + out = cached; + return E_OK; +} + +int TsFileIOReader::load_timeseries_index_for_ssi( + std::shared_ptr device_id, const std::string& measurement_name, + TsFileSeriesScanIterator*& ssi) { + int ret = E_OK; + auto& pa = ssi->timeseries_index_pa_; + + CachedDeviceNode cached; + if (RET_FAIL(get_cached_device_node(device_id, pa, cached))) { return ret; } + auto top_node = cached.top_node; + bool is_aligned = cached.is_aligned; - bool is_aligned = is_aligned_device(top_node); TimeseriesIndex* timeseries_index = nullptr; if (is_aligned) { if (RET_FAIL( @@ -369,6 +497,8 @@ int TsFileIOReader::load_timeseries_index_for_ssi( } } + std::shared_ptr measurement_index_entry; + int64_t measurement_ie_end_offset = 0; if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node, measurement_index_entry, measurement_ie_end_offset))) { @@ -570,16 +700,30 @@ int TsFileIOReader::get_timeseries_indexes( int64_t idx = 0; for (const auto& measurement_name : measurement_names) { - if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node, - measurement_index_entry, - measurement_ie_end_offset))) { - } else if (do_load_timeseries_index( - measurement_name, measurement_index_entry->get_offset(), - measurement_ie_end_offset, pa, timeseries_indexs[idx], - is_aligned) == E_NOT_EXIST) { + timeseries_indexs[idx] = nullptr; + ret = load_measurement_index_entry(measurement_name, top_node, + measurement_index_entry, + measurement_ie_end_offset); + if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_NOT_EXIST) { + ret = E_OK; idx++; continue; } + if (RET_FAIL(ret)) { + return ret; + } + + ret = do_load_timeseries_index( + measurement_name, measurement_index_entry->get_offset(), + measurement_ie_end_offset, pa, timeseries_indexs[idx], is_aligned); + if (ret == E_NOT_EXIST) { + ret = E_OK; + idx++; + continue; + } + if (RET_FAIL(ret)) { + return ret; + } if (is_aligned) { AlignedTimeseriesIndex* aligned_timeseries_index = dynamic_cast(timeseries_indexs[idx]); @@ -677,6 +821,9 @@ int TsFileIOReader::search_from_internal_node( bool TsFileIOReader::is_aligned_device( std::shared_ptr measurement_node) { + if (measurement_node->children_.empty()) { + return false; + } auto entry = measurement_node->children_[0]; return entry->get_name().is_null() || entry->get_name().to_std_string() == ""; diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h index 85443326f..0073603fb 100644 --- a/cpp/src/file/tsfile_io_reader.h +++ b/cpp/src/file/tsfile_io_reader.h @@ -20,6 +20,8 @@ #ifndef FILE_TSFILE_IO_REAER_H #define FILE_TSFILE_IO_REAER_H +#include +#include #include #include "common/tsblock/tsblock.h" @@ -46,6 +48,26 @@ class TsFileIOReader { tsfile_meta_ready_(false), read_file_created_(false) { tsfile_meta_page_arena_.init(512, common::MOD_TSFILE_READER); + device_node_cache_pa_.init(512, common::MOD_TSFILE_READER); + } + + // Free only the ReadFile we own (created by init(const std::string&)). + // Without an explicit destructor that raw pointer leaks whenever a + // TsFileIOReader value goes out of scope without an explicit reset() (e.g. + // a stack instance in a test). We deliberately do NOT call reset() here: + // reset() also runs tsfile_meta_page_arena_.destroy(), which would free the + // arena that tsfile_meta_ lives in *before* the implicit ~TsFileMeta member + // destructor runs, leaving its arena-allocated MetaIndexNode / shared_ptr + // graph dangling (use-after-free / crash). The arenas and TsFileMeta clean + // themselves up correctly via member destruction order (tsfile_meta_ is + // destroyed before its backing arena). An owner that already called + // reset() leaves read_file_ == nullptr, so this never double-frees. + ~TsFileIOReader() { + if (read_file_created_ && read_file_ != nullptr) { + read_file_->destroy(); + delete read_file_; + read_file_ = nullptr; + } } int init(const std::string& file_path); @@ -59,6 +81,11 @@ class TsFileIOReader { TsFileSeriesScanIterator*& ssi, common::PageArena& pa, Filter* time_filter = nullptr); + int alloc_multi_ssi(std::shared_ptr device_id, + const std::vector& measurement_names, + TsFileSeriesScanIterator*& ssi, common::PageArena& pa, + Filter* time_filter = nullptr); + void revert_ssi(TsFileSeriesScanIterator* ssi); std::string get_file_path() const { return read_file_->file_path(); } @@ -147,17 +174,40 @@ class TsFileIOReader { bool filter_stasify(ITimeseriesIndex* ts_index, Filter* time_filter); + bool bloom_filter_contains(const std::string& device_name, + const std::string& measurement_name); + int get_all_leaf( std::shared_ptr index_node, std::vector, int64_t>>& index_node_entry_list); + struct CachedDeviceNode { + std::shared_ptr top_node; + bool is_aligned; + }; + + // Returns E_OK on hit (out is filled), or an error code on miss / load + // failure (E_DEVICE_NOT_EXIST when the device is absent, the propagated + // error otherwise). Copying into out keeps the caller safe from rehash / + // concurrent eviction of the cache map. + int get_cached_device_node(std::shared_ptr device_id, + common::PageArena& pa, CachedDeviceNode& out); + private: ReadFile* read_file_; common::PageArena tsfile_meta_page_arena_; TsFileMeta tsfile_meta_; bool tsfile_meta_ready_; bool read_file_created_; + // Cache: device_name → deserialized measurement MetaIndexNode. + // Guarded by device_node_cache_mu_ — multiple SSIs and Result Sets can + // hit the cache concurrently on the same reader, and an unsynchronized + // unordered_map insert would race with a parallel lookup (rehash, + // bucket-list rewrite) and with the underlying PageArena allocation. + common::PageArena device_node_cache_pa_; + std::unordered_map device_node_cache_; + mutable std::mutex device_node_cache_mu_; }; } // end namespace storage diff --git a/cpp/src/file/tsfile_io_writer.cc b/cpp/src/file/tsfile_io_writer.cc index 42d99feda..71bb08a7e 100644 --- a/cpp/src/file/tsfile_io_writer.cc +++ b/cpp/src/file/tsfile_io_writer.cc @@ -21,6 +21,8 @@ #include +#include +#include #include #include "common/device_id.h" @@ -40,14 +42,20 @@ namespace storage { #define OFFSET_DEBUG(msg) void(msg) #endif +int64_t TsFileIOWriter::get_meta_size() const { + return meta_allocator_.get_total_used_bytes(); +} + int TsFileIOWriter::init(WriteFile* write_file) { int ret = E_OK; const uint32_t page_size = 1024; meta_allocator_.init(page_size, MOD_TSFILE_WRITER_META); chunk_meta_count_ = 0; - recovery_chunk_meta_prefix_.clear(); - destroyed_ = false; file_ = write_file; + // Re-arm destroy() for the new lifecycle. Without this, a writer that + // was destroy()'d and then init()'d again would leak the fresh + // meta_allocator_/write_stream_/file_ on its next destroy(). + destroyed_ = false; return ret; } @@ -55,48 +63,37 @@ void TsFileIOWriter::destroy() { if (destroyed_) { return; } - // Recovery attaches a prefix of ChunkGroupMeta; device_id and chunk stats - // in that snapshot live in reader/recovery memory. After open, new chunks - // may be pushed into the same ChunkGroupMeta (same device); only those - // appended ChunkMeta need statistic_->destroy() (see - // recovery_chunk_meta_prefix_). - for (auto iter = chunk_group_meta_list_.begin(); - iter != chunk_group_meta_list_.end(); iter++) { - ChunkGroupMeta* cgm = iter.get(); - auto prefix_it = recovery_chunk_meta_prefix_.find(cgm); - const bool is_recovery_cgm = - chunk_group_meta_from_recovery_ && cgm != nullptr && - prefix_it != recovery_chunk_meta_prefix_.end(); - uint32_t recovered_cm_count = is_recovery_cgm ? prefix_it->second : 0; - - if (!is_recovery_cgm) { - if (cgm != nullptr && cgm->device_id_) { - cgm->device_id_.reset(); - } - } - - if (cgm == nullptr) { - continue; - } - uint32_t cm_idx = 0; - for (auto chunk_meta = cgm->chunk_meta_list_.begin(); - chunk_meta != cgm->chunk_meta_list_.end(); - chunk_meta++, cm_idx++) { - if (chunk_meta.get() == nullptr || - chunk_meta.get()->statistic_ == nullptr) { - continue; - } - if (is_recovery_cgm && cm_idx < recovered_cm_count) { - continue; - } - chunk_meta.get()->statistic_->destroy(); + // Free heap-allocated PageArenas held by each appended statistic and + // drop shared_ptr refs on each appended CGM's device_id_. Recovered + // entries from RestorableTsFileIOWriter live in self_check_arena_ and + // are not tracked here; the restorable writer cleans those up itself. + for (ChunkMeta* cm : appended_chunk_metas_) { + if (cm != nullptr && cm->statistic_ != nullptr) { + cm->statistic_->destroy(); } } - - if (cur_chunk_meta_ != nullptr && cur_chunk_meta_->statistic_ != nullptr) { - cur_chunk_meta_->statistic_->destroy(); - cur_chunk_meta_ = nullptr; + appended_chunk_metas_.clear(); + for (ChunkGroupMeta* cgm : appended_chunk_group_metas_) { + if (cgm != nullptr && cgm->device_id_) { + cgm->device_id_.reset(); + } } + appended_chunk_group_metas_.clear(); + // Drop every pointer that referenced meta_allocator_-owned memory before + // destroying the arena. Without this, a reused writer (destroy() + a new + // init()) would still see the dangling CGM list/index/cur_* slots from + // the previous lifecycle and dereference freed nodes the next time + // start_flush_chunk_group() linear-scans the list. + chunk_group_meta_list_.clear(); + chunk_group_meta_index_.clear(); + cur_chunk_meta_ = nullptr; + cur_chunk_group_meta_ = nullptr; + cur_device_name_.reset(); + chunk_meta_count_ = 0; + use_prev_alloc_cgm_ = false; + is_aligned_ = false; + file_base_offset_ = 0; + destroyed_ = true; meta_allocator_.destroy(); write_stream_.destroy(); @@ -104,7 +101,6 @@ void TsFileIOWriter::destroy() { delete file_; file_ = nullptr; } - destroyed_ = true; } int TsFileIOWriter::start_file() { @@ -145,6 +141,7 @@ int TsFileIOWriter::start_flush_chunk_group( } else { cur_chunk_group_meta_ = new (buf) ChunkGroupMeta(&meta_allocator_); cur_chunk_group_meta_->init(device_name); + appended_chunk_group_metas_.push_back(cur_chunk_group_meta_); } } return ret; @@ -183,6 +180,7 @@ int TsFileIOWriter::start_flush_chunk(common::ByteStream& chunk_data, ret = cur_chunk_meta_->init(mname, data_type, cur_file_position(), chunk_statistic_copy, mask, encoding, compression, meta_allocator_); + appended_chunk_metas_.push_back(cur_chunk_meta_); } // Step 2. serialize chunk header to write_stream_ @@ -258,6 +256,8 @@ int TsFileIOWriter::end_flush_chunk_group(bool is_aligned) { cur_chunk_group_meta_ = nullptr; return common::E_OK; } + chunk_group_meta_index_[cur_device_name_->get_device_name()] = + cur_chunk_group_meta_; int ret = chunk_group_meta_list_.push_back(cur_chunk_group_meta_); cur_chunk_group_meta_ = nullptr; return ret; @@ -269,17 +269,19 @@ int TsFileIOWriter::end_file() { return E_OK; } OFFSET_DEBUG("before end file"); + if (RET_FAIL(write_log_index_range())) { std::cout << "writer range index error, ret =" << ret << std::endl; } else if (RET_FAIL(write_file_index())) { std::cout << "writer file index error, ret = " << ret << std::endl; } else if (RET_FAIL(write_file_footer())) { std::cout << "writer file footer error, ret = " << ret << std::endl; - } else if (RET_FAIL(sync_file())) { + } else if (g_config_value_.sync_on_close_ && RET_FAIL(sync_file())) { std::cout << "sync file error, ret = " << ret << std::endl; } else if (RET_FAIL(close_file())) { std::cout << "close file error, ret = " << ret << std::endl; } + return ret; } diff --git a/cpp/src/file/tsfile_io_writer.h b/cpp/src/file/tsfile_io_writer.h index 088e52f56..4904b924a 100644 --- a/cpp/src/file/tsfile_io_writer.h +++ b/cpp/src/file/tsfile_io_writer.h @@ -21,6 +21,7 @@ #define FILE_TSFILE_IO_WRITER_H #include +#include #include #include "common/allocator/page_arena.h" @@ -108,6 +109,7 @@ class TsFileIOWriter { FORCE_INLINE std::string get_file_path() { return file_->get_file_path(); } FORCE_INLINE std::shared_ptr get_schema() { return schema_; } + int64_t get_meta_size() const; private: int write_log_index_range(); @@ -191,13 +193,19 @@ class TsFileIOWriter { /** For RestorableTsFileIOWriter: append a recovered ChunkGroupMeta. */ void push_chunk_group_meta(ChunkGroupMeta* cgm) { chunk_group_meta_list_.push_back(cgm); + if (cgm->device_id_) { + chunk_group_meta_index_[cgm->device_id_->get_device_name()] = cgm; + } } - /** True when chunk_group_meta_list_ has a prefix loaded from recovery; - * destroy() must not free device_id_/statistic_ for that prefix only. */ - bool chunk_group_meta_from_recovery_ = false; - /** Recovered ChunkGroupMeta* -> chunk_meta_list_.size() at attach (pointer - * keys avoid idx skew). */ - std::map recovery_chunk_meta_prefix_; + /** Chunks/CGMs allocated from meta_allocator_ via start_flush_chunk*() + * (post-recovery for the restorable writer, all chunks for the normal + * writer). destroy() iterates these directly to free the heap-allocated + * PageArena owned by each statistic and the shared_ptr held + * by each new CGM, without touching recovery-owned entries that live in + * RestorableTsFileIOWriter::self_check_arena_. */ + std::vector appended_chunk_metas_; + std::vector appended_chunk_group_metas_; + bool destroyed_ = false; /** * Recovery only: set file_base_offset_ so that cur_file_position() returns * correct absolute offsets. After recovery the writer behaves as if the @@ -214,6 +222,9 @@ class TsFileIOWriter { ChunkGroupMeta* cur_chunk_group_meta_; int32_t chunk_meta_count_; // for debug common::SimpleList chunk_group_meta_list_; + // O(1) lookup for existing ChunkGroupMeta by device name, avoiding the + // O(N) linear scan through chunk_group_meta_list_ per device. + std::unordered_map chunk_group_meta_index_; bool use_prev_alloc_cgm_; // chunk group meta std::shared_ptr cur_device_name_; WriteFile* file_; @@ -227,10 +238,6 @@ class TsFileIOWriter { /** Recovery only: absolute file offset at which write_stream_ logically * begins. Normal (non-recovery) path keeps this at 0. */ int64_t file_base_offset_ = 0; - /** Set after destroy() completes; avoids double cleanup when - * RestorableTsFileIOWriter::close() calls destroy() before - * self_check_arena_.destroy(), then ~TsFileIOWriter runs again. */ - bool destroyed_ = false; friend class RestorableTsFileIOWriter; // uses push_chunk_group_meta }; diff --git a/cpp/src/reader/aligned_chunk_reader.cc b/cpp/src/reader/aligned_chunk_reader.cc index 49c469547..7e2bda41e 100644 --- a/cpp/src/reader/aligned_chunk_reader.cc +++ b/cpp/src/reader/aligned_chunk_reader.cc @@ -19,8 +19,13 @@ #include "aligned_chunk_reader.h" +#include #include +#include "common/global.h" +#ifdef ENABLE_THREADS +#include "common/thread_pool.h" +#endif #include "compress/compressor_factory.h" #include "encoding/decoder_factory.h" @@ -56,19 +61,74 @@ void AlignedChunkReader::reset() { if (file_data_buf != nullptr) { mem_free(file_data_buf); } + time_in_stream_.clear_wrapped_buf(); time_in_stream_.reset(); file_data_buf = value_in_stream_.get_wrapped_buf(); if (file_data_buf != nullptr) { mem_free(file_data_buf); } + value_in_stream_.clear_wrapped_buf(); value_in_stream_.reset(); file_data_time_buf_size_ = 0; file_data_value_buf_size_ = 0; time_chunk_visit_offset_ = 0; value_chunk_visit_offset_ = 0; + page_plan_built_ = false; + current_page_loaded_ = false; + current_page_plan_index_ = 0; + time_predecoded_ = false; + page_all_times_.clear(); + page_time_count_ = 0; + page_time_cursor_ = 0; + + // Free leftover uncompressed buffers from the previous chunk. + if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) { + time_compressor_->after_uncompress(time_uncompressed_buf_); + time_uncompressed_buf_ = nullptr; + } + + // Multi-value reset + for (auto* col : value_columns_) { + // Free uncompressed buffer before resetting. + if (col->uncompressed_buf != nullptr && col->compressor != nullptr) { + col->compressor->after_uncompress(col->uncompressed_buf); + col->uncompressed_buf = nullptr; + } + char* buf = col->in_stream.get_wrapped_buf(); + if (buf != nullptr) mem_free(buf); + col->in_stream.clear_wrapped_buf(); + col->in_stream.reset(); + col->in.reset(); + col->chunk_header.reset(); + col->cur_page_header.reset(); + col->file_data_buf_size = 0; + col->chunk_visit_offset = 0; + col->notnull_bitmap.clear(); + col->cur_value_index = -1; + col->chunk_meta = nullptr; + for (auto& pps : col->per_page_state) { + pps.predecode_pa.destroy(); + } + col->per_page_state.clear(); + col->pending_decoded_values.clear(); + col->pending_decoded_count = 0; + col->pending_decoded_cursor = 0; + col->pending_decoded = false; + // Note: decoder/compressor are NOT freed here — they are reused by + // alloc_compressor_and_decoder() in load_by_aligned_meta_multi(). + } + release_current_page_state(); + chunk_pages_.clear(); + per_page_times_.clear(); } void AlignedChunkReader::destroy() { + // .clear() leaves the vector's internal heap buffer allocated, which + // mem_free can't reach because we placement-new the reader. swap with + // an empty vector to actually release the backing storage so ASan's + // LeakSanitizer doesn't flag the (rather large) ChunkPageInfo buffers. + std::vector{}.swap(chunk_pages_); + std::vector{}.swap(page_all_times_); if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) { time_compressor_->after_uncompress(time_uncompressed_buf_); time_uncompressed_buf_ = nullptr; @@ -112,6 +172,59 @@ void AlignedChunkReader::destroy() { } cur_value_page_header_.reset(); chunk_header_.~ChunkHeader(); + + // Multi-value destroy + for (size_t ci = 0; ci < value_columns_.size(); ci++) { + auto* col = value_columns_[ci]; + if (col->decoder != nullptr) { + col->decoder->~Decoder(); + DecoderFactory::free(col->decoder); + col->decoder = nullptr; + } + if (col->compressor != nullptr) { + col->compressor->~Compressor(); + CompressorFactory::free(col->compressor); + col->compressor = nullptr; + } + for (auto& pps : col->per_page_state) { + pps.predecode_pa.destroy(); + } + col->per_page_state.clear(); + col->pending_decoded_values.clear(); + buf = col->in_stream.get_wrapped_buf(); + if (buf != nullptr) { + mem_free(buf); + col->in_stream.clear_wrapped_buf(); + } + col->cur_page_header.reset(); + delete col; + } + // This reader is placement-new'd and torn down via destroy() + mem_free + // without ever running ~AlignedChunkReader (see + // TsFileSeriesScanIterator::destroy), so .clear() would leave these + // vectors' backing buffers allocated and unreachable. swap with an empty + // vector to actually release the storage, matching the chunk_pages_ / + // page_all_times_ handling above. + std::vector().swap(value_columns_); + release_current_page_state(); + std::vector>().swap(per_page_times_); +#ifdef ENABLE_THREADS + decode_pool_ = nullptr; // borrowed, not owned + for (auto* d : time_decoder_pool_) { + if (d != nullptr) { + d->~Decoder(); + DecoderFactory::free(d); + } + } + std::vector().swap(time_decoder_pool_); + for (auto* c : time_compressor_pool_) { + if (c != nullptr) { + c->~Compressor(); + CompressorFactory::free(c); + } + } + std::vector().swap(time_compressor_pool_); +#endif } int AlignedChunkReader::load_by_aligned_meta(ChunkMeta* time_chunk_meta, @@ -218,15 +331,19 @@ int AlignedChunkReader::alloc_compressor_and_decoder( int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock, Filter* oneshoot_filter, PageArena& pa) { + if (multi_value_mode_) { + return get_next_page_multi(ret_tsblock, oneshoot_filter, pa); + } int ret = E_OK; Filter* filter = (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_); - if (prev_time_page_not_finish() && prev_value_page_not_finish()) { - ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter, - &pa); + bool pt = prev_time_page_not_finish(); + bool pv = prev_value_page_not_finish(); + if (pt && pv) { + ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa); return ret; } - if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) { + if (!pt && !pv) { while (IS_SUCC(ret)) { if (RET_FAIL(get_cur_page_header( time_chunk_meta_, time_in_stream_, cur_time_page_header_, @@ -249,8 +366,7 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock, } } if (IS_SUCC(ret)) { - ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter, - &pa); + ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa); } return ret; } @@ -259,7 +375,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta, common::ByteStream& in_stream, PageHeader& cur_page_header, uint32_t& chunk_visit_offset, - ChunkHeader& chunk_header) { + ChunkHeader& chunk_header, + int32_t* override_buf_size) { int ret = E_OK; bool retry = true; int cur_page_header_serialized_size = 0; @@ -282,7 +399,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta, retry = false; retry_read_want_size += 1024; int32_t& file_data_buf_size = - chunk_header.data_type_ == common::VECTOR + override_buf_size != nullptr ? *override_buf_size + : chunk_header.data_type_ == common::VECTOR ? file_data_time_buf_size_ : file_data_value_buf_size_; // do not shrink buffer for page header, otherwise, the buffer is @@ -326,9 +444,13 @@ int AlignedChunkReader::read_from_file_and_rewrap( (may_shrink && read_size < file_data_buf_size / 10)) { file_data_buf = (char*)mem_realloc(file_data_buf, read_size); if (IS_NULL(file_data_buf)) { + in_stream_.clear_wrapped_buf(); return E_OOM; } file_data_buf_size = read_size; + // Update stream pointer immediately so it stays valid even if + // the subsequent read fails and the caller frees via destroy(). + in_stream_.wrap_from(file_data_buf, read_size); } int ret_read_len = 0; if (RET_FAIL( @@ -563,6 +685,7 @@ int AlignedChunkReader::decode_time_value_buf_into_tsblock( row_appender.append_null(1); \ continue; \ } \ + assert(value_decoder_->has_remaining(value_in)); \ if (!value_decoder_->has_remaining(value_in)) { \ return common::E_DATA_INCONSISTENCY; \ } \ @@ -597,19 +720,19 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK( if (value_page_col_notnull_bitmap_.empty() || ((value_page_col_notnull_bitmap_[cur_value_index / 8] & 0xFF) & (mask >> (cur_value_index % 8))) == 0) { - if (UNLIKELY(!row_appender.add_row())) { - ret = E_OVERFLOW; - cur_value_index--; - break; - } ret = time_decoder_->read_int64(time, time_in); if (ret != E_OK) { break; } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } row_appender.append(0, (char*)&time, sizeof(time)); row_appender.append_null(1); continue; } + assert(value_decoder_->has_remaining(value_in)); if (!value_decoder_->has_remaining(value_in)) { return common::E_DATA_INCONSISTENCY; } @@ -632,6 +755,566 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK( return ret; } +int AlignedChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + int32_t values[BATCH]; + const uint32_t null_mask_base = 1 << 7; + + while (time_decoder_->has_remaining(time_in)) { + if (row_appender.remaining() < (uint32_t)BATCH) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + int nonnull = 0; + for (int i = 0; i < block_count; ++i) { + int vi = cur_value_index + 1 + i; + if (!value_page_col_notnull_bitmap_.empty() && + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) != 0) { + ++nonnull; + } + } + cur_value_index += block_count; + if (nonnull > 0) { + // skip_* may legitimately fail (truncated page) or + // short-read (corrupt bitmap vs. data); both must + // abort the loop rather than silently desync the + // value decoder. Same defect the multi-value path + // already guards against. + int sk = 0; + if (RET_FAIL(value_decoder_->skip_int32(nonnull, sk, + value_in))) { + break; + } + if (sk != nonnull) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count, + time_in))) { + break; + } + if (time_count == 0) break; + + bool is_null[BATCH]; + int nonnull_count = 0; + for (int i = 0; i < time_count; ++i) { + int vi = cur_value_index + 1 + i; + if (value_page_col_notnull_bitmap_.empty() || + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) == 0) { + is_null[i] = true; + } else { + is_null[i] = false; + ++nonnull_count; + } + } + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + if (nonnull_count > 0) { + int skipped = 0; + if (RET_FAIL(value_decoder_->skip_int32(nonnull_count, skipped, + value_in))) { + break; + } + if (skipped != nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + cur_value_index += time_count; + continue; + } + + int value_count = 0; + if (nonnull_count > 0) { + if (RET_FAIL(value_decoder_->read_batch_int32( + values, nonnull_count, value_count, value_in))) { + break; + } + } + + int val_idx = 0; + for (int i = 0; i < time_count; ++i) { + cur_value_index++; + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + if (!is_null[i]) ++val_idx; + continue; + } + if (is_null[i]) { + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append_null(1); + } else { + int32_t val = values[val_idx++]; + if (filter != nullptr && !block_all_pass && + !filter->satisfy(times[i], (int64_t)val)) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&val, sizeof(int32_t)); + } + } + if (ret != E_OK) break; + } + return ret; +} + +int AlignedChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + int64_t values[BATCH]; + const uint32_t null_mask_base = 1 << 7; + + while (time_decoder_->has_remaining(time_in)) { + if (row_appender.remaining() < (uint32_t)BATCH) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check: skip entire block if out of range + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + int nonnull = 0; + for (int i = 0; i < block_count; ++i) { + int vi = cur_value_index + 1 + i; + if (!value_page_col_notnull_bitmap_.empty() && + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) != 0) { + ++nonnull; + } + } + cur_value_index += block_count; + if (nonnull > 0) { + // See i32 path above for the rationale. + int sk = 0; + if (RET_FAIL(value_decoder_->skip_int64(nonnull, sk, + value_in))) { + break; + } + if (sk != nonnull) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count, + time_in))) { + break; + } + if (time_count == 0) break; + + bool is_null[BATCH]; + int nonnull_count = 0; + for (int i = 0; i < time_count; ++i) { + int vi = cur_value_index + 1 + i; + if (value_page_col_notnull_bitmap_.empty() || + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) == 0) { + is_null[i] = true; + } else { + is_null[i] = false; + ++nonnull_count; + } + } + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + if (nonnull_count > 0) { + int skipped = 0; + if (RET_FAIL(value_decoder_->skip_int64(nonnull_count, skipped, + value_in))) { + break; + } + if (skipped != nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + cur_value_index += time_count; + continue; + } + + int value_count = 0; + if (nonnull_count > 0) { + if (RET_FAIL(value_decoder_->read_batch_int64( + values, nonnull_count, value_count, value_in))) { + break; + } + } + + int val_idx = 0; + for (int i = 0; i < time_count; ++i) { + cur_value_index++; + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + if (!is_null[i]) ++val_idx; + continue; + } + if (is_null[i]) { + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append_null(1); + } else { + int64_t val = values[val_idx++]; + if (filter != nullptr && !block_all_pass && + !filter->satisfy(times[i], val)) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&val, sizeof(int64_t)); + } + } + if (ret != E_OK) break; + } + return ret; +} + +int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + float values[BATCH]; + const uint32_t null_mask_base = 1 << 7; + + while (time_decoder_->has_remaining(time_in)) { + if (row_appender.remaining() < (uint32_t)BATCH) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + int nonnull = 0; + for (int i = 0; i < block_count; ++i) { + int vi = cur_value_index + 1 + i; + if (!value_page_col_notnull_bitmap_.empty() && + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) != 0) { + ++nonnull; + } + } + cur_value_index += block_count; + if (nonnull > 0) { + // See i32 path above for the rationale. + int sk = 0; + if (RET_FAIL(value_decoder_->skip_float(nonnull, sk, + value_in))) { + break; + } + if (sk != nonnull) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count, + time_in))) { + break; + } + if (time_count == 0) break; + + bool is_null[BATCH]; + int nonnull_count = 0; + for (int i = 0; i < time_count; ++i) { + int vi = cur_value_index + 1 + i; + if (value_page_col_notnull_bitmap_.empty() || + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) == 0) { + is_null[i] = true; + } else { + is_null[i] = false; + ++nonnull_count; + } + } + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + if (nonnull_count > 0) { + int skipped = 0; + if (RET_FAIL(value_decoder_->skip_float(nonnull_count, skipped, + value_in))) { + break; + } + if (skipped != nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + cur_value_index += time_count; + continue; + } + + int value_count = 0; + if (nonnull_count > 0) { + if (RET_FAIL(value_decoder_->read_batch_float( + values, nonnull_count, value_count, value_in))) { + break; + } + } + + int val_idx = 0; + for (int i = 0; i < time_count; ++i) { + cur_value_index++; + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + if (!is_null[i]) ++val_idx; + continue; + } + if (is_null[i]) { + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append_null(1); + } else { + float val = values[val_idx++]; + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&val, sizeof(float)); + } + } + if (ret != E_OK) break; + } + return ret; +} + +int AlignedChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + double values[BATCH]; + const uint32_t null_mask_base = 1 << 7; + + while (time_decoder_->has_remaining(time_in)) { + if (row_appender.remaining() < (uint32_t)BATCH) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + int nonnull = 0; + for (int i = 0; i < block_count; ++i) { + int vi = cur_value_index + 1 + i; + if (!value_page_col_notnull_bitmap_.empty() && + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) != 0) { + ++nonnull; + } + } + cur_value_index += block_count; + if (nonnull > 0) { + // See i32 path above for the rationale. + int sk = 0; + if (RET_FAIL(value_decoder_->skip_double(nonnull, sk, + value_in))) { + break; + } + if (sk != nonnull) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count, + time_in))) { + break; + } + if (time_count == 0) break; + + bool is_null[BATCH]; + int nonnull_count = 0; + for (int i = 0; i < time_count; ++i) { + int vi = cur_value_index + 1 + i; + if (value_page_col_notnull_bitmap_.empty() || + ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) == 0) { + is_null[i] = true; + } else { + is_null[i] = false; + ++nonnull_count; + } + } + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + if (nonnull_count > 0) { + int skipped = 0; + if (RET_FAIL(value_decoder_->skip_double(nonnull_count, skipped, + value_in))) { + break; + } + if (skipped != nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + cur_value_index += time_count; + continue; + } + + int value_count = 0; + if (nonnull_count > 0) { + if (RET_FAIL(value_decoder_->read_batch_double( + values, nonnull_count, value_count, value_in))) { + break; + } + } + + int val_idx = 0; + for (int i = 0; i < time_count; ++i) { + cur_value_index++; + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + if (!is_null[i]) ++val_idx; + continue; + } + if (is_null[i]) { + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append_null(1); + } else { + double val = values[val_idx++]; + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&val, sizeof(double)); + } + } + if (ret != E_OK) break; + } + return ret; +} + int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype( ByteStream& time_in, ByteStream& value_in, TsBlock* ret_tsblock, Filter* filter, common::PageArena* pa) { @@ -644,23 +1327,24 @@ int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype( break; case common::DATE: case common::INT32: - // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_, - // row_appender); - ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_, - row_appender, filter); + // Batch decode path: read_batch_int{32,64} consumes whole TS_2DIFF + // blocks at once (and uses SIMD when ENABLE_SIMD); replaces a + // per-value decode() loop that hot-dominated the read flame graph. + ret = + i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter); break; case common::TIMESTAMP: case common::INT64: - DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_, - row_appender); + ret = + i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter); break; case common::FLOAT: - DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_, - row_appender); + ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender, + filter); break; case common::DOUBLE: - DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_, - row_appender); + ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender, + filter); break; case common::STRING: case common::BLOB: @@ -695,6 +1379,7 @@ int AlignedChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK( } if (should_read_data) { + assert(value_decoder_->has_remaining(value_in)); if (!value_decoder_->has_remaining(value_in)) { return E_DATA_INCONSISTENCY; } @@ -740,21 +1425,15 @@ bool AlignedChunkReader::should_skip_page_by_offset(int& row_offset) { if (row_offset <= 0) { return false; } - // Aligned TV pages: only skip a whole page by count when both page headers - // expose the same positive row count. Using a single side (or min) when - // the other is missing or unequal can desynchronize row_offset from - // decoded row order vs. the paired time/value stream. - Statistic* ts = cur_time_page_header_.statistic_; - Statistic* vs = cur_value_page_header_.statistic_; - if (ts == nullptr || vs == nullptr) { - return false; + // Use time page statistic for count. + Statistic* stat = cur_time_page_header_.statistic_; + if (stat == nullptr) { + stat = cur_value_page_header_.statistic_; } - int32_t tc = ts->count_; - int32_t vc = vs->count_; - if (tc <= 0 || vc <= 0 || tc != vc) { + if (stat == nullptr || stat->count_ == 0) { return false; } - int32_t count = tc; + int32_t count = stat->count_; if (row_offset >= count) { row_offset -= count; return true; @@ -766,6 +1445,19 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock, Filter* oneshoot_filter, PageArena& pa, int64_t min_time_hint, int& row_offset, int& row_limit) { + if (multi_value_mode_) { + // Multi-value aligned path doesn't yet honour row_offset / row_limit + // / min_time_hint — they get dropped on the floor, which silently + // returns full chunk data when the caller asked for a sub-range. + // Refuse the combination so the caller sees an actual error instead + // of garbage results. set_row_range(0, -1) keeps the all-rows + // contract intact for normal queries. + if (row_offset > 0 || row_limit >= 0 || + min_time_hint != std::numeric_limits::min()) { + return common::E_NOT_SUPPORT; + } + return get_next_page_multi(ret_tsblock, oneshoot_filter, pa); + } int ret = E_OK; Filter* filter = (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_); @@ -774,12 +1466,14 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock, return E_NO_MORE_DATA; } - if (prev_time_page_not_finish() && prev_value_page_not_finish()) { - ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter, - &pa); + bool pt = prev_time_page_not_finish(); + bool pv = prev_value_page_not_finish(); + + if (pt && pv) { + ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa); return ret; } - if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) { + if (!pt && !pv) { while (IS_SUCC(ret)) { if (RET_FAIL(get_cur_page_header( time_chunk_meta_, time_in_stream_, cur_time_page_header_, @@ -810,10 +1504,1560 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock, } } if (IS_SUCC(ret)) { - ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter, - &pa); + ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa); + } + return ret; +} + +// ══════════════════════════════════════════════════════════════════════════ +// Multi-value AlignedChunkReader implementation +// ══════════════════════════════════════════════════════════════════════════ + +int AlignedChunkReader::load_by_aligned_meta_multi( + ChunkMeta* time_chunk_meta, const std::vector& value_metas) { + int ret = E_OK; + multi_value_mode_ = true; + time_chunk_meta_ = time_chunk_meta; + page_plan_built_ = false; + current_page_loaded_ = false; + current_page_plan_index_ = 0; + time_predecoded_ = false; + page_all_times_.clear(); + page_time_count_ = 0; + page_time_cursor_ = 0; + + // ── Load time chunk header ── + file_data_time_buf_size_ = 1024; + int32_t ret_read_len = 0; + char* time_file_data_buf = + (char*)mem_alloc(file_data_time_buf_size_, MOD_CHUNK_READER); + if (IS_NULL(time_file_data_buf)) return E_OOM; + + ret = read_file_->read(time_chunk_meta_->offset_of_chunk_header_, + time_file_data_buf, file_data_time_buf_size_, + ret_read_len); + if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) { + ret = E_TSFILE_CORRUPTED; + mem_free(time_file_data_buf); + return ret; + } + if (IS_SUCC(ret)) { + time_in_stream_.wrap_from(time_file_data_buf, ret_read_len); + if (RET_FAIL(time_chunk_header_.deserialize_from(time_in_stream_))) { + return ret; + } + time_chunk_visit_offset_ = time_in_stream_.read_pos(); + } + + // Alloc time decoder/compressor + if (IS_SUCC(ret)) { + if (RET_FAIL(alloc_compressor_and_decoder( + time_decoder_, time_compressor_, + time_chunk_header_.encoding_type_, + time_chunk_header_.data_type_, + time_chunk_header_.compression_type_))) { + return ret; + } + } + + // ── Load each value column ── + // Reuse existing ValueColumnState objects if count matches (reset() already + // cleared their internal state). Otherwise, recreate. + if (value_columns_.size() != value_metas.size()) { + for (auto* p : value_columns_) delete p; + value_columns_.clear(); + value_columns_.reserve(value_metas.size()); + for (size_t c = 0; c < value_metas.size(); c++) { + value_columns_.push_back(new ValueColumnState); + } + } + for (size_t c = 0; c < value_metas.size() && IS_SUCC(ret); c++) { + auto* col = value_columns_[c]; + col->chunk_meta = value_metas[c]; + col->file_data_buf_size = 1024; + ret_read_len = 0; + char* vbuf = + (char*)mem_alloc(col->file_data_buf_size, MOD_CHUNK_READER); + if (IS_NULL(vbuf)) return E_OOM; + + ret = read_file_->read(col->chunk_meta->offset_of_chunk_header_, vbuf, + col->file_data_buf_size, ret_read_len); + if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) { + ret = E_TSFILE_CORRUPTED; + mem_free(vbuf); + break; + } + if (IS_SUCC(ret)) { + col->in_stream.wrap_from(vbuf, ret_read_len); + if (RET_FAIL(col->chunk_header.deserialize_from(col->in_stream))) { + break; + } + col->chunk_visit_offset = col->in_stream.read_pos(); + if (RET_FAIL(alloc_compressor_and_decoder( + col->decoder, col->compressor, + col->chunk_header.encoding_type_, + col->chunk_header.data_type_, + col->chunk_header.compression_type_))) { + break; + } + } + } + + return ret; +} + +bool AlignedChunkReader::has_more_data_multi() const { + if (page_plan_built_) { + if (current_page_loaded_) { + return page_time_cursor_ < page_time_count_; + } + return current_page_plan_index_ < chunk_pages_.size(); + } + if (prev_time_page_not_finish() || prev_any_value_page_not_finish_multi()) { + return true; + } + if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ < + time_chunk_header_.data_size_) { + return true; + } + for (const auto* col : value_columns_) { + if (col->chunk_visit_offset - col->chunk_header.serialized_size_ < + col->chunk_header.data_size_) { + return true; + } + } + return false; +} + +bool AlignedChunkReader::prev_any_value_page_not_finish_multi() const { + for (const auto* col : value_columns_) { + if ((col->decoder && col->decoder->has_remaining(col->in)) || + col->in.has_remaining()) { + return true; + } + } + return false; +} + +bool AlignedChunkReader::has_variable_length_value_column() const { + for (const auto* col : value_columns_) { + if (col->chunk_header.data_type_ == common::STRING || + col->chunk_header.data_type_ == common::TEXT || + col->chunk_header.data_type_ == common::BLOB) { + return true; + } + } + return false; +} + +int AlignedChunkReader::count_non_null_prefix( + const std::vector& bitmap, int32_t row_limit) const { + if (row_limit <= 0 || bitmap.empty()) { + return 0; + } + const uint32_t mask_base = 1 << 7; + int count = 0; + for (int32_t i = 0; i < row_limit; i++) { + if (((bitmap[i / 8] & 0xFF) & (mask_base >> (i % 8))) != 0) { + count++; + } + } + return count; +} + +int AlignedChunkReader::decode_time_page_direct( + const ChunkPageInfo& page_info, std::vector& out_times) { + return decode_time_page_with(page_info, out_times, time_decoder_, + time_compressor_); +} + +// Worker-safe variant: uses caller-provided decoder + compressor instead of +// the shared time_decoder_/time_compressor_ members. Used by the parallel +// time-page decode dispatch in decode_all_planned_pages. +int AlignedChunkReader::decode_time_page_with(const ChunkPageInfo& page_info, + std::vector& out_times, + Decoder* decoder, + Compressor* compressor) { + out_times.clear(); + if (page_info.time_compressed_size == 0) { + return E_OK; + } + + char stack_buf[4096]; + char* compressed_buf = stack_buf; + bool heap = page_info.time_compressed_size > sizeof(stack_buf); + if (heap) { + compressed_buf = static_cast(common::mem_alloc( + page_info.time_compressed_size, common::MOD_DEFAULT)); + if (compressed_buf == nullptr) { + return E_OOM; + } + } + + int32_t read_len = 0; + int ret = read_file_->read(page_info.time_file_offset, compressed_buf, + page_info.time_compressed_size, read_len); + if (IS_FAIL(ret)) { + if (heap) common::mem_free(compressed_buf); + return ret; + } + // ReadFile::read() returns E_OK + short read_len on EOF; uncompressing + // page_info.time_compressed_size from a buffer with uninitialised tail + // bytes would feed garbage to the decompressor. + if (read_len != static_cast(page_info.time_compressed_size)) { + if (heap) common::mem_free(compressed_buf); + return E_TSFILE_CORRUPTED; + } + + char* uncompressed_buf = nullptr; + uint32_t uncompressed_size = 0; + if (RET_FAIL(compressor->reset(false))) { + if (heap) common::mem_free(compressed_buf); + return ret; + } + ret = compressor->uncompress(compressed_buf, page_info.time_compressed_size, + uncompressed_buf, uncompressed_size); + if (heap && compressed_buf != uncompressed_buf) { + common::mem_free(compressed_buf); + } + if (IS_FAIL(ret) || uncompressed_size != page_info.time_uncompressed_size) { + if (uncompressed_buf != nullptr) { + compressor->after_uncompress(uncompressed_buf); + } + return E_TSFILE_CORRUPTED; + } + + common::ByteStream in; + in.wrap_from(uncompressed_buf, uncompressed_size); + decoder->reset(); + const int batch_size = 1024; + int64_t batch[batch_size]; + while (decoder->has_remaining(in)) { + int actual = 0; + if (RET_FAIL( + decoder->read_batch_int64(batch, batch_size, actual, in))) { + break; + } + if (actual == 0) { + break; + } + out_times.insert(out_times.end(), batch, batch + actual); + } + compressor->after_uncompress(uncompressed_buf); + return ret; +} + +int AlignedChunkReader::build_page_plan(Filter* filter) { + int ret = E_OK; + chunk_pages_.clear(); + current_page_plan_index_ = 0; + current_page_loaded_ = false; + page_plan_built_ = false; + + const uint32_t num_cols = value_columns_.size(); + while (IS_SUCC(ret)) { + if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ >= + time_chunk_header_.data_size_) { + break; + } + + if (RET_FAIL(get_cur_page_header( + time_chunk_meta_, time_in_stream_, cur_time_page_header_, + time_chunk_visit_offset_, time_chunk_header_))) { + break; + } + if (cur_time_page_header_.compressed_size_ == 0 && + cur_time_page_header_.uncompressed_size_ == 0) { + break; + } + + ChunkPageInfo page_info; + page_info.time_file_offset = time_chunk_meta_->offset_of_chunk_header_ + + time_chunk_visit_offset_; + page_info.time_compressed_size = cur_time_page_header_.compressed_size_; + page_info.time_uncompressed_size = + cur_time_page_header_.uncompressed_size_; + page_info.value_file_offsets.resize(num_cols); + page_info.value_compressed_sizes.resize(num_cols); + page_info.value_uncompressed_sizes.resize(num_cols); + + for (uint32_t c = 0; c < num_cols && IS_SUCC(ret); c++) { + auto* col = value_columns_[c]; + if (RET_FAIL(get_cur_page_header( + col->chunk_meta, col->in_stream, col->cur_page_header, + col->chunk_visit_offset, col->chunk_header, + &col->file_data_buf_size))) { + break; + } + page_info.value_file_offsets[c] = + col->chunk_meta->offset_of_chunk_header_ + + col->chunk_visit_offset; + page_info.value_compressed_sizes[c] = + col->cur_page_header.compressed_size_; + page_info.value_uncompressed_sizes[c] = + col->cur_page_header.uncompressed_size_; + } + if (IS_FAIL(ret)) { + break; + } + + Statistic* stat = cur_time_page_header_.statistic_; + if (filter == nullptr) { + page_info.pass_type = PagePassType::FULL_PASS; + page_info.row_begin = 0; + page_info.row_end = stat != nullptr ? stat->count_ : 0; + } else if (stat != nullptr && !filter->satisfy(stat)) { + page_info.pass_type = PagePassType::SKIP; + } else if (stat != nullptr && filter->contain_start_end_time( + stat->start_time_, stat->end_time_)) { + page_info.pass_type = PagePassType::FULL_PASS; + page_info.row_begin = 0; + page_info.row_end = stat->count_; + } else { + page_info.pass_type = PagePassType::BOUNDARY; + std::vector times; + if (RET_FAIL(decode_time_page_direct(page_info, times))) { + break; + } + int32_t first = -1; + int32_t last = -1; + for (int32_t i = 0; i < static_cast(times.size()); i++) { + if (filter->satisfy_start_end_time(times[i], times[i])) { + if (first < 0) first = i; + last = i; + } + } + if (first >= 0) { + page_info.row_begin = first; + page_info.row_end = last + 1; + } else { + page_info.pass_type = PagePassType::SKIP; + } + } + + if (page_info.pass_type != PagePassType::SKIP) { + if (page_info.row_end == 0) { + std::vector times; + if (RET_FAIL(decode_time_page_direct(page_info, times))) { + break; + } + page_info.row_end = static_cast(times.size()); + } + if (page_info.row_begin < page_info.row_end) { + chunk_pages_.push_back(std::move(page_info)); + } + } + + time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_; + time_in_stream_.wrapped_buf_advance_read_pos( + cur_time_page_header_.compressed_size_); + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + col->chunk_visit_offset += col->cur_page_header.compressed_size_; + col->in_stream.wrapped_buf_advance_read_pos( + col->cur_page_header.compressed_size_); + } + } + + page_plan_built_ = IS_SUCC(ret); + + if (page_plan_built_) { + per_page_times_.assign(chunk_pages_.size(), std::vector{}); + for (auto* col : value_columns_) { + col->per_page_state.clear(); + col->per_page_state.resize(chunk_pages_.size()); + } + } + return ret; +} + +void AlignedChunkReader::release_current_page_state() { + time_predecoded_ = false; + page_all_times_.clear(); + page_time_count_ = 0; + page_time_cursor_ = 0; + for (auto* col : value_columns_) { + if (col->uncompressed_buf != nullptr && col->compressor != nullptr) { + col->compressor->after_uncompress(col->uncompressed_buf); + col->uncompressed_buf = nullptr; + } + col->notnull_bitmap.clear(); + col->cur_value_index = -1; + col->in.reset(); + for (auto& pps : col->per_page_state) { + pps.predecode_pa.destroy(); + } + col->per_page_state.clear(); + col->pending_decoded_values.clear(); + col->pending_decoded_count = 0; + col->pending_decoded_cursor = 0; + col->pending_decoded = false; + } + per_page_times_.clear(); + current_page_loaded_ = false; +} + +int AlignedChunkReader::decode_value_page_for_slot(uint32_t col_idx, + size_t page_idx) { + const ChunkPageInfo& page_info = chunk_pages_[page_idx]; + auto* col = value_columns_[col_idx]; + auto& pps = col->per_page_state[page_idx]; + + pps.notnull_bitmap.clear(); + pps.predecoded_values.clear(); + pps.predecoded_strings.clear(); + pps.predecoded_read_pos = 0; + pps.predecoded_count = 0; + pps.predecode_pa.destroy(); + + if (page_info.value_compressed_sizes[col_idx] == 0) { + return E_OK; + } + + char stack_buf[4096]; + char* compressed_buf = stack_buf; + bool heap = page_info.value_compressed_sizes[col_idx] > sizeof(stack_buf); + if (heap) { + compressed_buf = static_cast(common::mem_alloc( + page_info.value_compressed_sizes[col_idx], common::MOD_DEFAULT)); + if (compressed_buf == nullptr) return E_OOM; + } + + int32_t read_len = 0; + int ret = + read_file_->read(page_info.value_file_offsets[col_idx], compressed_buf, + page_info.value_compressed_sizes[col_idx], read_len); + if (IS_FAIL(ret)) { + if (heap) common::mem_free(compressed_buf); + return ret; + } + if (read_len != + static_cast(page_info.value_compressed_sizes[col_idx])) { + if (heap) common::mem_free(compressed_buf); + return E_TSFILE_CORRUPTED; + } + + char* uncompressed_buf = nullptr; + uint32_t uncompressed_size = 0; + if (RET_FAIL(col->compressor->reset(false))) { + if (heap) common::mem_free(compressed_buf); + return ret; + } + ret = col->compressor->uncompress(compressed_buf, + page_info.value_compressed_sizes[col_idx], + uncompressed_buf, uncompressed_size); + if (heap && compressed_buf != uncompressed_buf) { + common::mem_free(compressed_buf); + } + if (IS_FAIL(ret) || + uncompressed_size != page_info.value_uncompressed_sizes[col_idx]) { + if (uncompressed_buf != nullptr) { + col->compressor->after_uncompress(uncompressed_buf); + } + return E_TSFILE_CORRUPTED; + } + // The value page begins with a uint32 data_num followed by a bitmap of + // ceil(data_num/8) bytes; a corrupt or truncated page that doesn't even + // hold the data_num header would let read_ui32() walk past the buffer. + if (uncompressed_size < sizeof(uint32_t)) { + col->compressor->after_uncompress(uncompressed_buf); + return E_TSFILE_CORRUPTED; + } + + uint32_t offset = 0; + uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf); + offset += sizeof(uint32_t); + uint32_t bitmap_bytes = (data_num + 7) / 8; + if (uncompressed_size - offset < bitmap_bytes) { + col->compressor->after_uncompress(uncompressed_buf); + return E_TSFILE_CORRUPTED; + } + pps.notnull_bitmap.resize(bitmap_bytes); + for (size_t i = 0; i < pps.notnull_bitmap.size(); i++) { + pps.notnull_bitmap[i] = *(uncompressed_buf + offset++); + } + + char* value_buf = uncompressed_buf + offset; + uint32_t value_buf_size = uncompressed_size - offset; + common::ByteStream in; + in.wrap_from(value_buf, value_buf_size); + col->decoder->reset(); + + auto dt = col->chunk_header.data_type_; + int nonnull_total = count_non_null_prefix(pps.notnull_bitmap, + static_cast(data_num)); + int prefix_nonnull = + count_non_null_prefix(pps.notnull_bitmap, page_info.row_begin); + pps.predecoded_read_pos = prefix_nonnull; + + auto cleanup = [&]() { + col->compressor->after_uncompress(uncompressed_buf); + }; + + if (dt == common::STRING || dt == common::TEXT || dt == common::BLOB) { + pps.predecode_pa.init(512, common::MOD_TSFILE_READER); + pps.predecoded_strings.resize(nonnull_total); + for (int i = 0; i < nonnull_total; i++) { + if (RET_FAIL(col->decoder->read_String(pps.predecoded_strings[i], + pps.predecode_pa, in))) { + cleanup(); + return ret; + } + } + pps.predecoded_count = nonnull_total; + cleanup(); + return E_OK; + } + + if (nonnull_total == 0) { + cleanup(); + return E_OK; + } + + uint32_t elem_size = common::get_data_type_size(dt); + pps.predecoded_values.resize(static_cast(nonnull_total) * + elem_size); + int actual = 0; + switch (dt) { + case common::BOOLEAN: { + bool* out = reinterpret_cast(pps.predecoded_values.data()); + for (int i = 0; i < nonnull_total; i++) { + if (RET_FAIL(col->decoder->read_boolean(out[i], in))) { + cleanup(); + return ret; + } + } + actual = nonnull_total; + break; + } + case common::INT32: + case common::DATE: + if (RET_FAIL(col->decoder->read_batch_int32( + reinterpret_cast(pps.predecoded_values.data()), + nonnull_total, actual, in))) { + cleanup(); + return ret; + } + break; + case common::INT64: + case common::TIMESTAMP: + if (RET_FAIL(col->decoder->read_batch_int64( + reinterpret_cast(pps.predecoded_values.data()), + nonnull_total, actual, in))) { + cleanup(); + return ret; + } + break; + case common::FLOAT: + if (RET_FAIL(col->decoder->read_batch_float( + reinterpret_cast(pps.predecoded_values.data()), + nonnull_total, actual, in))) { + cleanup(); + return ret; + } + break; + case common::DOUBLE: + if (RET_FAIL(col->decoder->read_batch_double( + reinterpret_cast(pps.predecoded_values.data()), + nonnull_total, actual, in))) { + cleanup(); + return ret; + } + break; + default: + cleanup(); + return E_NOT_SUPPORT; + } + pps.predecoded_count = actual; + cleanup(); + return E_OK; +} + +// Multi-thread path: one task per value column, each decoding all non-SKIP +// pages of that column serially. Time pages dispatched as worker-bucketed +// strided tasks using per-worker decoder/compressor (filled from +// time_decoder_pool_ / time_compressor_pool_) so they don't contend on the +// shared time_decoder_/time_compressor_. +// +// Single-thread: do NOT pre-decode every page upfront — leave per_page_state +// empty so the scatter loop decodes on demand and releases after each page +// (see decode_page_lazy() / release_page_slot()). Bounds memory to one page. +int AlignedChunkReader::decode_all_planned_pages() { + if (chunk_pages_.empty()) return E_OK; + +#ifdef ENABLE_THREADS + if (decode_pool_ != nullptr && value_columns_.size() > 1) { + // Lazily grow the per-worker time decoder/compressor pool. Both + // factories can return nullptr on OOM/unsupported config; without + // checking, the worker task below dereferences null when calling + // decode_time_page_with(). + size_t worker_count = decode_pool_->num_threads(); + if (time_decoder_pool_.size() < worker_count) { + time_decoder_pool_.resize(worker_count, nullptr); + time_compressor_pool_.resize(worker_count, nullptr); + for (size_t w = 0; w < worker_count; w++) { + if (time_decoder_pool_[w] == nullptr) { + time_decoder_pool_[w] = + DecoderFactory::alloc_time_decoder(); + if (time_decoder_pool_[w] == nullptr) return E_OOM; + } + if (time_compressor_pool_[w] == nullptr) { + time_compressor_pool_[w] = + CompressorFactory::alloc_compressor( + time_chunk_header_.compression_type_); + if (time_compressor_pool_[w] == nullptr) return E_OOM; + } + } + } + + std::vector> futures; + std::vector col_rets(value_columns_.size(), E_OK); + for (uint32_t c = 0; c < value_columns_.size(); c++) { + int* col_ret = &col_rets[c]; + futures.push_back(decode_pool_->submit([this, c, col_ret]() { + for (size_t p = 0; p < chunk_pages_.size(); p++) { + int r = decode_value_page_for_slot(c, p); + if (IS_FAIL(r)) { + *col_ret = r; + return; + } + } + })); + } + // Time pages dispatched in worker-sized chunks (one task per worker) + // to amortize submit/wait overhead. Stride for load balance. + size_t time_task_count = std::min(worker_count, chunk_pages_.size()); + std::vector time_rets(time_task_count, E_OK); + for (size_t k = 0; k < time_task_count; k++) { + int* tr = &time_rets[k]; + futures.push_back(decode_pool_->submit( + [this, k, tr, time_task_count, worker_count]() { + size_t wid = common::ThreadPool::current_worker_id(); + if (wid >= worker_count) wid = 0; + Decoder* dec = time_decoder_pool_[wid]; + Compressor* comp = time_compressor_pool_[wid]; + for (size_t p = k; p < chunk_pages_.size(); + p += time_task_count) { + int r = decode_time_page_with( + chunk_pages_[p], per_page_times_[p], dec, comp); + if (IS_FAIL(r)) { + *tr = r; + return; + } + } + })); + } + // Wait on each task's own future rather than draining the whole pool: + // it is shared process-wide, so wait_all() would also block on + // unrelated concurrent operations' tasks still in flight. + for (auto& f : futures) f.get(); + for (auto r : time_rets) { + if (IS_FAIL(r)) return r; + } + for (uint32_t c = 0; c < value_columns_.size(); c++) { + if (IS_FAIL(col_rets[c])) return col_rets[c]; + } + return E_OK; + } +#endif + // Single-thread: defer decode to scatter time. + return E_OK; +} + +// Decode time + all value columns for a single page slot on demand. +// Used by the single-thread path to keep memory bounded to one page. +int AlignedChunkReader::decode_page_lazy(size_t page_idx) { + int ret = E_OK; + if (RET_FAIL(decode_time_page_direct(chunk_pages_[page_idx], + per_page_times_[page_idx]))) { + return ret; + } + for (uint32_t c = 0; c < value_columns_.size(); c++) { + if (RET_FAIL(decode_value_page_for_slot(c, page_idx))) { + return ret; + } + } + return E_OK; +} + +// Release the decoded buffers of one page slot so they can be reused by the +// next page (keeps memory footprint bounded for the single-thread path). +void AlignedChunkReader::release_page_slot(size_t page_idx) { + std::vector{}.swap(per_page_times_[page_idx]); + for (auto* col : value_columns_) { + if (page_idx >= col->per_page_state.size()) continue; + auto& pps = col->per_page_state[page_idx]; + std::vector{}.swap(pps.notnull_bitmap); + std::vector{}.swap(pps.predecoded_values); + std::vector{}.swap(pps.predecoded_strings); + pps.predecode_pa.destroy(); + pps.predecoded_count = 0; + pps.predecoded_read_pos = 0; + } +} + +int AlignedChunkReader::get_next_page_multi(TsBlock* ret_tsblock, + Filter* oneshoot_filter, + PageArena& pa) { + int ret = E_OK; + Filter* filter = + (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_); + + // Dispatch: + // - Multi-column with a thread pool → chunk-level pre-decode: one task + // per value column decodes that column's whole chunk up front, then the + // scatter loop bulk-memcpys. decode_all_planned_pages() works for any + // column count. (An earlier cutoff sent >6 columns down the serial + // path because per_page_state — the upfront predecode buffer — grows + // with column count and was feared to thrash cache; it still grows, so + // very wide aligned chunks are the case to watch if reads regress.) + // - Single column, or no thread pool → serial path: decode the current + // page's columns inline (multi_DECODE_TV_BATCH), no thread-pool + // fan-out. +#ifdef ENABLE_THREADS + const bool use_chunk_level = + decode_pool_ != nullptr && value_columns_.size() > 1; +#else + const bool use_chunk_level = false; +#endif + if (!use_chunk_level) { + return get_next_page_multi_serial(ret_tsblock, filter, pa); + } + + if (!page_plan_built_) { + if (RET_FAIL(build_page_plan(filter))) { + return ret; + } + if (RET_FAIL(decode_all_planned_pages())) { + return ret; + } + } + if (chunk_pages_.empty()) { + return E_NO_MORE_DATA; + } + + const uint32_t null_mask_base = 1 << 7; + const uint32_t num_cols = value_columns_.size(); + RowAppender row_appender(ret_tsblock); + // Detect single-thread lazy mode by whether decode_all_planned_pages left + // per_page_times_ empty (it leaves slots empty when there's no pool). + const bool single_thread_lazy = per_page_times_[0].empty(); + + while (current_page_plan_index_ < chunk_pages_.size()) { + const ChunkPageInfo& page_info = chunk_pages_[current_page_plan_index_]; + + if (!current_page_loaded_) { + if (single_thread_lazy) { + if (RET_FAIL(decode_page_lazy(current_page_plan_index_))) { + return ret; + } + } + page_time_cursor_ = page_info.row_begin; + page_time_count_ = page_info.row_end; + current_page_loaded_ = true; + } + const std::vector& times = + per_page_times_[current_page_plan_index_]; + + int32_t remaining_in_page = page_time_count_ - page_time_cursor_; + uint32_t budget = row_appender.remaining(); + + // Fast path: FULL_PASS page, no nulls in any value column, types + // match destination, budget > 0. Bulk-memcpys up to + // min(budget, remaining_in_page) rows from page_time_cursor_; tail + // pages of an SSI tsblock still take the memcpy path instead of + // falling into the row-by-row scatter loop. + bool can_bulk = page_info.pass_type == PagePassType::FULL_PASS && + remaining_in_page > 0 && budget > 0; + if (can_bulk) { + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + auto& pps = col->per_page_state[current_page_plan_index_]; + auto dt = col->chunk_header.data_type_; + if (dt == common::STRING || dt == common::TEXT || + dt == common::BLOB || + ret_tsblock->get_vector(c + 1)->get_vector_type() != dt || + pps.predecoded_count != page_time_count_) { + can_bulk = false; + break; + } + } + } + + if (can_bulk) { + uint32_t bulk_count = + std::min(budget, static_cast(remaining_in_page)); + size_t time_byte_off = + static_cast(page_time_cursor_) * sizeof(int64_t); + // Bulk-append both bytes AND row count for every Vector. + // Skipping add_row_nums() would leave each Vector's row_num_ + // at 0 while the TsBlock-level row_count_ jumped to bulk_count; + // fill_trailling_nulls() would then mark every just-written + // row as null, and column iterators would report the wrong + // length. + common::Vector* time_vec = ret_tsblock->get_vector(0); + time_vec->get_value_data().append_fixed_value( + reinterpret_cast(times.data()) + time_byte_off, + bulk_count * sizeof(int64_t)); + time_vec->add_row_nums(bulk_count); + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + auto& pps = col->per_page_state[current_page_plan_index_]; + uint32_t elem_size = + common::get_data_type_size(col->chunk_header.data_type_); + common::Vector* vec = ret_tsblock->get_vector(c + 1); + vec->get_value_data().append_fixed_value( + pps.predecoded_values.data() + + static_cast(page_time_cursor_) * elem_size, + bulk_count * elem_size); + vec->add_row_nums(bulk_count); + } + row_appender.add_rows(bulk_count); + page_time_cursor_ += bulk_count; + if (page_time_cursor_ >= page_time_count_) { + if (single_thread_lazy) { + release_page_slot(current_page_plan_index_); + } + current_page_plan_index_++; + current_page_loaded_ = false; + continue; + } + // Budget exhausted mid-page; caller will drain and resume. + return E_OK; + } + + // Slow path: row-by-row. Handles null bitmap, type promotion, + // BOUNDARY pages, and partial-page E_OVERFLOW. + // BOUNDARY pages: build_page_plan compressed the page to the + // [first-hit, last-hit] range, but timestamps inside that range may + // still fail the filter (e.g. TimeIn({2, 8}) leaves 3..7 unmatched). + // Re-apply the filter per timestamp here, advancing predecoded + // read positions for skipped non-null rows so the cursor stays + // aligned with the page's value layout. + const bool boundary_filter = + page_info.pass_type == PagePassType::BOUNDARY && filter != nullptr; + while (page_time_cursor_ < page_time_count_) { + if (row_appender.remaining() == 0) { + return E_OK; + } + int64_t ts = times[page_time_cursor_]; + if (boundary_filter && !filter->satisfy_start_end_time(ts, ts)) { + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + auto& pps = col->per_page_state[current_page_plan_index_]; + bool is_null = true; + if (!pps.notnull_bitmap.empty()) { + is_null = + ((pps.notnull_bitmap[page_time_cursor_ / 8] & + 0xFF) & + (null_mask_base >> (page_time_cursor_ % 8))) == 0; + } + if (!is_null) pps.predecoded_read_pos++; + } + page_time_cursor_++; + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + return E_OK; + } + row_appender.append(0, reinterpret_cast(&ts), sizeof(ts)); + + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + auto& pps = col->per_page_state[current_page_plan_index_]; + bool is_null = true; + if (!pps.notnull_bitmap.empty()) { + is_null = + ((pps.notnull_bitmap[page_time_cursor_ / 8] & 0xFF) & + (null_mask_base >> (page_time_cursor_ % 8))) == 0; + } + if (is_null) { + row_appender.append_null(c + 1); + continue; + } + if (col->chunk_header.data_type_ == common::STRING || + col->chunk_header.data_type_ == common::TEXT || + col->chunk_header.data_type_ == common::BLOB) { + const common::String& value = + pps.predecoded_strings[pps.predecoded_read_pos++]; + row_appender.append(c + 1, value.buf_, value.len_); + } else { + uint32_t elem_size = common::get_data_type_size( + col->chunk_header.data_type_); + row_appender.append( + c + 1, + pps.predecoded_values.data() + + static_cast(pps.predecoded_read_pos++) * + elem_size, + elem_size); + } + } + page_time_cursor_++; + } + + if (single_thread_lazy) { + release_page_slot(current_page_plan_index_); + } + current_page_plan_index_++; + current_page_loaded_ = false; + } + return E_NO_MORE_DATA; +} + +int AlignedChunkReader::get_next_page_multi_serial(TsBlock* ret_tsblock, + Filter* filter, + PageArena& pa) { + int ret = E_OK; + bool pt = prev_time_page_not_finish(); + bool pv = prev_any_value_page_not_finish_multi(); + if (pt && pv) { + ret = + decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa); + return ret; + } + if (!pt && !pv) { + while (IS_SUCC(ret)) { + if (RET_FAIL(get_cur_page_header( + time_chunk_meta_, time_in_stream_, cur_time_page_header_, + time_chunk_visit_offset_, time_chunk_header_))) { + break; + } + for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) { + auto* col = value_columns_[c]; + if (RET_FAIL(get_cur_page_header( + col->chunk_meta, col->in_stream, col->cur_page_header, + col->chunk_visit_offset, col->chunk_header, + &col->file_data_buf_size))) { + } + } + if (IS_FAIL(ret)) break; + if (cur_page_statisify_filter_multi(filter)) break; + if (RET_FAIL(skip_cur_page_multi())) break; + if (!has_more_data()) { + ret = E_NO_MORE_DATA; + break; + } + } + if (IS_SUCC(ret)) { + ret = decode_cur_time_page_data(); + if (IS_SUCC(ret)) ret = decode_cur_value_pages_multi(); + } + } + if (IS_SUCC(ret)) { + ret = + decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa); + } + return ret; +} + +bool AlignedChunkReader::cur_page_statisify_filter_multi(Filter* filter) { + bool time_satisfy = filter == nullptr || + cur_time_page_header_.statistic_ == nullptr || + filter->satisfy(cur_time_page_header_.statistic_); + return time_satisfy; +} + +int AlignedChunkReader::skip_cur_page_multi() { + time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_; + time_in_stream_.wrapped_buf_advance_read_pos( + cur_time_page_header_.compressed_size_); + for (auto* col : value_columns_) { + col->chunk_visit_offset += col->cur_page_header.compressed_size_; + col->in_stream.wrapped_buf_advance_read_pos( + col->cur_page_header.compressed_size_); + } + return E_OK; +} + +int AlignedChunkReader::decode_cur_value_pages_multi() { + int ret = E_OK; + // Phase 1: Serial IO — ensure each column's page data is in memory. + for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) { + ret = ensure_value_page_loaded(*value_columns_[c]); + } + if (IS_FAIL(ret)) return ret; + + // Phase 2: decompress + parse bitmap + reset decoder for each column's + // current page, inline. This serial path now only runs for single-column + // reads or when no thread pool exists — multi-column reads with a pool take + // the chunk-level path (decode_all_planned_pages), so there is no per-page + // thread-pool fan-out here anymore. predecode=false lets the scatter loop + // (multi_DECODE_TV_BATCH) decode inline, which has better cache locality + // when there is no parallelism to amortize an extra predecode buffer write. + for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) { + ret = decompress_and_parse_value_page(*value_columns_[c], false); + } + return ret; +} + +int AlignedChunkReader::decode_cur_value_page_data_for(ValueColumnState& col) { + int ret = E_OK; + + // Step 1: ensure full page data is loaded + if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) { + if (RET_FAIL(read_from_file_and_rewrap( + col.in_stream, col.chunk_meta, col.chunk_visit_offset, + col.file_data_buf_size, + col.cur_page_header.compressed_size_))) { + return ret; + } + } + + if (col.cur_page_header.compressed_size_ == 0) { + col.in.wrap_from(nullptr, 0); + return E_OK; + } + + // Step 2: uncompress + char* compressed_buf = + col.in_stream.get_wrapped_buf() + col.in_stream.read_pos(); + uint32_t compressed_size = col.cur_page_header.compressed_size_; + col.in_stream.wrapped_buf_advance_read_pos(compressed_size); + col.chunk_visit_offset += compressed_size; + + char* uncompressed_buf = nullptr; + uint32_t uncompressed_size = 0; + if (RET_FAIL(col.compressor->reset(false))) { + return ret; + } + if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size, + uncompressed_buf, + uncompressed_size))) { + return ret; + } + col.uncompressed_buf = uncompressed_buf; + + if (uncompressed_size != col.cur_page_header.uncompressed_size_) { + return E_TSFILE_CORRUPTED; + } + + // Step 3: parse bitmap + value data + if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED; + uint32_t offset = 0; + uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf); + offset += sizeof(uint32_t); + uint32_t bitmap_bytes = (data_num + 7) / 8; + if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED; + col.notnull_bitmap.resize(bitmap_bytes); + for (size_t i = 0; i < col.notnull_bitmap.size(); i++) { + col.notnull_bitmap[i] = *(uncompressed_buf + offset); + offset++; + } + col.cur_value_index = -1; + + char* value_buf = uncompressed_buf + offset; + uint32_t value_buf_size = uncompressed_size - offset; + col.decoder->reset(); + col.in.wrap_from(value_buf, value_buf_size); + return ret; +} + +int AlignedChunkReader::ensure_value_page_loaded(ValueColumnState& col) { + int ret = E_OK; + if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) { + if (RET_FAIL(read_from_file_and_rewrap( + col.in_stream, col.chunk_meta, col.chunk_visit_offset, + col.file_data_buf_size, + col.cur_page_header.compressed_size_))) { + return ret; + } + } + return ret; +} + +int AlignedChunkReader::decompress_and_parse_value_page(ValueColumnState& col, + bool predecode) { + int ret = E_OK; + + if (col.cur_page_header.compressed_size_ == 0) { + col.in.wrap_from(nullptr, 0); + return E_OK; + } + + // Decompress + char* compressed_buf = + col.in_stream.get_wrapped_buf() + col.in_stream.read_pos(); + uint32_t compressed_size = col.cur_page_header.compressed_size_; + col.in_stream.wrapped_buf_advance_read_pos(compressed_size); + col.chunk_visit_offset += compressed_size; + + char* uncompressed_buf = nullptr; + uint32_t uncompressed_size = 0; + if (RET_FAIL(col.compressor->reset(false))) { + return ret; + } + if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size, + uncompressed_buf, + uncompressed_size))) { + return ret; + } + col.uncompressed_buf = uncompressed_buf; + + if (uncompressed_size != col.cur_page_header.uncompressed_size_) { + return E_TSFILE_CORRUPTED; + } + + // Parse bitmap + value data + if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED; + uint32_t offset = 0; + uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf); + offset += sizeof(uint32_t); + uint32_t bitmap_bytes = (data_num + 7) / 8; + if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED; + col.notnull_bitmap.resize(bitmap_bytes); + for (size_t i = 0; i < col.notnull_bitmap.size(); i++) { + col.notnull_bitmap[i] = *(uncompressed_buf + offset); + offset++; + } + col.cur_value_index = -1; + + char* value_buf = uncompressed_buf + offset; + uint32_t value_buf_size = uncompressed_size - offset; + col.decoder->reset(); + col.in.wrap_from(value_buf, value_buf_size); + + // Pre-decode all non-null values into pending_decoded_values so the + // scatter loop (multi_DECODE_TV_BATCH) just memcpys instead of calling + // the decoder. Moves the expensive int64/double decode into the worker + // task so it runs in parallel. Only handles fixed-length types — strings + // stay on the inline-decode path. + col.pending_decoded = false; + col.pending_decoded_count = 0; + col.pending_decoded_cursor = 0; + auto dt = col.chunk_header.data_type_; + if (predecode && dt != common::STRING && dt != common::TEXT && + dt != common::BLOB) { + int nonnull_total = 0; + for (uint32_t i = 0; i < data_num; i++) { + if ((col.notnull_bitmap[i / 8] & (0x80 >> (i % 8))) != 0) { + nonnull_total++; + } + } + if (nonnull_total > 0) { + uint32_t elem_size = common::get_data_type_size(dt); + col.pending_decoded_values.resize( + static_cast(nonnull_total) * elem_size); + int actual = 0; + int rret = common::E_OK; + switch (dt) { + case common::BOOLEAN: { + bool* out = reinterpret_cast( + col.pending_decoded_values.data()); + for (int i = 0; i < nonnull_total; i++) { + bool v; + if (col.decoder->read_boolean(v, col.in) != + common::E_OK) { + rret = common::E_OUT_OF_RANGE; + break; + } + out[i] = v; + } + actual = nonnull_total; + break; + } + case common::INT32: + case common::DATE: + rret = col.decoder->read_batch_int32( + reinterpret_cast( + col.pending_decoded_values.data()), + nonnull_total, actual, col.in); + break; + case common::INT64: + case common::TIMESTAMP: + rret = col.decoder->read_batch_int64( + reinterpret_cast( + col.pending_decoded_values.data()), + nonnull_total, actual, col.in); + break; + case common::FLOAT: + rret = col.decoder->read_batch_float( + reinterpret_cast( + col.pending_decoded_values.data()), + nonnull_total, actual, col.in); + break; + case common::DOUBLE: + rret = col.decoder->read_batch_double( + reinterpret_cast( + col.pending_decoded_values.data()), + nonnull_total, actual, col.in); + break; + default: + rret = common::E_OUT_OF_RANGE; + } + if (rret == common::E_OK && actual == nonnull_total) { + col.pending_decoded_count = nonnull_total; + col.pending_decoded = true; + } + } else { + col.pending_decoded = true; // empty page is trivially predecoded + } + } + return ret; +} + +int AlignedChunkReader::decode_time_value_buf_into_tsblock_multi( + TsBlock*& ret_tsblock, Filter* filter, PageArena* pa) { + int ret = E_OK; + RowAppender row_appender(ret_tsblock); + ret = multi_DECODE_TV_BATCH(ret_tsblock, row_appender, filter, pa); + + // Release uncompressed buffers if pages are done + if (ret != E_OVERFLOW) { + if (time_uncompressed_buf_ != nullptr) { + time_compressor_->after_uncompress(time_uncompressed_buf_); + time_uncompressed_buf_ = nullptr; + } + for (auto* col : value_columns_) { + if (col->uncompressed_buf != nullptr) { + col->compressor->after_uncompress(col->uncompressed_buf); + col->uncompressed_buf = nullptr; + } + // The time stream and bitmap define the page's row/value count. + // Once the page is fully processed, bytes left in an all-null + // value stream are only encoder terminators or padding and must + // not make has_more_data_multi() treat the page as unfinished. + col->in.reset(); + col->notnull_bitmap.clear(); + col->notnull_bitmap.shrink_to_fit(); + } + if (!prev_time_page_not_finish()) { + time_in_.reset(); + } + } else { + ret = E_OK; + } + return ret; +} + +int AlignedChunkReader::multi_DECODE_TV_BATCH(TsBlock* ret_tsblock, + RowAppender& row_appender, + Filter* filter, PageArena* pa) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + const uint32_t null_mask_base = 1 << 7; + const uint32_t num_cols = value_columns_.size(); + + while (time_decoder_->has_remaining(time_in_)) { + // Cap each pass to what the appender can still hold; mirrors the fix + // in ChunkReader's per-type batch loops. A blanket "remaining < BATCH + // → E_OVERFLOW" made progress impossible whenever the caller handed + // us a TsBlock with capacity below BATCH (e.g. small per-block sizes + // in multi-chunk queries). + int eff_batch = + std::min(BATCH, static_cast(row_appender.remaining())); + if (eff_batch <= 0) { + ret = E_OVERFLOW; + break; + } + + // ── Phase 1: Decode a batch of timestamps ── + int time_count = 0; + if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch, + time_count, time_in_))) { + break; + } + if (time_count == 0) break; + + // ── Phase 2: Apply time filter ── + bool time_mask[BATCH]; + bool block_all_pass = (filter == nullptr); + int pass_count = time_count; + if (!block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + // ── Phase 3: Per-column null check + value decode ── + // For each column, compute null flags and decode non-null values. + // We store decoded values in column-specific buffers. + // Max 8 bytes per value, 129 values per batch. + struct ColBatch { + bool is_null[BATCH]; + int nonnull_count; + // Value buffer for fixed-width types — up to 129 * 8 bytes + char val_buf[BATCH * 8]; + int val_count; + // Variable-length values for STRING/TEXT/BLOB columns. Only + // populated when the column's data_type_ is variable; their + // bufs are owned by the caller-provided PageArena. + std::vector str_vals; + }; + // Allocate on heap if many columns, stack for small counts + std::vector col_batches(num_cols); + + for (uint32_t c = 0; c < num_cols; c++) { + auto* col = value_columns_[c]; + auto& cb = col_batches[c]; + cb.nonnull_count = 0; + cb.val_count = 0; + for (int i = 0; i < time_count; i++) { + int vi = col->cur_value_index + 1 + i; + if (col->notnull_bitmap.empty() || + ((col->notnull_bitmap[vi / 8] & 0xFF) & + (null_mask_base >> (vi % 8))) == 0) { + cb.is_null[i] = true; + } else { + cb.is_null[i] = false; + cb.nonnull_count++; + } + } + + // Skip values if no rows pass time filter. Skip/read errors and + // short reads (decoder returned fewer values than the bitmap + // promised) must abort; otherwise the input stream is left + // mid-value and later batches would decode garbage from + // misaligned bytes. + if (pass_count == 0 && cb.nonnull_count > 0) { + int dret = common::E_OK; + int sk = 0; + switch (col->chunk_header.data_type_) { + case common::BOOLEAN: { + bool dummy; + for (sk = 0; sk < cb.nonnull_count; sk++) { + dret = col->decoder->read_boolean(dummy, col->in); + if (dret != common::E_OK) break; + } + break; + } + case common::INT32: + case common::DATE: + dret = col->decoder->skip_int32(cb.nonnull_count, sk, + col->in); + break; + case common::INT64: + case common::TIMESTAMP: + dret = col->decoder->skip_int64(cb.nonnull_count, sk, + col->in); + break; + case common::FLOAT: + dret = col->decoder->skip_float(cb.nonnull_count, sk, + col->in); + break; + case common::DOUBLE: + dret = col->decoder->skip_double(cb.nonnull_count, sk, + col->in); + break; + case common::STRING: + case common::TEXT: + case common::BLOB: { + // The decoder has no fast skip for var-length strings; + // reading + discarding is the only way to advance the + // input stream past the row's payload. + common::String tmp; + for (sk = 0; sk < cb.nonnull_count; sk++) { + dret = col->decoder->read_String(tmp, *pa, col->in); + if (dret != common::E_OK) break; + } + break; + } + default: + ret = E_TSFILE_CORRUPTED; + break; + } + if (ret != common::E_OK) break; + if (dret != common::E_OK) { + ret = dret; + break; + } + if (sk != cb.nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + cb.nonnull_count = 0; // bytes consumed cleanly + } + + // Decode non-null values. Fast path: values were predecoded + // into col->pending_decoded_values by the parallel worker — just + // memcpy the slice for this batch. Fallback: call the decoder + // inline (used for STRING/TEXT/BLOB and when predecode was + // skipped). + if (cb.nonnull_count > 0) { + if (col->pending_decoded) { + uint32_t elem_size = common::get_data_type_size( + col->chunk_header.data_type_); + memcpy( + cb.val_buf, + col->pending_decoded_values.data() + + static_cast(col->pending_decoded_cursor) * + elem_size, + static_cast(cb.nonnull_count) * elem_size); + col->pending_decoded_cursor += cb.nonnull_count; + cb.val_count = cb.nonnull_count; + } else { + int dret = common::E_OK; + switch (col->chunk_header.data_type_) { + case common::BOOLEAN: { + bool* out = reinterpret_cast(cb.val_buf); + cb.val_count = 0; + for (int s = 0; s < cb.nonnull_count; s++) { + bool v; + dret = col->decoder->read_boolean(v, col->in); + if (dret != common::E_OK) break; + out[cb.val_count++] = v; + } + break; + } + case common::INT32: + case common::DATE: + dret = col->decoder->read_batch_int32( + reinterpret_cast(cb.val_buf), + cb.nonnull_count, cb.val_count, col->in); + break; + case common::INT64: + case common::TIMESTAMP: + dret = col->decoder->read_batch_int64( + reinterpret_cast(cb.val_buf), + cb.nonnull_count, cb.val_count, col->in); + break; + case common::FLOAT: + dret = col->decoder->read_batch_float( + reinterpret_cast(cb.val_buf), + cb.nonnull_count, cb.val_count, col->in); + break; + case common::DOUBLE: + dret = col->decoder->read_batch_double( + reinterpret_cast(cb.val_buf), + cb.nonnull_count, cb.val_count, col->in); + break; + case common::STRING: + case common::TEXT: + case common::BLOB: { + // Variable-length payload doesn't fit in + // cb.val_buf; pull each value into str_vals and + // let the scatter loop index by val_count. + cb.str_vals.resize(cb.nonnull_count); + cb.val_count = 0; + for (int s = 0; s < cb.nonnull_count; s++) { + dret = col->decoder->read_String(cb.str_vals[s], + *pa, col->in); + if (dret != common::E_OK) break; + cb.val_count++; + } + break; + } + default: + break; + } + // Any decoder error, or a short decode that produced + // fewer values than the bitmap promised, indicates a + // corrupt page; propagate immediately so the scatter + // loop doesn't read uninitialised cb.val_buf bytes. + if (dret != common::E_OK) { + ret = dret; + break; + } + if (col->chunk_header.data_type_ != common::STRING && + col->chunk_header.data_type_ != common::TEXT && + col->chunk_header.data_type_ != common::BLOB && + cb.val_count != cb.nonnull_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + } + } + } + if (ret != E_OK) break; + + // ── Phase 4: Skip if no rows pass ── + if (pass_count == 0) { + for (uint32_t c = 0; c < num_cols; c++) { + value_columns_[c]->cur_value_index += time_count; + } + continue; + } + + // ── Phase 5: Scatter into TsBlock ── + + // Fast path: all rows pass filter AND all columns have no nulls + // → batch memcpy directly into Vector buffers. STRING/TEXT/BLOB + // columns have variable-width payload and live in cb.str_vals, not + // cb.val_buf, so they must take the slow scatter path. + if (pass_count == time_count) { + bool all_nonnull = true; + for (uint32_t c = 0; c < num_cols; c++) { + auto dt = value_columns_[c]->chunk_header.data_type_; + if (col_batches[c].nonnull_count != time_count || + dt == common::STRING || dt == common::TEXT || + dt == common::BLOB) { + all_nonnull = false; + break; + } + } + if (all_nonnull) { + // Batch append time column (bytes + row count); see the + // chunk-level bulk path above for why add_row_nums() is + // required alongside append_fixed_value(). + common::Vector* time_vec = ret_tsblock->get_vector(0); + time_vec->get_value_data().append_fixed_value( + (const char*)times, + static_cast(time_count) * sizeof(int64_t)); + time_vec->add_row_nums(static_cast(time_count)); + // Batch append each value column + for (uint32_t c = 0; c < num_cols; c++) { + auto& cb = col_batches[c]; + auto* col = value_columns_[c]; + uint32_t elem_size = common::get_data_type_size( + col->chunk_header.data_type_); + common::Vector* vec = ret_tsblock->get_vector(c + 1); + vec->get_value_data().append_fixed_value( + cb.val_buf, + static_cast(cb.val_count) * elem_size); + vec->add_row_nums(static_cast(cb.val_count)); + col->cur_value_index += time_count; + } + row_appender.add_rows(static_cast(time_count)); + continue; + } + } + + // Slow path: per-row scatter (has filter or has nulls or strings) + std::vector val_idx(num_cols, 0); + + for (int i = 0; i < time_count; i++) { + bool passes = block_all_pass || time_mask[i]; + + if (!passes) { + for (uint32_t c = 0; c < num_cols; c++) { + value_columns_[c]->cur_value_index++; + if (!col_batches[c].is_null[i]) val_idx[c]++; + } + continue; + } + + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + + for (uint32_t c = 0; c < num_cols; c++) { + value_columns_[c]->cur_value_index++; + auto& cb = col_batches[c]; + auto* col = value_columns_[c]; + + if (cb.is_null[i]) { + row_appender.append_null(c + 1); + } else { + auto dt = col->chunk_header.data_type_; + if (dt == common::STRING || dt == common::TEXT || + dt == common::BLOB) { + const common::String& sv = cb.str_vals[val_idx[c]]; + row_appender.append(c + 1, sv.buf_, sv.len_); + } else { + uint32_t elem_size = common::get_data_type_size(dt); + row_appender.append(c + 1, + cb.val_buf + val_idx[c] * elem_size, + elem_size); + } + val_idx[c]++; + } + } + } + if (ret != E_OK) break; } return ret; } -} // end namespace storage \ No newline at end of file +} // end namespace storage diff --git a/cpp/src/reader/aligned_chunk_reader.h b/cpp/src/reader/aligned_chunk_reader.h index 91281215e..69ce48f4a 100644 --- a/cpp/src/reader/aligned_chunk_reader.h +++ b/cpp/src/reader/aligned_chunk_reader.h @@ -28,8 +28,70 @@ #include "reader/filter/filter.h" #include "reader/ichunk_reader.h" +#ifdef ENABLE_THREADS +namespace common { +class ThreadPool; +} +#endif + namespace storage { +// Page classification for chunk-level parallel decode. +enum class PagePassType { SKIP, FULL_PASS, BOUNDARY }; + +// Metadata collected per page during the chunk scan phase. +struct ChunkPageInfo { + PagePassType pass_type = PagePassType::SKIP; + // File offsets of compressed data for time and each value column. + int64_t time_file_offset = 0; + uint32_t time_compressed_size = 0; + uint32_t time_uncompressed_size = 0; + int32_t row_begin = 0; // inclusive + int32_t row_end = 0; // exclusive + std::vector value_file_offsets; + std::vector value_compressed_sizes; + std::vector value_uncompressed_sizes; +}; + +// Decoded state for one (column, page) slot. Populated by chunk-level +// parallel decode; consumed by the scatter loop. +struct PageDecodedState { + std::vector notnull_bitmap; + std::vector predecoded_values; + std::vector predecoded_strings; + common::PageArena predecode_pa; + int32_t predecoded_count = 0; + int32_t predecoded_read_pos = 0; +}; + +// Per-value-column state for multi-value AlignedChunkReader. +struct ValueColumnState { + ChunkMeta* chunk_meta = nullptr; + ChunkHeader chunk_header; + Decoder* decoder = nullptr; + Compressor* compressor = nullptr; + common::ByteStream in_stream; // raw data from file + common::ByteStream in; // decompressed data + char* uncompressed_buf = nullptr; + int32_t file_data_buf_size = 0; + uint32_t chunk_visit_offset = 0; + PageHeader cur_page_header; + std::vector notnull_bitmap; + int32_t cur_value_index = -1; + + // Per-page decoded state for chunk-level parallel decode. + std::vector per_page_state; + + // Pre-decoded value buffer for the CURRENT page, filled by + // decompress_and_parse_value_page when the dense-multi path predecodes + // values in worker threads. Consumed by multi_DECODE_TV_BATCH instead of + // calling the decoder inline. Holds nonnull values only. + std::vector pending_decoded_values; + int32_t pending_decoded_count = 0; + int32_t pending_decoded_cursor = 0; + bool pending_decoded = false; +}; + class AlignedChunkReader : public IChunkReader { public: AlignedChunkReader() @@ -64,11 +126,13 @@ class AlignedChunkReader : public IChunkReader { ~AlignedChunkReader() override = default; bool has_more_data() const override { - return prev_value_page_not_finish() || + if (multi_value_mode_) { + return has_more_data_multi(); + } + return prev_value_page_not_finish() || prev_time_page_not_finish() || (value_chunk_visit_offset_ - value_chunk_header_.serialized_size_ < value_chunk_header_.data_size_) || - prev_time_page_not_finish() || (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ < time_chunk_header_.data_size_); } @@ -76,13 +140,36 @@ class AlignedChunkReader : public IChunkReader { int load_by_aligned_meta(ChunkMeta* time_meta, ChunkMeta* value_meta) override; + // Multi-value: load one time chunk + N value chunks. + int load_by_aligned_meta_multi(ChunkMeta* time_meta, + const std::vector& value_metas); + int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter, common::PageArena& pa) override; - int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter, common::PageArena& pa, int64_t min_time_hint, int& row_offset, int& row_limit) override; + // Multi-value: get the number of value columns. + uint32_t get_value_column_count() const { + return multi_value_mode_ ? value_columns_.size() : 1; + } + + // Multi-value: get chunk header for a specific value column. + ChunkHeader& get_value_chunk_header(uint32_t col) { + if (multi_value_mode_ && col < value_columns_.size()) { + return value_columns_[col]->chunk_header; + } + return value_chunk_header_; + } + + bool is_multi_value_mode() const { return multi_value_mode_; } + +#ifdef ENABLE_THREADS + // Set external thread pool for parallel decode (not owned). + void set_decode_pool(common::ThreadPool* pool) { decode_pool_ = pool; } +#endif + private: bool should_skip_page_by_time(int64_t min_time_hint); bool should_skip_page_by_offset(int& row_offset); @@ -100,7 +187,8 @@ class AlignedChunkReader : public IChunkReader { common::ByteStream& in_stream_, PageHeader& cur_page_header_, uint32_t& chunk_visit_offset, - ChunkHeader& chunk_header); + ChunkHeader& chunk_header, + int32_t* override_buf_size = nullptr); int read_from_file_and_rewrap(common::ByteStream& in_stream_, ChunkMeta*& chunk_meta, uint32_t& chunk_visit_offset, @@ -114,6 +202,7 @@ class AlignedChunkReader : public IChunkReader { Filter* filter, common::PageArena* pa); bool prev_time_page_not_finish() const { + if (time_predecoded_) return page_time_cursor_ < page_time_count_; return (time_decoder_ && time_decoder_->has_remaining(time_in_)) || time_in_.has_remaining(); } @@ -132,58 +221,119 @@ class AlignedChunkReader : public IChunkReader { common::ByteStream& value_in, common::RowAppender& row_appender, Filter* filter); + int i32_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, Filter* filter); + int i64_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, Filter* filter); + int float_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, + Filter* filter); + int double_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, + Filter* filter); int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in, common::ByteStream& value_in, common::RowAppender& row_appender, common::PageArena& pa, Filter* filter); + // ── Multi-value private methods (page-level, serial fallback) ──────── + bool has_more_data_multi() const; + bool prev_any_value_page_not_finish_multi() const; + int get_next_page_multi(common::TsBlock* ret_tsblock, + Filter* oneshoot_filter, common::PageArena& pa); + int get_next_page_multi_serial(common::TsBlock* ret_tsblock, Filter* filter, + common::PageArena& pa); + int skip_cur_page_multi(); + bool cur_page_statisify_filter_multi(Filter* filter); + int decode_cur_value_pages_multi(); + int decode_cur_value_page_data_for(ValueColumnState& col); + int ensure_value_page_loaded(ValueColumnState& col); + static int decompress_and_parse_value_page(ValueColumnState& col, + bool predecode); + void predecode_all_timestamps(); + int decode_time_value_buf_into_tsblock_multi(common::TsBlock*& ret_tsblock, + Filter* filter, + common::PageArena* pa); + int multi_DECODE_TV_BATCH(common::TsBlock* ret_tsblock, + common::RowAppender& row_appender, Filter* filter, + common::PageArena* pa); + int build_page_plan(Filter* filter); + int decode_time_page_direct(const ChunkPageInfo& page_info, + std::vector& out_times); + int decode_time_page_with(const ChunkPageInfo& page_info, + std::vector& out_times, Decoder* decoder, + Compressor* compressor); + int decode_all_planned_pages(); + int decode_value_page_for_slot(uint32_t col_idx, size_t page_idx); + int decode_page_lazy(size_t page_idx); + void release_page_slot(size_t page_idx); + void release_current_page_state(); + bool has_variable_length_value_column() const; + int count_non_null_prefix(const std::vector& bitmap, + int32_t row_limit) const; + private: ReadFile* read_file_; + // ── Single-value mode fields (kept for backward compat) ────────────── ChunkMeta* time_chunk_meta_; ChunkMeta* value_chunk_meta_; common::String measurement_name_; ChunkHeader time_chunk_header_; - // TODO: support reading more than one measurement in AlignedChunkReader. ChunkHeader value_chunk_header_; PageHeader cur_time_page_header_; PageHeader cur_value_page_header_; - /* - * Data reader from file is stored in @in_stream_, and the size - * is stored in @file_data_buf_size_. Note, in_stream_.total_size_ - * is used to limit deserialization, that is why we still have - * @file_data_buf_size_. - * - * Since we may want keep data of current page (and page header - * of next page) in memory, we need a byte-size cursor to tell - * us which byte we are processing, so we have @chunk_visit_offset_ - * it refer to position from the start of chunk_header_, - * also refer to offset within the chunk (including chunk header). - * It advanced by step of a page header or a page tv data. - */ - common::ByteStream time_in_stream_{common::MOD_CHUNK_READER}; - common::ByteStream value_in_stream_{common::MOD_CHUNK_READER}; + common::ByteStream time_in_stream_; + common::ByteStream value_in_stream_; int32_t file_data_time_buf_size_; int32_t file_data_value_buf_size_; uint32_t time_chunk_visit_offset_; uint32_t value_chunk_visit_offset_; - // Statistic *page_statistic_; Compressor* time_compressor_; Compressor* value_compressor_; Filter* time_filter_; Decoder* time_decoder_; Decoder* value_decoder_; - common::ByteStream time_in_{common::MOD_CHUNK_READER}; - common::ByteStream value_in_{common::MOD_CHUNK_READER}; + common::ByteStream time_in_; + common::ByteStream value_in_; char* time_uncompressed_buf_; char* value_uncompressed_buf_; std::vector value_page_col_notnull_bitmap_; uint32_t value_page_data_num_; int32_t cur_value_index; + + // ── Multi-value mode fields ────────────────────────────────────────── + bool multi_value_mode_ = false; + std::vector value_columns_; + + // Pre-decoded timestamps for page-level parallel decode. + std::vector page_all_times_; + int page_time_count_ = 0; + int page_time_cursor_ = 0; + bool time_predecoded_ = false; + + // ── Page-plan state ──────────────────────────────────────────────── + std::vector chunk_pages_; + std::vector> per_page_times_; + bool page_plan_built_ = false; + bool current_page_loaded_ = false; + size_t current_page_plan_index_ = 0; + +#ifdef ENABLE_THREADS + common::ThreadPool* decode_pool_ = nullptr; // borrowed, not owned + // Per-worker time decoder + compressor pool for parallel time-page decode. + // Sized to decode_pool_->num_threads() on first use, owned by this reader. + std::vector time_decoder_pool_; + std::vector time_compressor_pool_; +#endif }; } // end namespace storage -#endif // READER_CHUNK_READER_H +#endif // READER_CHUNK_ALIGNED_READER_H diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc index 93f42efd3..5fb9d80d2 100644 --- a/cpp/src/reader/block/single_device_tsblock_reader.cc +++ b/cpp/src/reader/block/single_device_tsblock_reader.cc @@ -19,8 +19,18 @@ #include "single_device_tsblock_reader.h" +#include +#include +#include + +#include "common/db_common.h" + namespace storage { +namespace { +const char* kTimeOnlyContextName = "__time_only_aligned_context__"; +} + SingleDeviceTsBlockReader::SingleDeviceTsBlockReader( DeviceQueryTask* device_query_task, uint32_t block_size, IMetadataQuerier* metadata_querier, TsFileIOReader* tsfile_io_reader, @@ -55,6 +65,25 @@ int SingleDeviceTsBlockReader::init(DeviceQueryTask* device_query_task, int32_t SingleDeviceTsBlockReader::compute_dense_row_count( const std::vector& ts_indexes) { int64_t reference_time_count = -1; + // Single-chunk timeseries skip per-chunk statistic serialization + // (see TsFileIOWriter / TimeseriesIndex::deserialize_from); when the + // chunk-level statistic is null, fall back to the TimeseriesIndex's + // top-level statistic, which summarizes that lone chunk. + auto chunk_count = [](const common::SimpleList& list, + Statistic* fallback) -> int64_t { + int64_t total = 0; + int nchunks = 0; + for (auto it = list.begin(); it != list.end(); it++) { + nchunks++; + if (it.get()->statistic_) { + total += it.get()->statistic_->count_; + } + } + if (total == 0 && nchunks == 1 && fallback != nullptr) { + total = fallback->count_; + } + return total; + }; for (const auto* ts_index : ts_indexes) { if (ts_index == nullptr) { continue; @@ -69,27 +98,30 @@ int32_t SingleDeviceTsBlockReader::compute_dense_row_count( if (time_list == nullptr || value_list == nullptr) { return -1; } - - for (auto it = time_list->begin(); it != time_list->end(); it++) { - if (it.get()->statistic_) { - time_count += it.get()->statistic_->count_; - } - } - for (auto it = value_list->begin(); it != value_list->end(); it++) { - if (it.get()->statistic_) { - value_count += it.get()->statistic_->count_; - } + // Use the time-side and value-side top stats independently: + // the value-side count_ excludes nulls, so reusing it for the + // time chunk would misclassify sparse data as dense. + const auto* aligned_ti = + dynamic_cast(ts_index); + if (aligned_ti == nullptr) { + return -1; } + Statistic* time_top_stat = + aligned_ti->time_ts_idx_ != nullptr + ? aligned_ti->time_ts_idx_->get_statistic() + : nullptr; + Statistic* value_top_stat = + aligned_ti->value_ts_idx_ != nullptr + ? aligned_ti->value_ts_idx_->get_statistic() + : nullptr; + time_count = chunk_count(*time_list, time_top_stat); + value_count = chunk_count(*value_list, value_top_stat); } else { auto* list = ts_index->get_chunk_meta_list(); if (list == nullptr) { return -1; } - for (auto it = list->begin(); it != list->end(); it++) { - if (it.get()->statistic_) { - time_count += it.get()->statistic_->count_; - } - } + time_count = chunk_count(*list, ts_index->get_statistic()); value_count = time_count; } @@ -149,32 +181,198 @@ int SingleDeviceTsBlockReader::init_internal(DeviceQueryTask* device_query_task, time_series_indexs, pa_))) { return ret; } - dense_row_count_ = compute_dense_row_count(time_series_indexs); - - if (dense_row_count_ >= 0 && remaining_offset_ >= dense_row_count_) { - remaining_offset_ -= dense_row_count_; - delete current_block_; - current_block_ = nullptr; - return common::E_OK; + // Fast path: when every aligned column is provably dense (same total row + // count across time + value chunks), bulk-copy from SSI tsblock to caller + // tsblock instead of per-row merging. compute_dense_row_count() returns + // -1 if the device is not provably dense, which gates safety. + const bool enable_dense_aligned_fast_path = true; + // Early device-level time skip: if time_filter is set and ALL chunks of + // this device have statistics that fall outside the filter range, skip the + // entire device. Chunks without statistics are assumed to satisfy. + // + // Skip the entire shortcut when time_series_indexs is empty (e.g. a + // time-only query that selects no value column): there's nothing to + // prove outside the filter, and dropping out here would lose the + // time-only fallback path that runs below. + if (time_filter != nullptr && !time_series_indexs.empty()) { + bool examined_any = false; + bool all_outside = true; + for (const auto* ts_idx : time_series_indexs) { + if (ts_idx == nullptr) continue; + auto* chunk_list = ts_idx->is_aligned() + ? ts_idx->get_time_chunk_meta_list() + : ts_idx->get_chunk_meta_list(); + if (chunk_list == nullptr) { + all_outside = false; + break; + } + examined_any = true; + for (auto it = chunk_list->begin(); it != chunk_list->end(); it++) { + if (it.get()->statistic_ == nullptr || + time_filter->satisfy(it.get()->statistic_)) { + all_outside = false; + break; + } + } + if (!all_outside) break; + } + if (examined_any && all_outside) { + // No data in this device matches the time filter. + delete current_block_; + current_block_ = nullptr; + return common::E_OK; + } } + // Try multi-value aligned path: one VectorMeasurementColumnContext (and + // the SSI it owns) reads all aligned value columns at once. This is the + // entry point for AlignedChunkReader's per-column parallel decode pool + // (created in TsFileSeriesScanIterator::init_chunk_reader_multi when + // num_cols > 1 && parallel_read_enabled_); per-column + // SingleMeasurementColumnContext siblings would each open their own + // single-column SSI and never reach it. Falls back to the per-column path + // if ctx->init() fails (e.g. the device mixes aligned and non-aligned + // chunks). + bool used_multi = false; + std::set multi_names; + { + bool can_multi = !time_series_indexs.empty(); + auto& meas_cols = + device_query_task->get_column_mapping()->get_measurement_columns(); + for (const auto& ts_idx : time_series_indexs) { + if (ts_idx == nullptr || !ts_idx->is_aligned()) { + can_multi = false; + break; + } + } + if (can_multi) { + std::vector meas_names(meas_cols.begin(), + meas_cols.end()); + // Stable order by first appearance in the result schema so the + // shared SSI's column buffers line up with the result columns. + std::sort( + meas_names.begin(), meas_names.end(), + [device_query_task](const std::string& lhs, + const std::string& rhs) { + const auto& lhs_pos = + device_query_task->get_column_mapping()->get_column_pos( + lhs); + const auto& rhs_pos = + device_query_task->get_column_mapping()->get_column_pos( + rhs); + const int lhs_first = + lhs_pos.empty() ? INT32_MAX : lhs_pos.front(); + const int rhs_first = + rhs_pos.empty() ? INT32_MAX : rhs_pos.front(); + if (lhs_first != rhs_first) { + return lhs_first < rhs_first; + } + return lhs < rhs; + }); + std::vector> pos_list; + pos_list.reserve(meas_names.size()); + for (const auto& name : meas_names) { + const auto& pos = + device_query_task->get_column_mapping()->get_column_pos( + name); + pos_list.push_back( + std::vector(pos.begin(), pos.end())); + } - int ssi_offset = 0; - int ssi_limit = -1; - if (dense_row_count_ >= 0) { - ssi_offset = remaining_offset_; - ssi_limit = remaining_limit_; + auto* ctx = new VectorMeasurementColumnContext(tsfile_io_reader_); + if (common::E_OK == ctx->init(device_query_task_, meas_names, + time_filter, pos_list, pa_)) { + // The shared ctx is referenced from N map entries; close() + // and the merge loop dedupe by pointer (already in place). + for (const auto& name : meas_names) { + field_column_contexts_.insert(std::make_pair(name, ctx)); + multi_names.insert(name); + } + aligned_col_count_ = meas_names.size(); + used_multi = true; + } else { + delete ctx; + } + } } + // Per-column path for anything not absorbed by the multi-value ctx + // (e.g. fallback when init() failed, or a non-aligned column would have + // been added before we generalize this for mixed schemas). for (const auto& time_series_index : time_series_indexs) { - construct_column_context(time_series_index, time_filter, ssi_offset, - ssi_limit); + if (time_series_index == nullptr) { + continue; + } + const std::string measurement_name = + time_series_index->get_measurement_name().to_std_string(); + if (used_multi && multi_names.count(measurement_name) > 0) { + continue; + } + construct_column_context(time_series_index, time_filter, 0, -1); + } + + if (field_column_contexts_.empty()) { + // If value columns were actually requested but none produced a + // context, every one of them read empty under the current filter + // (e.g. an empty/inverted time range, or a filter that matches no + // rows). The result is simply empty -- return it directly. The + // time-only fallback below is only for genuine time-only queries (no + // value columns); routing an all-empty value query through it would + // call alloc_multi_ssi(), which is aligned-only and returns + // E_NOT_SUPPORT on non-aligned devices. + bool any_value_column_requested = false; + for (const auto* ts_idx : time_series_indexs) { + if (ts_idx != nullptr) { + any_value_column_requested = true; + break; + } + } + if (any_value_column_requested) { + delete current_block_; + current_block_ = nullptr; + return common::E_OK; + } + + std::vector empty_measurements; + std::vector> empty_positions; + auto* time_only_ctx = + new VectorMeasurementColumnContext(tsfile_io_reader_); + int time_only_ret = + time_only_ctx->init(device_query_task_, empty_measurements, + time_filter, empty_positions, pa_); + if (common::E_OK == time_only_ret) { + field_column_contexts_.insert( + std::make_pair(kTimeOnlyContextName, time_only_ctx)); + } else { + delete time_only_ctx; + // Only treat "no data" as an acceptable empty result; I/O + // errors, OOM, and corruption from the time-only init must + // propagate so the caller sees the actual failure instead of + // an empty resultset wearing E_OK. + if (time_only_ret != common::E_NO_MORE_DATA) { + delete current_block_; + current_block_ = nullptr; + return time_only_ret; + } + } } - if (dense_row_count_ >= 0 && !field_column_contexts_.empty()) { - auto* first_ctx = field_column_contexts_.begin()->second; - remaining_offset_ = first_ctx->get_ssi_row_offset(); - remaining_limit_ = first_ctx->get_ssi_row_limit(); + // Detect aligned fast path: every field column comes from an aligned chunk. + if (!field_column_contexts_.empty() && enable_dense_aligned_fast_path && + dense_row_count_ >= 0 && + aligned_col_count_ == field_column_contexts_.size()) { + all_aligned_ = true; + aligned_vec_.reserve(field_column_contexts_.size()); + if (used_multi) { + // Single shared VectorMeasurementColumnContext handles all + // columns — push it once, otherwise we'd schedule the same + // bulk_copy_into N times. + aligned_vec_.push_back(field_column_contexts_.begin()->second); + } else { + for (auto& kv : field_column_contexts_) { + aligned_vec_.push_back(kv.second); + } + } } if (field_column_contexts_.empty()) { @@ -218,18 +416,25 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) { current_block_->reset(); - uint32_t effective_block_size = block_size_; - if (remaining_limit_ > 0) { - effective_block_size = - std::min(block_size_, static_cast(remaining_limit_)); + if (all_aligned_) { + return has_next_aligned(has_next); } bool next_time_set = false; next_time_ = -1; std::vector min_time_columns; - while (current_block_->get_row_count() < effective_block_size) { + while (current_block_->get_row_count() < block_size_) { + if (remaining_limit_ > 0 && + current_block_->get_row_count() >= + static_cast(remaining_limit_)) { + break; + } + std::set visited_contexts; for (auto& column_context : field_column_contexts_) { + if (!visited_contexts.insert(column_context.second).second) { + continue; + } int64_t time; if (IS_FAIL(column_context.second->get_current_time(time))) { continue; @@ -293,6 +498,114 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) { return ret; } +int SingleDeviceTsBlockReader::has_next_aligned(bool& result_has_next) { + int ret = common::E_OK; + int time_in_query_index = tuple_desc_.get_time_column_index(); + + while (current_block_->get_row_count() < block_size_) { + if (aligned_vec_.empty()) break; + + if (remaining_limit_ == 0) break; + + // Check if first column has data. + uint32_t avail = aligned_vec_[0]->available_rows(); + if (avail == 0) { + for (auto* ctx : aligned_vec_) { + ctx->remove_from(field_column_contexts_); + } + aligned_vec_.clear(); + break; + } + + // Find the batch size: min of output capacity and all SSI + // availabilities. + uint32_t batch = block_size_ - current_block_->get_row_count(); + for (auto* ctx : aligned_vec_) { + uint32_t ctx_avail = ctx->available_rows(); + if (ctx_avail == 0) { + batch = 0; + break; + } + if (ctx_avail < batch) batch = ctx_avail; + } + if (batch == 0) { + for (auto* ctx : aligned_vec_) { + ctx->remove_from(field_column_contexts_); + } + aligned_vec_.clear(); + break; + } + + // Handle offset: skip rows before copying. + if (remaining_offset_ > 0) { + uint32_t skip = std::min(batch, (uint32_t)remaining_offset_); + for (auto* ctx : aligned_vec_) { + int sr = ctx->skip_rows(skip); + if (sr != common::E_OK) return sr; + } + remaining_offset_ -= skip; + continue; + } + + // Handle limit: cap the batch size. + if (remaining_limit_ > 0) { + batch = std::min(batch, (uint32_t)remaining_limit_); + } + + // First SSI: bulk copy time + values + row_count. + int copy_ret = aligned_vec_[0]->bulk_copy_into( + col_appenders_, col_appenders_[time_column_index_], row_appender_, + batch); + // E_NO_MORE_DATA is the normal end-of-stream signal; any other + // error (I/O, decode, corruption) must propagate to the caller + // instead of silently truncating the result with E_OK. + if (copy_ret != common::E_OK && copy_ret != common::E_NO_MORE_DATA) { + return copy_ret; + } + + // Also copy time to explicit time column if requested. + if (time_in_query_index != -1) { + common::Vector* time_vec = + current_block_->get_vector(time_column_index_); + char* time_src = + time_vec->get_value_data().get_data() + + (current_block_->get_row_count() - batch) * sizeof(int64_t); + col_appenders_[time_in_query_index]->bulk_append_fixed( + time_src, batch, sizeof(int64_t)); + } + + // Other SSIs: bulk copy values only (no time, no row_count). Any + // hard error from these columns also has to propagate; otherwise a + // truncated/corrupt value column would silently emit nulls. + for (size_t i = 1; i < aligned_vec_.size(); i++) { + int other_ret = aligned_vec_[i]->bulk_copy_into( + col_appenders_, nullptr, nullptr, batch); + if (other_ret != common::E_OK && + other_ret != common::E_NO_MORE_DATA) { + return other_ret; + } + } + + // Decrement limit for data already copied. + if (remaining_limit_ > 0) { + remaining_limit_ -= batch; + } + + // If first SSI signaled no-more-data, stop after accounting. + if (copy_ret == common::E_NO_MORE_DATA) break; + } + + if (current_block_->get_row_count() > 0) { + if (RET_FAIL(fill_ids())) return ret; + current_block_->fill_trailling_nulls(); + last_block_returned_ = false; + result_has_next = true; + } else { + result_has_next = false; + } + return ret; +} + int SingleDeviceTsBlockReader::fill_measurements( std::vector& column_contexts) { int ret = common::E_OK; @@ -400,8 +713,15 @@ int SingleDeviceTsBlockReader::next(common::TsBlock*& ret_block) { } void SingleDeviceTsBlockReader::close() { + aligned_vec_.clear(); // non-owning; owned by field_column_contexts_ + // De-duplicate pointers before deleting: VectorMeasurementColumnContext + // has multiple map entries pointing to the same object. + std::set unique_contexts; for (auto& column_context : field_column_contexts_) { - delete column_context.second; + unique_contexts.insert(column_context.second); + } + for (auto* ctx : unique_contexts) { + delete ctx; } for (auto& col_appender : col_appenders_) { if (col_appender) { @@ -413,9 +733,7 @@ void SingleDeviceTsBlockReader::close() { delete row_appender_; row_appender_ = nullptr; } - if (device_query_task_) { - device_query_task_->~DeviceQueryTask(); - } + device_query_task_ = nullptr; // owned by the task iterator arena if (current_block_) { delete current_block_; current_block_ = nullptr; @@ -430,24 +748,34 @@ int SingleDeviceTsBlockReader::construct_column_context( (!time_series_index->is_aligned() && time_series_index->get_chunk_meta_list()->empty())) { } else if (time_series_index->is_aligned()) { + const int effective_ssi_offset = dense_row_count_ >= 0 ? ssi_offset : 0; + const int effective_ssi_limit = dense_row_count_ >= 0 ? ssi_limit : -1; const AlignedTimeseriesIndex* aligned_time_series_index = dynamic_cast(time_series_index); if (aligned_time_series_index == nullptr) { assert(false); } + if (aligned_time_series_index->value_ts_idx_ != nullptr && + aligned_time_series_index->value_ts_idx_->get_statistic() != + nullptr && + aligned_time_series_index->value_ts_idx_->get_statistic()->count_ == + 0) { + return ret; + } SingleMeasurementColumnContext* column_context = new SingleMeasurementColumnContext(tsfile_io_reader_); if (RET_FAIL(column_context->init( device_query_task_, time_series_index, time_filter, device_query_task_->get_column_mapping()->get_column_pos( time_series_index->get_measurement_name().to_std_string()), - pa_, ssi_offset, ssi_limit))) { + pa_, effective_ssi_offset, effective_ssi_limit))) { delete column_context; return ret; } field_column_contexts_.insert(std::make_pair( time_series_index->get_measurement_name().to_std_string(), column_context)); + aligned_col_count_++; } else { SingleMeasurementColumnContext* column_context = new SingleMeasurementColumnContext(tsfile_io_reader_); @@ -568,4 +896,342 @@ void SingleMeasurementColumnContext::fill_into( } } +uint32_t SingleMeasurementColumnContext::available_rows() const { + if (!time_iter_ || time_iter_->end()) return 0; + return time_iter_->remaining(); +} + +int SingleMeasurementColumnContext::bulk_copy_into( + std::vector& col_appenders, + common::ColAppender* time_appender, common::RowAppender* row_appender, + uint32_t count) { + int ret = common::E_OK; + const uint32_t time_elem_size = sizeof(int64_t); + auto dt = value_iter_->get_data_type(); + bool is_varlen = + (dt == common::STRING || dt == common::TEXT || dt == common::BLOB); + + // Bulk copy time column (only first SSI does this). + if (time_appender) { + time_appender->bulk_append_fixed(time_iter_->data_ptr(), count, + time_elem_size); + } + + // Advance output row count (only first SSI does this). + if (row_appender) { + row_appender->add_rows(count); + } + + if (is_varlen || value_iter_->has_null()) { + for (uint32_t r = 0; r < count; r++) { + uint32_t len = 0; + bool is_null = false; + char* val = value_iter_->read(&len, &is_null); + for (int32_t pos : pos_in_result_) { + auto* appender = col_appenders[pos + 1]; + appender->add_row(); + if (is_null) { + appender->append_null(); + } else { + appender->append(val, len); + } + } + value_iter_->next(); + } + } else { + const uint32_t val_elem_size = common::get_data_type_size(dt); + char* val_ptr = value_iter_->data_ptr(); + for (int32_t pos : pos_in_result_) { + col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count, + val_elem_size); + } + value_iter_->advance(count, val_elem_size); + } + + // Advance source iterators. + time_iter_->advance(count, time_elem_size); + + // If source TsBlock exhausted, load next. + if (time_iter_->end()) { + if (RET_FAIL(get_next_tsblock(false))) { + return ret; + } + } + return ret; +} + +int SingleMeasurementColumnContext::skip_rows(uint32_t count) { + if (!time_iter_ || time_iter_->end()) return common::E_OK; + const uint32_t time_elem_size = sizeof(int64_t); + auto dt = value_iter_->get_data_type(); + bool is_varlen = + (dt == common::STRING || dt == common::TEXT || dt == common::BLOB); + uint32_t to_skip = std::min(count, time_iter_->remaining()); + time_iter_->advance(to_skip, time_elem_size); + if (is_varlen || value_iter_->has_null()) { + for (uint32_t r = 0; r < to_skip; r++) { + value_iter_->next(); + } + } else { + const uint32_t val_elem_size = common::get_data_type_size(dt); + value_iter_->advance(to_skip, val_elem_size); + } + if (time_iter_->end()) { + // Propagate hard errors from the next-tsblock load; E_NO_MORE_DATA + // is the legitimate end-of-stream signal and gets squashed back to + // E_OK so the caller's outer loop notices via available_rows()==0. + int r = get_next_tsblock(false); + if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r; + } + return common::E_OK; +} + +// ── VectorMeasurementColumnContext implementation ─────────────────────── + +VectorMeasurementColumnContext::~VectorMeasurementColumnContext() { + if (time_iter_) { + delete time_iter_; + time_iter_ = nullptr; + } + for (auto* vi : value_iters_) { + if (vi) delete vi; + } + value_iters_.clear(); + if (ssi_) { + ssi_->revert_tsblock(); + } + tsfile_io_reader_->revert_ssi(ssi_); + ssi_ = nullptr; +} + +int VectorMeasurementColumnContext::init( + DeviceQueryTask* device_query_task, + const std::vector& measurement_names, Filter* time_filter, + std::vector>& pos_in_result, common::PageArena& pa) { + int ret = common::E_OK; + pos_in_result_ = pos_in_result; + column_names_ = measurement_names; + if (RET_FAIL(tsfile_io_reader_->alloc_multi_ssi( + device_query_task->get_device_id(), measurement_names, ssi_, pa, + time_filter))) { + return ret; + } + if (RET_FAIL(get_next_tsblock(true))) { + return ret; + } + return ret; +} + +int VectorMeasurementColumnContext::get_next_tsblock(bool alloc_mem) { + int ret = common::E_OK; + if (tsblock_ != nullptr) { + if (time_iter_) { + delete time_iter_; + time_iter_ = nullptr; + } + for (auto* vi : value_iters_) { + if (vi) delete vi; + } + value_iters_.clear(); + tsblock_->reset(); + } + if (RET_FAIL(ssi_->get_next(tsblock_, alloc_mem))) { + if (time_iter_) { + delete time_iter_; + time_iter_ = nullptr; + } + for (auto* vi : value_iters_) { + if (vi) delete vi; + } + value_iters_.clear(); + if (tsblock_) { + ssi_->destroy(); + tsblock_ = nullptr; + } + } else { + time_iter_ = new common::ColIterator(0, tsblock_); + uint32_t num_value_cols = tsblock_->get_column_count() - 1; + value_iters_.reserve(num_value_cols); + for (uint32_t c = 0; c < num_value_cols; c++) { + value_iters_.push_back(new common::ColIterator(c + 1, tsblock_)); + } + } + return ret; +} + +int VectorMeasurementColumnContext::get_current_time(int64_t& time) { + if (!time_iter_ || time_iter_->end()) return common::E_NO_MORE_DATA; + uint32_t len = 0; + time = *(int64_t*)(time_iter_->read(&len)); + return common::E_OK; +} + +int VectorMeasurementColumnContext::get_current_value(char*& value, + uint32_t& len) { + if (value_iters_.empty() || value_iters_[0]->end()) + return common::E_NO_MORE_DATA; + bool is_null = false; + value = value_iters_[0]->read(&len, &is_null); + return common::E_OK; +} + +int VectorMeasurementColumnContext::move_iter() { + int ret = common::E_OK; + time_iter_->next(); + for (auto* vi : value_iters_) vi->next(); + if (time_iter_->end()) { + if (RET_FAIL(get_next_tsblock(false))) return ret; + } + return ret; +} + +void VectorMeasurementColumnContext::fill_into( + std::vector& col_appenders) { + for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size(); + c++) { + uint32_t len = 0; + bool is_null = false; + char* val = value_iters_[c]->read(&len, &is_null); + for (int32_t pos : pos_in_result_[c]) { + col_appenders[pos + 1]->add_row(); + if (is_null) { + col_appenders[pos + 1]->append_null(); + } else { + col_appenders[pos + 1]->append(val, len); + } + } + } +} + +void VectorMeasurementColumnContext::remove_from( + std::map& column_context_map) { + if (column_names_.empty()) { + for (auto it = column_context_map.begin(); + it != column_context_map.end();) { + if (it->second == this) { + it = column_context_map.erase(it); + } else { + ++it; + } + } + delete this; + return; + } + for (const auto& name : column_names_) { + column_context_map.erase(name); + } + delete this; +} + +uint32_t VectorMeasurementColumnContext::available_rows() const { + if (!time_iter_ || time_iter_->end()) return 0; + return time_iter_->remaining(); +} + +int VectorMeasurementColumnContext::bulk_copy_into( + std::vector& col_appenders, + common::ColAppender* time_appender, common::RowAppender* row_appender, + uint32_t count) { + int ret = common::E_OK; + const uint32_t time_elem_size = sizeof(int64_t); + + // Bulk copy time column (only when time_appender is provided). + if (time_appender) { + time_appender->bulk_append_fixed(time_iter_->data_ptr(), count, + time_elem_size); + } + + // Advance output row count. + if (row_appender) { + row_appender->add_rows(count); + } + + // Bulk copy each value column to its output positions, propagating nulls. + for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size(); + c++) { + auto dt = value_iters_[c]->get_data_type(); + bool is_varlen = + (dt == common::STRING || dt == common::TEXT || dt == common::BLOB); + bool src_has_null = value_iters_[c]->has_null(); + + if (is_varlen || src_has_null) { + // Row-by-row copy for variable-length columns using the + // ColIterator next()/read() which properly tracks offsets. Fixed + // length columns with nulls also need this path because their + // payload buffer only stores non-null values. + auto* iter = value_iters_[c]; + for (uint32_t r = 0; r < count; r++) { + uint32_t len = 0; + bool is_null = false; + char* val = iter->read(&len, &is_null); + for (int32_t pos : pos_in_result_[c]) { + auto* appender = col_appenders[pos + 1]; + appender->add_row(); + if (is_null) { + appender->append_null(); + } else { + appender->append(val, len); + } + } + iter->next(); + } + } else { + // Bulk copy for fixed-length columns + uint32_t val_elem_size = common::get_data_type_size(dt); + char* val_ptr = value_iters_[c]->data_ptr(); + for (int32_t pos : pos_in_result_[c]) { + col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count, + val_elem_size); + } + } + } + + // Advance all source iterators. + time_iter_->advance(count, time_elem_size); + for (uint32_t c = 0; c < value_iters_.size(); c++) { + auto dt = value_iters_[c]->get_data_type(); + bool is_varlen = + (dt == common::STRING || dt == common::TEXT || dt == common::BLOB); + if (!is_varlen && !value_iters_[c]->has_null()) { + uint32_t val_elem_size = common::get_data_type_size(dt); + value_iters_[c]->advance(count, val_elem_size); + } + // Variable-length iterators and fixed-length iterators with nulls were + // already advanced in the copy loop above. + } + + // If source TsBlock exhausted, load next. + if (time_iter_->end()) { + if (RET_FAIL(get_next_tsblock(false))) return ret; + } + return ret; +} + +int VectorMeasurementColumnContext::skip_rows(uint32_t count) { + if (!time_iter_ || time_iter_->end()) return common::E_OK; + const uint32_t time_elem_size = sizeof(int64_t); + uint32_t to_skip = std::min(count, time_iter_->remaining()); + time_iter_->advance(to_skip, time_elem_size); + for (uint32_t c = 0; c < value_iters_.size(); c++) { + auto dt = value_iters_[c]->get_data_type(); + bool is_varlen = + (dt == common::STRING || dt == common::TEXT || dt == common::BLOB); + if (!is_varlen && !value_iters_[c]->has_null()) { + uint32_t val_elem_size = common::get_data_type_size(dt); + value_iters_[c]->advance(to_skip, val_elem_size); + } else { + // Variable-length and fixed-length-with-null vectors need next() + // to keep the payload offset aligned with non-null rows. + for (uint32_t r = 0; r < to_skip; r++) { + value_iters_[c]->next(); + } + } + } + if (time_iter_->end()) { + int r = get_next_tsblock(false); + if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r; + } + return common::E_OK; +} + } // namespace storage diff --git a/cpp/src/reader/block/single_device_tsblock_reader.h b/cpp/src/reader/block/single_device_tsblock_reader.h index 07d16860c..e74304baf 100644 --- a/cpp/src/reader/block/single_device_tsblock_reader.h +++ b/cpp/src/reader/block/single_device_tsblock_reader.h @@ -65,6 +65,9 @@ class SingleDeviceTsBlockReader : public TsBlockReader { int advance_column(MeasurementColumnContext* column_context); int32_t compute_dense_row_count( const std::vector& ts_indexes); + // Fast path for aligned data: all columns share the same timestamps, + // so no per-row merge-sort is needed. + int has_next_aligned(bool& has_next); DeviceQueryTask* device_query_task_; Filter* field_filter_; @@ -83,6 +86,11 @@ class SingleDeviceTsBlockReader : public TsBlockReader { int remaining_offset_ = 0; int remaining_limit_ = -1; int32_t dense_row_count_ = -1; + // Populated in init() when every field column comes from an aligned chunk. + // Provides cache-friendly vector iteration for has_next_aligned(). + bool all_aligned_ = false; + uint32_t aligned_col_count_ = 0; + std::vector aligned_vec_; }; class MeasurementColumnContext { @@ -116,6 +124,13 @@ class MeasurementColumnContext { return ssi_ ? ssi_->get_row_limit() : -1; } + virtual uint32_t available_rows() const = 0; + virtual int bulk_copy_into(std::vector& col_appenders, + common::ColAppender* time_appender, + common::RowAppender* row_appender, + uint32_t count) = 0; + virtual int skip_rows(uint32_t count) = 0; + protected: TsFileIOReader* tsfile_io_reader_; TsFileSeriesScanIterator* ssi_ = nullptr; @@ -124,7 +139,7 @@ class MeasurementColumnContext { common::ColIterator* value_iter_ = nullptr; }; -class SingleMeasurementColumnContext final : public MeasurementColumnContext { +class SingleMeasurementColumnContext : public MeasurementColumnContext { public: explicit SingleMeasurementColumnContext(TsFileIOReader* tsfile_io_reader) : MeasurementColumnContext(tsfile_io_reader) {} @@ -155,6 +170,12 @@ class SingleMeasurementColumnContext final : public MeasurementColumnContext { int get_current_time(int64_t& time) override; int get_current_value(char*& value, uint32_t& len) override; int move_iter() override; + uint32_t available_rows() const override; + int bulk_copy_into(std::vector& col_appenders, + common::ColAppender* time_appender, + common::RowAppender* row_appender, + uint32_t count) override; + int skip_rows(uint32_t count) override; private: std::string column_name_; @@ -165,21 +186,31 @@ class VectorMeasurementColumnContext final : public MeasurementColumnContext { public: explicit VectorMeasurementColumnContext(TsFileIOReader* tsfile_io_reader) : MeasurementColumnContext(tsfile_io_reader) {} + ~VectorMeasurementColumnContext() override; void fill_into(std::vector& col_appenders) override; void remove_from(std::map& column_context_map) override; int init(DeviceQueryTask* device_query_task, - const ITimeseriesIndex* time_series_index, Filter* time_filter, + const std::vector& measurement_names, + Filter* time_filter, std::vector>& pos_in_result, common::PageArena& pa); int get_next_tsblock(bool alloc_mem) override; int get_current_time(int64_t& time) override; int get_current_value(char*& value, uint32_t& len) override; int move_iter() override; + uint32_t available_rows() const override; + int bulk_copy_into(std::vector& col_appenders, + common::ColAppender* time_appender, + common::RowAppender* row_appender, + uint32_t count) override; + int skip_rows(uint32_t count) override; private: + std::vector column_names_; std::vector> pos_in_result_; + std::vector value_iters_; }; class IdColumnContext { diff --git a/cpp/src/reader/bloom_filter.cc b/cpp/src/reader/bloom_filter.cc index 068c96e27..4aff4ecd3 100644 --- a/cpp/src/reader/bloom_filter.cc +++ b/cpp/src/reader/bloom_filter.cc @@ -208,6 +208,26 @@ int BloomFilter::add_path_entry(const String& device_name, return E_OK; } +bool BloomFilter::contains(const String& device_name, + const String& measurement_name) { + if (size_ == 0) { + return true; // empty filter — assume present + } + String entry = get_entry_string(device_name, measurement_name); + if (IS_NULL(entry.buf_)) { + return true; // OOM — conservatively assume present + } + for (uint32_t i = 0; i < hash_func_count_; i++) { + int32_t hv = hash_func_arr_[i].hash(entry); + if (!bitset_.get(hv)) { + free_entry_buf(entry.buf_); + return false; // definitely not present + } + } + free_entry_buf(entry.buf_); + return true; // probably present +} + int BloomFilter::serialize_to(ByteStream& out) { int ret = E_OK; uint8_t* filter_data_bytes = nullptr; diff --git a/cpp/src/reader/bloom_filter.h b/cpp/src/reader/bloom_filter.h index b00de4a84..323cfa8a4 100644 --- a/cpp/src/reader/bloom_filter.h +++ b/cpp/src/reader/bloom_filter.h @@ -74,6 +74,11 @@ class BitSet { int32_t word_offset = pos % 64; words_[word_idx] |= (1ull << word_offset); } + bool get(int32_t pos) const { + int32_t word_idx = pos / 64; + int32_t word_offset = pos % 64; + return (words_[word_idx] & (1ull << word_offset)) != 0; + } int32_t get_words_in_use() const { for (int32_t i = word_count_ - 1; i >= 0; i--) { if (words_[i] != 0) { @@ -107,8 +112,11 @@ class BloomFilter { void destroy() { bitset_.destroy(); } int add_path_entry(const common::String& device_name, const common::String& measurement_name); + bool contains(const common::String& device_name, + const common::String& measurement_name); int serialize_to(common::ByteStream& out); int deserialize_from(common::ByteStream& in); + bool is_empty() const { return size_ == 0; } BitSet* get_bit_set() { return &bitset_; } private: diff --git a/cpp/src/reader/chunk_reader.cc b/cpp/src/reader/chunk_reader.cc index b150f7851..7c36ea07f 100644 --- a/cpp/src/reader/chunk_reader.cc +++ b/cpp/src/reader/chunk_reader.cc @@ -422,8 +422,6 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in, row_appender.backoff_add_row(); continue; } else { - /*std::cout << "decoder: time=" << time << ", value=" << value - * << std::endl;*/ row_appender.append(0, (char*)&time, sizeof(time)); row_appender.append(1, (char*)&value, sizeof(value)); } @@ -432,6 +430,350 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in, return ret; } +int ChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + int32_t values[BATCH]; + + while (time_decoder_->has_remaining(time_in)) { + // Cap each pass to what the appender can still hold; the old + // "remaining < BATCH → OVERFLOW" check made progress impossible on + // TsBlocks with capacity below BATCH. + int eff_batch = + std::min(BATCH, static_cast(row_appender.remaining())); + if (eff_batch <= 0) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + value_decoder_->skip_int32(block_count, skipped, value_in); + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + int value_count = 0; + + if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch, + time_count, time_in))) { + break; + } + if (time_count == 0) break; + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + int skipped = 0; + value_decoder_->skip_int32(time_count, skipped, value_in); + continue; + } + + if (RET_FAIL(value_decoder_->read_batch_int32(values, time_count, + value_count, value_in))) { + break; + } + // Time and value chunks are written in lock-step; any discrepancy + // means the file is truncated or corrupted. Reading uninitialised + // values[i] would silently surface garbage as decoded rows. + if (value_count != time_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + + for (int i = 0; i < time_count; ++i) { + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + continue; + } + if (filter != nullptr && !block_all_pass && + !filter->satisfy(times[i], (int64_t)values[i])) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&values[i], sizeof(int32_t)); + } + if (ret != E_OK) break; + } + return ret; +} + +int ChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + int64_t values[BATCH]; + + while (time_decoder_->has_remaining(time_in)) { + int eff_batch = + std::min(BATCH, static_cast(row_appender.remaining())); + if (eff_batch <= 0) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + value_decoder_->skip_int64(block_count, skipped, value_in); + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + int value_count = 0; + + if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch, + time_count, time_in))) { + break; + } + if (time_count == 0) break; + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + int skipped = 0; + value_decoder_->skip_int64(time_count, skipped, value_in); + continue; + } + + if (RET_FAIL(value_decoder_->read_batch_int64(values, time_count, + value_count, value_in))) { + break; + } + if (value_count != time_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + + for (int i = 0; i < time_count; ++i) { + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + continue; + } + if (filter != nullptr && !block_all_pass && + !filter->satisfy(times[i], values[i])) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&values[i], sizeof(int64_t)); + } + if (ret != E_OK) break; + } + return ret; +} + +int ChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + float values[BATCH]; + + while (time_decoder_->has_remaining(time_in)) { + int eff_batch = + std::min(BATCH, static_cast(row_appender.remaining())); + if (eff_batch <= 0) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + value_decoder_->skip_float(block_count, skipped, value_in); + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + int value_count = 0; + + if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch, + time_count, time_in))) { + break; + } + if (time_count == 0) break; + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + int skipped = 0; + value_decoder_->skip_float(time_count, skipped, value_in); + continue; + } + + if (RET_FAIL(value_decoder_->read_batch_float(values, time_count, + value_count, value_in))) { + break; + } + if (value_count != time_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + + for (int i = 0; i < time_count; ++i) { + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&values[i], sizeof(float)); + } + if (ret != E_OK) break; + } + return ret; +} + +int ChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in, + ByteStream& value_in, + RowAppender& row_appender, + Filter* filter) { + int ret = E_OK; + const int BATCH = 129; + int64_t times[BATCH]; + double values[BATCH]; + + while (time_decoder_->has_remaining(time_in)) { + int eff_batch = + std::min(BATCH, static_cast(row_appender.remaining())); + if (eff_batch <= 0) { + ret = E_OVERFLOW; + break; + } + + // Block-level time filter check + bool block_all_pass = false; + if (filter != nullptr) { + int64_t block_min, block_max; + int block_count; + if (time_decoder_->peek_next_block_range_int64( + time_in, block_min, block_max, block_count)) { + if (!filter->satisfy_start_end_time(block_min, block_max)) { + int skipped = 0; + time_decoder_->skip_peeked_block_int64(time_in, skipped); + value_decoder_->skip_double(block_count, skipped, value_in); + continue; + } + if (filter->contain_start_end_time(block_min, block_max)) { + block_all_pass = true; + } + } + } + + int time_count = 0; + int value_count = 0; + + if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch, + time_count, time_in))) { + break; + } + if (time_count == 0) break; + + bool time_mask[BATCH]; + int pass_count = time_count; + if (filter != nullptr && !block_all_pass) { + pass_count = + filter->satisfy_batch_time(times, time_count, time_mask); + } + + if (pass_count == 0) { + int skipped = 0; + value_decoder_->skip_double(time_count, skipped, value_in); + continue; + } + + if (RET_FAIL(value_decoder_->read_batch_double( + values, time_count, value_count, value_in))) { + break; + } + if (value_count != time_count) { + ret = E_TSFILE_CORRUPTED; + break; + } + + for (int i = 0; i < time_count; ++i) { + if (filter != nullptr && !block_all_pass && !time_mask[i]) { + continue; + } + if (UNLIKELY(!row_appender.add_row())) { + ret = E_OVERFLOW; + break; + } + row_appender.append(0, (char*)×[i], sizeof(int64_t)); + row_appender.append(1, (char*)&values[i], sizeof(double)); + } + if (ret != E_OK) break; + } + return ret; +} + int ChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in, ByteStream& value_in, RowAppender& row_appender, @@ -472,23 +814,21 @@ int ChunkReader::decode_tv_buf_into_tsblock_by_datatype(ByteStream& time_in, break; case common::DATE: case common::INT32: - // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_, - // row_appender); - ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_, - row_appender, filter); + ret = + i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter); break; case TIMESTAMP: case common::INT64: - DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_, - row_appender); + ret = + i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter); break; case common::FLOAT: - DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_, - row_appender); + ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender, + filter); break; case common::DOUBLE: - DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_, - row_appender); + ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender, + filter); break; case common::TEXT: case common::BLOB: diff --git a/cpp/src/reader/chunk_reader.h b/cpp/src/reader/chunk_reader.h index 3acd9c3cf..a1196c330 100644 --- a/cpp/src/reader/chunk_reader.h +++ b/cpp/src/reader/chunk_reader.h @@ -105,6 +105,20 @@ class ChunkReader : public IChunkReader { common::ByteStream& value_in, common::RowAppender& row_appender, Filter* filter); + int i32_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, Filter* filter); + int i64_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, Filter* filter); + int float_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, + Filter* filter); + int double_DECODE_TV_BATCH(common::ByteStream& time_in, + common::ByteStream& value_in, + common::RowAppender& row_appender, + Filter* filter); int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in, common::ByteStream& value_in, common::RowAppender& row_appender, @@ -131,7 +145,7 @@ class ChunkReader : public IChunkReader { * also refer to offset within the chunk (including chunk header). * It advanced by step of a page header or a page tv data. */ - common::ByteStream in_stream_{common::MOD_CHUNK_READER}; + common::ByteStream in_stream_; int32_t file_data_buf_size_; uint32_t chunk_visit_offset_; @@ -141,8 +155,8 @@ class ChunkReader : public IChunkReader { Decoder* time_decoder_; Decoder* value_decoder_; - common::ByteStream time_in_{common::MOD_CHUNK_READER}; - common::ByteStream value_in_{common::MOD_CHUNK_READER}; + common::ByteStream time_in_; + common::ByteStream value_in_; char* uncompressed_buf_; }; diff --git a/cpp/src/reader/device_meta_iterator.cc b/cpp/src/reader/device_meta_iterator.cc index bf01b23a5..955965624 100644 --- a/cpp/src/reader/device_meta_iterator.cc +++ b/cpp/src/reader/device_meta_iterator.cc @@ -186,7 +186,17 @@ int DeviceMetaIterator::load_results_direct() { ret = io_reader_->load_device_index_entry(device_comparable, device_index_entry, end_offset); - if (ret != common::E_OK || device_index_entry == nullptr) { + // "Device not present in this file" is the only ret value we should + // suppress. Read failures and corrupt index entries used to be folded + // into "no matches"; the caller then couldn't distinguish a clean miss + // from a partial read that silently dropped real data. Surface them. + if (ret == common::E_DEVICE_NOT_EXIST || ret == common::E_NOT_EXIST) { + return common::E_OK; + } + if (ret != common::E_OK) { + return ret; + } + if (device_index_entry == nullptr) { return common::E_OK; } diff --git a/cpp/src/reader/filter/and_filter.h b/cpp/src/reader/filter/and_filter.h index b324a3f81..289115baf 100644 --- a/cpp/src/reader/filter/and_filter.h +++ b/cpp/src/reader/filter/and_filter.h @@ -19,6 +19,8 @@ #ifndef READER_FILTER_OPERATOR_AND_FILTER_H #define READER_FILTER_OPERATOR_AND_FILTER_H +#include + #include "binary_filter.h" // #include "storage/storage_utils.h" @@ -48,6 +50,27 @@ class AndFilter : public BinaryFilter { right_->contain_start_end_time(start_time, end_time); } + int satisfy_batch_time(const int64_t* times, int count, bool* mask) { + // Inline buffer covers the common per-page BATCH=129 callers; only + // out-of-spec larger counts fall back to a heap allocation. + constexpr int kInlineCap = 256; + bool inline_buf[kInlineCap]; + std::unique_ptr heap_buf; + bool* mask_right = inline_buf; + if (count > kInlineCap) { + heap_buf.reset(new bool[count]); + mask_right = heap_buf.get(); + } + left_->satisfy_batch_time(times, count, mask); + right_->satisfy_batch_time(times, count, mask_right); + int pass = 0; + for (int i = 0; i < count; ++i) { + mask[i] = mask[i] && mask_right[i]; + if (mask[i]) ++pass; + } + return pass; + } + std::vector* get_time_ranges() { std::vector* result = new std::vector(); std::vector* left_time_ranges = left_->get_time_ranges(); diff --git a/cpp/src/reader/filter/filter.h b/cpp/src/reader/filter/filter.h index f39dddbae..e53992308 100644 --- a/cpp/src/reader/filter/filter.h +++ b/cpp/src/reader/filter/filter.h @@ -63,6 +63,20 @@ class Filter { ASSERT(false); return nullptr; } + + // Batch time filter: evaluate time filter on an array of timestamps. + // Writes true/false into @mask for each element. + // Returns the number of elements that passed (mask[i] == true). + // Default: scalar fallback using satisfy_start_end_time. + virtual int satisfy_batch_time(const int64_t* times, int count, + bool* mask) { + int pass = 0; + for (int i = 0; i < count; ++i) { + mask[i] = satisfy_start_end_time(times[i], times[i]); + if (mask[i]) ++pass; + } + return pass; + } }; } // namespace storage diff --git a/cpp/src/reader/filter/or_filter.h b/cpp/src/reader/filter/or_filter.h index fc8d4a2cf..518308982 100644 --- a/cpp/src/reader/filter/or_filter.h +++ b/cpp/src/reader/filter/or_filter.h @@ -19,6 +19,8 @@ #ifndef READER_FILTER_OPERATOR_OR_FILTER_H #define READER_FILTER_OPERATOR_OR_FILTER_H +#include + #include "binary_filter.h" // #include "storage/storage_utils.h" @@ -48,6 +50,27 @@ class OrFilter : public BinaryFilter { right_->contain_start_end_time(start_time, end_time); } + int satisfy_batch_time(const int64_t* times, int count, bool* mask) { + // Inline buffer covers the common per-page BATCH=129 callers; only + // out-of-spec larger counts fall back to a heap allocation. + constexpr int kInlineCap = 256; + bool inline_buf[kInlineCap]; + std::unique_ptr heap_buf; + bool* mask_right = inline_buf; + if (count > kInlineCap) { + heap_buf.reset(new bool[count]); + mask_right = heap_buf.get(); + } + left_->satisfy_batch_time(times, count, mask); + right_->satisfy_batch_time(times, count, mask_right); + int pass = 0; + for (int i = 0; i < count; ++i) { + mask[i] = mask[i] || mask_right[i]; + if (mask[i]) ++pass; + } + return pass; + } + std::vector* get_time_ranges() { std::vector* result = new std::vector(); std::vector* left_time_ranges = left_->get_time_ranges(); diff --git a/cpp/src/reader/filter/time_operator.cc b/cpp/src/reader/filter/time_operator.cc index 19f33b599..0bb12e4ec 100644 --- a/cpp/src/reader/filter/time_operator.cc +++ b/cpp/src/reader/filter/time_operator.cc @@ -18,9 +18,17 @@ */ #include "time_operator.h" +#include + #include "common/statistic.h" #include "utils/storage_utils.h" +#if defined(__ARM_NEON) +#include +#elif defined(ENABLE_SIMD) +#include "simde/x86/avx2.h" +#endif + namespace storage { TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between) @@ -29,6 +37,15 @@ TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between) TimeBetween::~TimeBetween() {} bool TimeBetween::satisfy(Statistic* statistic) { + // An empty inner interval (value1_ > value2_) is unsatisfiable for BETWEEN + // (matches nothing) and trivially true for NOT BETWEEN (matches + // everything) -- i.e. the answer is exactly not_. Without this guard the + // overlap test below wrongly reports "maybe" for an empty range, + // disagreeing with the row-level satisfy() and letting empty/inverted + // ranges slip past statistic-level pruning. + if (value1_ > value2_) { + return not_; + } if (not_) { return statistic->end_time_ < value1_ || statistic->start_time_ > value2_; @@ -47,6 +64,10 @@ bool TimeBetween::satisfy(int64_t time, common::String value) { } bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) { + // Empty inner interval: see satisfy(Statistic*). + if (value1_ > value2_) { + return not_; + } if (not_) { return start_time < value1_ || end_time > value2_; } else { @@ -55,6 +76,10 @@ bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) { } bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) { + // Empty inner interval: see satisfy(Statistic*). + if (value1_ > value2_) { + return not_; + } if (not_) { return end_time < value1_ || start_time > value2_; } else { @@ -64,6 +89,16 @@ bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) { std::vector* TimeBetween::get_time_ranges() { std::vector* result = new std::vector(); + // Empty inner interval (value1_ > value2_): BETWEEN yields no ranges; + // NOT BETWEEN covers the whole timeline. + if (value1_ > value2_) { + if (not_) { + result->push_back( + new TimeRange(std::numeric_limits::min(), + std::numeric_limits::max())); + } + return result; + } if (not_) { if (value1_ != std::numeric_limits::min()) { result->push_back(new TimeRange(std::numeric_limits::min(), @@ -102,11 +137,42 @@ bool TimeIn::satisfy(int64_t time, common::String value) { } bool TimeIn::satisfy_start_end_time(int64_t start_time, int64_t end_time) { - return true; + // "Could any time in [s, e] satisfy the filter?" + // IN({v_i}): true iff some v_i lies in [s, e]. + // NOT IN: true unless the entire range [s, e] is one point and that + // point is in values_; for ranges wider than a single integer there is + // always at least one time not in values_, so we're conservative. + bool any_in_range = false; + for (int64_t v : values_) { + if (v >= start_time && v <= end_time) { + any_in_range = true; + break; + } + } + if (not_) { + if (start_time == end_time) return !any_in_range; + return true; + } + return any_in_range; } bool TimeIn::contain_start_end_time(int64_t start_time, int64_t end_time) { - return true; + // "Do ALL times in [s, e] satisfy the filter?" + // IN({v_i}): only when [s,e] collapses to a single point that is in + // values_; a sparse IN list can't cover a range otherwise. Returning + // true unconditionally would let the batch fast path skip per-row + // filtering and emit every row. + // NOT IN: true iff no v_i lies in [s, e]. + bool any_in_range = false; + for (int64_t v : values_) { + if (v >= start_time && v <= end_time) { + any_in_range = true; + break; + } + } + if (not_) return !any_in_range; + if (start_time == end_time) return any_in_range; + return false; } std::vector* TimeIn::get_time_ranges() { @@ -308,4 +374,269 @@ std::vector* TimeLtEq::get_time_ranges() { return result; } +// ============================================================================ +// SIMD batch time filter implementations +// ============================================================================ + +// Helper: extract 4-bit movemask from 256-bit comparison result (4 x i64) +#if !defined(__ARM_NEON) && defined(ENABLE_SIMD) +static inline int simd_movemask_epi64(simde__m256i v) { + // movemask_pd reinterprets as double and checks sign bit = high bit of each + // 64-bit lane + return simde_mm256_movemask_pd(simde_mm256_castsi256_pd(v)); +} +#endif + +int TimeGt::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = vcgtq_s64(vt, vval); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + // time > value_ => cmpgt(time, value_) + simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval); + int bits = simd_movemask_epi64(cmp); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ < times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeGtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = vcgeq_s64(vt, vval); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + // time >= value_ => NOT(cmpgt(value_, time)) + simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt); + simde__m256i ncmp = + simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1)); + int bits = simd_movemask_epi64(ncmp); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ <= times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeLt::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = vcltq_s64(vt, vval); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + // time < value_ => cmpgt(value_, time) + simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt); + int bits = simd_movemask_epi64(cmp); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ > times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeLtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = vcleq_s64(vt, vval); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + // time <= value_ => NOT(cmpgt(time, value_)) + simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval); + simde__m256i ncmp = + simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1)); + int bits = simd_movemask_epi64(ncmp); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ >= times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = vceqq_s64(vt, vval); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + simde__m256i cmp = simde_mm256_cmpeq_epi64(vt, vval); + int bits = simd_movemask_epi64(cmp); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ == times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeNotEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vval = vdupq_n_s64(value_); + uint64x2_t ones = vdupq_n_u64(UINT64_MAX); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t cmp = veorq_u64(vceqq_s64(vt, vval), ones); + mask[i] = vgetq_lane_u64(cmp, 0) != 0; + mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vval = simde_mm256_set1_epi64x(value_); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + simde__m256i eq = simde_mm256_cmpeq_epi64(vt, vval); + simde__m256i neq = + simde_mm256_xor_si256(eq, simde_mm256_set1_epi64x((int64_t)-1)); + int bits = simd_movemask_epi64(neq); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + mask[i] = value_ != times[i]; + if (mask[i]) ++pass; + } + return pass; +} + +int TimeBetween::satisfy_batch_time(const int64_t* times, int count, + bool* mask) { + int pass = 0; + int i = 0; +#if defined(__ARM_NEON) + int64x2_t vlo = vdupq_n_s64(value1_); + int64x2_t vhi = vdupq_n_s64(value2_); + uint64x2_t ones = vdupq_n_u64(UINT64_MAX); + for (; i + 1 < count; i += 2) { + int64x2_t vt = vld1q_s64(times + i); + uint64x2_t ge_lo = vcgeq_s64(vt, vlo); + uint64x2_t le_hi = vcleq_s64(vt, vhi); + uint64x2_t between = vandq_u64(ge_lo, le_hi); + uint64x2_t result = not_ ? veorq_u64(between, ones) : between; + mask[i] = vgetq_lane_u64(result, 0) != 0; + mask[i + 1] = vgetq_lane_u64(result, 1) != 0; + pass += mask[i] + mask[i + 1]; + } +#elif defined(ENABLE_SIMD) + simde__m256i vlo = simde_mm256_set1_epi64x(value1_); + simde__m256i vhi = simde_mm256_set1_epi64x(value2_); + simde__m256i ones = simde_mm256_set1_epi64x((int64_t)-1); + for (; i + 3 < count; i += 4) { + simde__m256i vt = + simde_mm256_loadu_si256((const simde__m256i*)(times + i)); + // time >= lo => NOT(cmpgt(lo, time)) + simde__m256i ge_lo = + simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vlo, vt), ones); + // time <= hi => NOT(cmpgt(time, hi)) + simde__m256i le_hi = + simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vt, vhi), ones); + simde__m256i between = simde_mm256_and_si256(ge_lo, le_hi); + simde__m256i result = + not_ ? simde_mm256_xor_si256(between, ones) : between; + int bits = simd_movemask_epi64(result); + for (int j = 0; j < 4; ++j) { + mask[i + j] = (bits >> j) & 1; + pass += mask[i + j]; + } + } +#endif + for (; i < count; ++i) { + bool in_range = (value1_ <= times[i]) && (times[i] <= value2_); + mask[i] = not_ ? !in_range : in_range; + if (mask[i]) ++pass; + } + return pass; +} + } // namespace storage diff --git a/cpp/src/reader/filter/time_operator.h b/cpp/src/reader/filter/time_operator.h index 29930b88a..f972a4259 100644 --- a/cpp/src/reader/filter/time_operator.h +++ b/cpp/src/reader/filter/time_operator.h @@ -47,6 +47,9 @@ class TimeBetween : public Filter { bool contain_start_end_time(int64_t start_time, int64_t end_time); std::vector* get_time_ranges(); + + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: @@ -99,6 +102,8 @@ class TimeEq : public Filter { std::vector* get_time_ranges(); + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: @@ -122,6 +127,9 @@ class TimeNotEq : public Filter { bool contain_start_end_time(int64_t start_time, int64_t end_time); std::vector* get_time_ranges(); + + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: @@ -146,6 +154,8 @@ class TimeGt : public Filter { std::vector* get_time_ranges(); + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: @@ -169,6 +179,9 @@ class TimeGtEq : public Filter { bool contain_start_end_time(int64_t start_time, int64_t end_time); std::vector* get_time_ranges(); + + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + void reset_value(int64_t val) { value_ = val; } FilterType get_filter_type() { return type_; } @@ -194,6 +207,8 @@ class TimeLt : public Filter { std::vector* get_time_ranges(); + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: @@ -217,6 +232,9 @@ class TimeLtEq : public Filter { bool contain_start_end_time(int64_t start_time, int64_t end_time); std::vector* get_time_ranges(); + + int satisfy_batch_time(const int64_t* times, int count, bool* mask); + FilterType get_filter_type() { return type_; } private: diff --git a/cpp/src/reader/qds_without_timegenerator.cc b/cpp/src/reader/qds_without_timegenerator.cc index 474e13b77..b612e5dc2 100644 --- a/cpp/src/reader/qds_without_timegenerator.cc +++ b/cpp/src/reader/qds_without_timegenerator.cc @@ -68,7 +68,12 @@ int QDSWithoutTimeGenerator::init_internal(TsFileIOReader* io_reader, ret = io_reader_->alloc_ssi(paths[i].device_id_, paths[i].measurement_, ssi, pa_, global_time_filter); if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_DEVICE_NOT_EXIST || - ret == E_NOT_EXIST) { + ret == E_NOT_EXIST || ret == E_NO_MORE_DATA) { + // Java-aligned: silently skip paths whose device or measurement + // doesn't exist in this file. The bloom-filter optimization in + // alloc_ssi reports a missing series as E_NO_MORE_DATA, so treat + // that the same as the not-found codes. + ret = E_OK; continue; } if (ret != E_OK) { diff --git a/cpp/src/reader/result_set.h b/cpp/src/reader/result_set.h index 1f1653603..0b73595d4 100644 --- a/cpp/src/reader/result_set.h +++ b/cpp/src/reader/result_set.h @@ -162,6 +162,35 @@ class ResultSet : std::enable_shared_from_this { return common::E_INVALID_ARG; } + // Typed direct accessors. Default implementation routes through the + // generic RowRecord / Field path so existing subclasses keep working. + // Fast subclasses (TableResultSet) override these to read straight from + // the underlying columnar buffer, skipping the per-cell Field round-trip + // (and the eager materialization in next()). + virtual bool get_bool_at(uint32_t column_index) { + return get_row_record()->get_field(column_index - 1)->get_value(); + } + virtual int32_t get_int32_at(uint32_t column_index) { + return get_row_record() + ->get_field(column_index - 1) + ->get_value(); + } + virtual int64_t get_int64_at(uint32_t column_index) { + return get_row_record() + ->get_field(column_index - 1) + ->get_value(); + } + virtual float get_float_at(uint32_t column_index) { + return get_row_record() + ->get_field(column_index - 1) + ->get_value(); + } + virtual double get_double_at(uint32_t column_index) { + return get_row_record() + ->get_field(column_index - 1) + ->get_value(); + } + /** * @brief Get the row record of the result set * @@ -245,6 +274,29 @@ inline std::tm ResultSet::get_value(uint32_t column_index) { return row_record->get_field(column_index)->get_date_value(); } +// Index-based primitive specializations route to the typed virtual +// accessors so TableResultSet can serve them without materializing a Field. +template <> +inline bool ResultSet::get_value(uint32_t column_index) { + return get_bool_at(column_index); +} +template <> +inline int32_t ResultSet::get_value(uint32_t column_index) { + return get_int32_at(column_index); +} +template <> +inline int64_t ResultSet::get_value(uint32_t column_index) { + return get_int64_at(column_index); +} +template <> +inline float ResultSet::get_value(uint32_t column_index) { + return get_float_at(column_index); +} +template <> +inline double ResultSet::get_value(uint32_t column_index) { + return get_double_at(column_index); +} + /** * @brief Simple iterator for ResultSet with smart pointers */ @@ -306,7 +358,7 @@ inline ResultSetIterator ResultSet::iterator() { return ResultSetIterator(this); } -static MAYBE_UNUSED void print_table_result_set( +MAYBE_UNUSED static void print_table_result_set( storage::ResultSet* table_result_set) { if (table_result_set == nullptr) { std::cout << "TableResultSet is nullptr" << std::endl; diff --git a/cpp/src/reader/table_result_set.cc b/cpp/src/reader/table_result_set.cc index 81b58ce68..6de093d24 100644 --- a/cpp/src/reader/table_result_set.cc +++ b/cpp/src/reader/table_result_set.cc @@ -43,6 +43,16 @@ int TableResultSet::next(bool& has_next) { int ret = common::E_OK; + // Advance past the row yielded by the previous next() call, if any. + // Row iterator's next() advances all per-column offsets, so on the next + // read the vectors point to the new row's data. + if (row_ready_) { + row_iterator_->next(); + row_ready_ = false; + row_materialized_ = false; + } + + // Find the next non-empty TsBlock. while (row_iterator_ == nullptr || !row_iterator_->has_next()) { if (RET_FAIL(tsblock_reader_->has_next(has_next))) { return ret; @@ -68,23 +78,29 @@ int TableResultSet::next(bool& has_next) { } if (row_iterator_ == nullptr || !row_iterator_->has_next()) { has_next = false; + return ret; } - if (has_next && IS_SUCC(ret)) { - uint32_t len = 0; - bool null = false; - row_record_->reset(); - for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) { - const auto value = row_iterator_->read(i, &len, &null); - if (!null) { - row_record_->get_field(i)->set_value( - row_iterator_->get_data_type(i), value, len, pa_); - row_iterator_->next(i); - } + // A row is now available at row_iterator_'s current row_id_; the per- + // column vector offsets are pointing at that row's data. We do NOT + // populate row_record_ here — typed accessors read straight from the + // vectors, and get_row_record() lazily materializes on demand. + has_next = true; + row_ready_ = true; + return ret; +} + +void TableResultSet::materialize_current_row() { + uint32_t len = 0; + bool null = false; + row_record_->reset(); + for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) { + const auto value = row_iterator_->read(i, &len, &null); + if (!null) { + row_record_->get_field(i)->set_value( + row_iterator_->get_data_type(i), value, len, pa_); } - row_iterator_->update_row_id(); } - return ret; } bool TableResultSet::is_null(const std::string& column_name) { @@ -98,11 +114,57 @@ bool TableResultSet::is_null(const std::string& column_name) { bool TableResultSet::is_null(uint32_t column_index) { ASSERT(1 <= column_index && column_index <= row_record_->get_col_num()); - return row_record_->get_field(column_index - 1) == nullptr || - row_record_->get_field(column_index - 1)->is_type(common::NULL_TYPE); + if (!row_ready_) return true; + return row_iterator_->is_null_at(column_index - 1); +} + +// Direct buffer access — skips Vector::read's virtual dispatch. Caller is +// expected to have checked is_null() (we still null-guard for safety). +// For fixed-width primitives the vector keeps its value buffer in +// values_ and tracks the current row's byte offset in offset_; the +// element at the active row is simply *(T*)(values_.get_data() + offset_). +// The ASSERT enforces strict typed access: the requested C++ type must match +// the column's physical storage width (DATE is int32, not int64). On a +// mismatch it fires in debug instead of silently splicing the adjacent cell's +// bytes into the result. +#define TSFILE_FAST_PRIMITIVE_READ(TYPE, DFLT) \ + if (!row_ready_) return DFLT; \ + common::Vector* vec = row_iterator_->get_vector(column_index - 1); \ + ASSERT(common::TypeMatch(vec->get_vector_type())); \ + if (vec->has_null() && vec->is_null(row_iterator_->get_row_id())) \ + return DFLT; \ + return *reinterpret_cast(vec->get_value_data().get_data() + \ + vec->get_offset()) + +bool TableResultSet::get_bool_at(uint32_t column_index) { + TSFILE_FAST_PRIMITIVE_READ(bool, false); } -RowRecord* TableResultSet::get_row_record() { return row_record_; } +int32_t TableResultSet::get_int32_at(uint32_t column_index) { + TSFILE_FAST_PRIMITIVE_READ(int32_t, 0); +} + +int64_t TableResultSet::get_int64_at(uint32_t column_index) { + TSFILE_FAST_PRIMITIVE_READ(int64_t, 0); +} + +float TableResultSet::get_float_at(uint32_t column_index) { + TSFILE_FAST_PRIMITIVE_READ(float, 0.0f); +} + +double TableResultSet::get_double_at(uint32_t column_index) { + TSFILE_FAST_PRIMITIVE_READ(double, 0.0); +} + +#undef TSFILE_FAST_PRIMITIVE_READ + +RowRecord* TableResultSet::get_row_record() { + if (row_ready_ && !row_materialized_) { + materialize_current_row(); + row_materialized_ = true; + } + return row_record_; +} std::shared_ptr TableResultSet::get_metadata() { return result_set_metadata_; @@ -138,7 +200,13 @@ int TableResultSet::get_next_tsblock(common::TsBlock*& block) { } void TableResultSet::close() { - tsblock_reader_->close(); + if (closed_) { + return; + } + closed_ = true; + if (tsblock_reader_) { + tsblock_reader_->close(); + } pa_.destroy(); if (row_record_) { delete row_record_; @@ -150,4 +218,4 @@ void TableResultSet::close() { } } -} // namespace storage \ No newline at end of file +} // namespace storage diff --git a/cpp/src/reader/table_result_set.h b/cpp/src/reader/table_result_set.h index 072a63f6f..d92072934 100644 --- a/cpp/src/reader/table_result_set.h +++ b/cpp/src/reader/table_result_set.h @@ -48,8 +48,23 @@ class TableResultSet : public ResultSet { void close() override; int get_next_tsblock(common::TsBlock*& block) override; + // Fast typed accessors: read straight from the current TsBlock vector + // without going through RowRecord/Field. Caller is expected to have + // checked is_null() — when the cell is null the underlying buffer pointer + // is nullptr and these return a default (0 / 0.0 / false) without + // dereferencing it. + bool get_bool_at(uint32_t column_index) override; + int32_t get_int32_at(uint32_t column_index) override; + int64_t get_int64_at(uint32_t column_index) override; + float get_float_at(uint32_t column_index) override; + double get_double_at(uint32_t column_index) override; + private: void init(); + // Lazy materialization: fill row_record_ from the current row when a + // caller actually requests the RowRecord (or a non-fast accessor). + void materialize_current_row(); + std::unique_ptr tsblock_reader_; common::RowIterator* row_iterator_ = nullptr; common::TsBlock* tsblock_ = nullptr; @@ -58,6 +73,11 @@ class TableResultSet : public ResultSet { std::vector column_names_; std::vector data_types_; const int return_mode_; + bool closed_ = false; + // True when row_iterator_ points at a row that hasn't been consumed yet. + bool row_ready_ = false; + // True when row_record_ has been populated for the current row. + bool row_materialized_ = false; }; } // namespace storage -#endif // TABLE_RESULT_SET_H \ No newline at end of file +#endif // TABLE_RESULT_SET_H diff --git a/cpp/src/reader/task/device_query_task.cc b/cpp/src/reader/task/device_query_task.cc index c7e7091ff..6345c93fa 100644 --- a/cpp/src/reader/task/device_query_task.cc +++ b/cpp/src/reader/task/device_query_task.cc @@ -19,6 +19,8 @@ #include "reader/task/device_query_task.h" +#include "common/tsfile_common.h" + namespace storage { DeviceQueryTask* DeviceQueryTask::create_device_query_task( std::shared_ptr device_id, std::vector column_names, @@ -34,8 +36,14 @@ DeviceQueryTask* DeviceQueryTask::create_device_query_task( } DeviceQueryTask::~DeviceQueryTask() { - if (index_root_) { + // index_root_ was placement-new'd into DeviceMetaIterator's PageArena and + // ownership transferred here via DeviceMetaIterator::next; the arena only + // frees raw bytes, so we must invoke the destructor explicitly to release + // the heap-allocated children_ vector and its nested shared_ptr graph + // (DeviceMetaIndexEntry -> StringArrayDeviceID). + if (index_root_ != nullptr) { index_root_->~MetaIndexNode(); + index_root_ = nullptr; } } diff --git a/cpp/src/reader/task/device_task_iterator.cc b/cpp/src/reader/task/device_task_iterator.cc index dbe763303..e22fefb06 100644 --- a/cpp/src/reader/task/device_task_iterator.cc +++ b/cpp/src/reader/task/device_task_iterator.cc @@ -37,6 +37,9 @@ int DeviceTaskIterator::next(DeviceQueryTask*& task) { task = DeviceQueryTask::create_device_query_task( device_meta_pair.first, column_names_, column_mapping_, device_meta_pair.second, table_schema_, pa_); + if (task != nullptr) { + created_tasks_.push_back(task); + } } return ret; } diff --git a/cpp/src/reader/task/device_task_iterator.h b/cpp/src/reader/task/device_task_iterator.h index 061711c17..cc5a75562 100644 --- a/cpp/src/reader/task/device_task_iterator.h +++ b/cpp/src/reader/task/device_task_iterator.h @@ -58,7 +58,17 @@ class DeviceTaskIterator { pa_.init(512, common::MOD_DEVICE_TASK_ITER); } - ~DeviceTaskIterator() { pa_.destroy(); } + ~DeviceTaskIterator() { + // The tasks are placement-new'd into pa_ memory; pa_.destroy() only + // releases the raw bytes, so we must call their destructors here to + // release the heap-allocated members (std::vector, + // shared_ptr's, etc.) they own. + for (DeviceQueryTask* t : created_tasks_) { + t->~DeviceQueryTask(); + } + created_tasks_.clear(); + pa_.destroy(); + } void flush_remaining_device_meta_cache(); @@ -72,6 +82,7 @@ class DeviceTaskIterator { std::unique_ptr device_meta_iterator_; std::shared_ptr table_schema_; common::PageArena pa_; + std::vector created_tasks_; }; } // namespace storage diff --git a/cpp/src/reader/tsfile_reader.cc b/cpp/src/reader/tsfile_reader.cc index 8d9d9b5dc..540674f33 100644 --- a/cpp/src/reader/tsfile_reader.cc +++ b/cpp/src/reader/tsfile_reader.cc @@ -94,8 +94,7 @@ namespace storage { TsFileReader::TsFileReader() : read_file_(nullptr), tsfile_executor_(nullptr), - table_query_executor_(nullptr), - table_query_executor_batch_size_(0) { + table_query_executor_(nullptr) { tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER); } @@ -113,6 +112,22 @@ int TsFileReader::open(const std::string& file_path) { return ret; } +int TsFileReader::ensure_table_query_executor(int batch_size) { + if (table_query_executor_ != nullptr && + table_query_executor_batch_size_ == batch_size) { + return E_OK; + } + + if (table_query_executor_ != nullptr) { + delete table_query_executor_; + table_query_executor_ = nullptr; + } + + table_query_executor_ = new TableQueryExecutor(read_file_, batch_size); + table_query_executor_batch_size_ = batch_size; + return E_OK; +} + int TsFileReader::close() { int ret = E_OK; if (tsfile_executor_ != nullptr) { @@ -123,7 +138,6 @@ int TsFileReader::close() { delete table_query_executor_; table_query_executor_ = nullptr; } - table_query_executor_batch_size_ = 0; if (read_file_ != nullptr) { read_file_->close(); delete read_file_; @@ -132,22 +146,6 @@ int TsFileReader::close() { return ret; } -int TsFileReader::ensure_table_query_executor(int batch_size) { - if (table_query_executor_ != nullptr && - table_query_executor_batch_size_ == batch_size) { - return E_OK; - } - - if (table_query_executor_ != nullptr) { - delete table_query_executor_; - table_query_executor_ = nullptr; - } - - table_query_executor_ = new TableQueryExecutor(read_file_, batch_size); - table_query_executor_batch_size_ = batch_size; - return E_OK; -} - int TsFileReader::query(QueryExpression* qe, ResultSet*& ret_qds) { return tsfile_executor_->execute(qe, ret_qds); } @@ -411,16 +409,21 @@ int TsFileReader::get_timeseries_schema( device_id, timeseries_indexs, pa))) { } else { for (auto timeseries_index : timeseries_indexs) { - auto* aligned_timeseries_index = - dynamic_cast(timeseries_index); - auto data_type = - aligned_timeseries_index != nullptr && - aligned_timeseries_index->value_ts_idx_ != nullptr - ? aligned_timeseries_index->value_ts_idx_->get_data_type() - : timeseries_index->get_data_type(); + // AlignedTimeseriesIndex::get_data_type() returns the time + // column type (VECTOR) so the aligned/non-aligned dispatch in + // SSI can keep using the existing accessor. For schema + // exposure we need the actual value column type — without this + // unwrap, INT32/FLOAT/... would all surface as VECTOR. + common::TSDataType dt = timeseries_index->get_data_type(); + if (dt == common::VECTOR) { + auto* aligned = + dynamic_cast(timeseries_index); + if (aligned != nullptr && aligned->value_ts_idx_ != nullptr) { + dt = aligned->value_ts_idx_->get_data_type(); + } + } MeasurementSchema ms( - timeseries_index->get_measurement_name().to_std_string(), - data_type); + timeseries_index->get_measurement_name().to_std_string(), dt); result.push_back(ms); } } @@ -448,6 +451,15 @@ int TsFileReader::get_timeseries_metadata_impl( DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata( const std::vector>& device_ids) { + // Reset the shared meta arena up front: every call writes fresh + // timeseries-index metadata into it via _impl(), and the previous + // implementation only ever appended. A long-lived reader that repeats + // this query would grow tsfile_reader_meta_pa_ without bound (each call + // duplicates the per-device payload). Callers that need to retain prior + // results past this call must copy them out before invoking again — the + // shared_ptrs handed back use a noop deleter pointing into this arena. + tsfile_reader_meta_pa_.destroy(); + tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER); DeviceTimeseriesMetadataMap result; for (const auto& device_id : device_ids) { std::vector> list; @@ -466,6 +478,10 @@ DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata() { return result; } + // Same arena-reset rationale as the device_ids overload above. + tsfile_reader_meta_pa_.destroy(); + tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER); + PageArena pa; pa.init(512, MOD_TSFILE_READER); std::vector entries; diff --git a/cpp/src/reader/tsfile_reader.h b/cpp/src/reader/tsfile_reader.h index 19d83ec61..e2f9f3496 100644 --- a/cpp/src/reader/tsfile_reader.h +++ b/cpp/src/reader/tsfile_reader.h @@ -143,7 +143,6 @@ class TsFileReader { * @param offset Number of leading rows to skip (>= 0). * @param limit Maximum rows to return. < 0 means unlimited. * @param[out] result_set The result set containing query results. - * @param tag_filter Optional tag filter for filtering by tag columns. * @return Returns 0 on success, or a non-zero error code on failure. */ int queryByRow(const std::string& table_name, @@ -243,8 +242,10 @@ class TsFileReader { storage::ReadFile* read_file_; storage::TsFileExecutor* tsfile_executor_; storage::TableQueryExecutor* table_query_executor_; - int table_query_executor_batch_size_; + int table_query_executor_batch_size_ = -1; common::PageArena tsfile_reader_meta_pa_; + // Test-only hook for the unbounded-arena-growth regression check. + friend class TsFileReaderMetaArenaTest; }; } // namespace storage diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc index 1d666bfc0..538b00d43 100644 --- a/cpp/src/reader/tsfile_series_scan_iterator.cc +++ b/cpp/src/reader/tsfile_series_scan_iterator.cc @@ -19,13 +19,37 @@ #include "reader/tsfile_series_scan_iterator.h" +#include + +#include "common/global.h" +#ifdef ENABLE_THREADS +#include "common/thread_pool.h" +#endif + using namespace common; namespace storage { void TsFileSeriesScanIterator::destroy() { + // MultiAlignedTimeseriesIndex is placement-new'd inside + // timeseries_index_pa_ (see TsFileIOReader::alloc_multi_ssi). The arena's + // destroy() frees raw memory without running destructors, so its + // value_ts_idxs_ std::vector backing buffer would leak. Release it + // explicitly before tearing down the arena. dynamic_cast is null-safe and + // returns nullptr for the single-value / non-aligned index types, which own + // no separate heap storage. + if (auto* multi = + dynamic_cast(itimeseries_index_)) { + std::vector().swap(multi->value_ts_idxs_); + } + itimeseries_index_ = nullptr; timeseries_index_pa_.destroy(); if (chunk_reader_ != nullptr) { + // destroy() already runs manual destructors on internal members + // (chunk_header_, decoders, compressor, ...), so calling + // chunk_reader_->~IChunkReader() here would double-destruct them. + // The vector-buffer leaks (e.g. chunk_pages_) are released inside + // AlignedChunkReader::destroy() via vector<>{}.swap(). chunk_reader_->destroy(); common::mem_free(chunk_reader_); chunk_reader_ = nullptr; @@ -65,20 +89,24 @@ bool TsFileSeriesScanIterator::should_skip_aligned_chunk_by_offset( if (row_offset_ <= 0) { return false; } - if (time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) { + // Aligned value chunks' statistic_->count_ only counts non-null rows, + // not total rows. Using value_cm alone could skip an entire 100-row + // chunk for an offset of 10 just because it has 10 non-null values. + // Only apply the whole-chunk shortcut when time and value statistics + // agree on the row count (i.e. no sparse nulls in this chunk); fall + // through to per-page/per-row handling otherwise so the offset is + // applied against the real row stream. + if (time_cm == nullptr || value_cm == nullptr || + time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) { return false; } int32_t tc = time_cm->statistic_->count_; int32_t vc = value_cm->statistic_->count_; - if (tc <= 0 || vc <= 0) { - return false; - } - if (tc != vc) { + if (tc <= 0 || vc <= 0 || tc != vc) { return false; } - int32_t count = tc; - if (row_offset_ >= count) { - row_offset_ -= count; + if (row_offset_ >= tc) { + row_offset_ -= tc; return true; } return false; @@ -91,74 +119,104 @@ int TsFileSeriesScanIterator::get_next(TsBlock*& ret_tsblock, bool alloc, Filter* filter = (oneshoot_filter != nullptr) ? oneshoot_filter : time_filter_; + // When get_next_page() reports E_NO_MORE_DATA but the chunk reader + // still claims has_more_data() (an aligned-chunk artifact where time + // and value pages report state differently), a bare `continue` would + // retry the exhausted chunk forever. Force the next iteration to + // advance to the next chunk-meta cursor instead. bool force_load_next_chunk = false; while (true) { - // When get_next_page() reports no more data for the current chunk but - // metadata still lists more chunks, we must load the next chunk. A - // bare continue would retry the exhausted reader forever if - // has_more_data() still returns true (e.g. aligned chunk state). if (!chunk_reader_->has_more_data() || force_load_next_chunk) { force_load_next_chunk = false; while (true) { if (!has_next_chunk()) { return E_NO_MORE_DATA; + } else if (is_multi_value_) { + // Multi-value aligned path + ChunkMeta* time_cm = time_chunk_meta_cursor_.get(); + std::vector value_cms; + value_cms.reserve(value_chunk_meta_cursors_.size()); + for (auto& cur : value_chunk_meta_cursors_) { + value_cms.push_back(cur.get()); + } + advance_to_next_chunk(); + // Skip chunk by time filter using time chunk statistics. + if (filter != nullptr && time_cm->statistic_ != nullptr && + !filter->satisfy(time_cm->statistic_)) { + continue; + } + if (should_skip_chunk_by_time(time_cm, min_time_hint)) { + continue; + } + chunk_reader_->reset(); + auto* acr = static_cast(chunk_reader_); + if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm, + value_cms))) { + } + break; + } else if (!is_aligned_) { + ChunkMeta* cm = get_current_chunk_meta(); + advance_to_next_chunk(); + if (filter != nullptr && cm->statistic_ != nullptr && + !filter->satisfy(cm->statistic_)) { + continue; + } + // Skip by min_time_hint (merge cursor). + if (should_skip_chunk_by_time(cm, min_time_hint)) { + continue; + } + // Single-path: skip entire chunk by offset using count. + if (should_skip_chunk_by_offset(cm)) { + continue; + } + chunk_reader_->reset(); + if (RET_FAIL(chunk_reader_->load_by_meta(cm))) { + } + break; } else { - if (!is_aligned_) { - ChunkMeta* cm = get_current_chunk_meta(); - advance_to_next_chunk(); - // Skip by time filter. - if (filter != nullptr && cm->statistic_ != nullptr && - !filter->satisfy(cm->statistic_)) { - continue; - } - // Skip by min_time_hint (merge cursor). - if (should_skip_chunk_by_time(cm, min_time_hint)) { - continue; - } - // Single-path: skip entire chunk by offset using count. - if (should_skip_chunk_by_offset(cm)) { - continue; - } - chunk_reader_->reset(); - if (RET_FAIL(chunk_reader_->load_by_meta(cm))) { - } - break; - } else { - ChunkMeta* value_cm = value_chunk_meta_cursor_.get(); - ChunkMeta* time_cm = time_chunk_meta_cursor_.get(); - advance_to_next_chunk(); - if (filter != nullptr && - value_cm->statistic_ != nullptr && - !filter->satisfy(value_cm->statistic_)) { - continue; - } - if (should_skip_chunk_by_time(value_cm, - min_time_hint)) { - continue; - } - if (should_skip_aligned_chunk_by_offset(time_cm, - value_cm)) { - continue; - } - chunk_reader_->reset(); - if (RET_FAIL(chunk_reader_->load_by_aligned_meta( - time_cm, value_cm))) { - } - break; + ChunkMeta* value_cm = value_chunk_meta_cursor_.get(); + ChunkMeta* time_cm = time_chunk_meta_cursor_.get(); + advance_to_next_chunk(); + // Use time chunk statistics for time-based filtering. + ChunkMeta* filter_cm = + (time_cm->statistic_ != nullptr) ? time_cm : value_cm; + if (filter != nullptr && filter_cm->statistic_ != nullptr && + !filter->satisfy(filter_cm->statistic_)) { + continue; + } + if (should_skip_chunk_by_time(filter_cm, min_time_hint)) { + continue; } + if (should_skip_aligned_chunk_by_offset(time_cm, + value_cm)) { + continue; + } + chunk_reader_->reset(); + if (RET_FAIL(chunk_reader_->load_by_aligned_meta( + time_cm, value_cm))) { + } + break; } } } if (IS_SUCC(ret)) { if (alloc && ret_tsblock == nullptr) { - ret_tsblock = alloc_tsblock(); + ret_tsblock = + is_multi_value_ ? alloc_tsblock_multi() : alloc_tsblock(); } ret = chunk_reader_->get_next_page(ret_tsblock, filter, *data_pa_, min_time_hint, row_offset_, row_limit_); } + if (ret == common::E_NO_MORE_DATA && ret_tsblock != nullptr && + ret_tsblock->get_row_count() > 0) { + return E_OK; + } // When current chunk is exhausted (e.g. all pages skipped by offset) - // but there are more chunks, load next chunk and retry. + // but there are more chunks, load next chunk and retry. Set the + // force flag so the next iteration bypasses has_more_data() (which + // can still report true on an aligned chunk that has actually + // yielded all its rows). if (ret == common::E_NO_MORE_DATA && has_next_chunk()) { ret = E_OK; force_load_next_chunk = true; @@ -179,9 +237,19 @@ void TsFileSeriesScanIterator::revert_tsblock() { int TsFileSeriesScanIterator::init_chunk_reader() { int ret = E_OK; is_aligned_ = itimeseries_index_->is_aligned(); + + // Check if this is a multi-value aligned index. alloc_multi_ssi() creates + // MultiAlignedTimeseriesIndex even when the query selects one value column, + // so keep that path consistent with wider aligned reads. + if (is_aligned_ && dynamic_cast( + itimeseries_index_) != nullptr) { + return init_chunk_reader_multi(); + } + if (!is_aligned_) { void* buf = common::mem_alloc(sizeof(ChunkReader), common::MOD_CHUNK_READER); + if (IS_NULL(buf)) return E_OOM; chunk_reader_ = new (buf) ChunkReader; chunk_meta_cursor_ = itimeseries_index_->get_chunk_meta_list()->begin(); if (RET_FAIL(chunk_reader_->init( @@ -191,6 +259,7 @@ int TsFileSeriesScanIterator::init_chunk_reader() { } else { void* buf = common::mem_alloc(sizeof(AlignedChunkReader), common::MOD_CHUNK_READER); + if (IS_NULL(buf)) return E_OOM; chunk_reader_ = new (buf) AlignedChunkReader; time_chunk_meta_cursor_ = itimeseries_index_->get_time_chunk_meta_list()->begin(); @@ -205,6 +274,96 @@ int TsFileSeriesScanIterator::init_chunk_reader() { return ret; } +int TsFileSeriesScanIterator::init_chunk_reader_multi() { + int ret = E_OK; + is_multi_value_ = true; + + void* buf = + common::mem_alloc(sizeof(AlignedChunkReader), common::MOD_CHUNK_READER); + if (IS_NULL(buf)) { + // The single-value path (init_chunk_reader) silently dereferenced + // the null pointer on OOM; this path is new in the multi-value + // reader work and would do the same via placement-new(nullptr) → + // undefined behavior the moment any AlignedChunkReader field is + // touched. Surface E_OOM instead. + is_multi_value_ = false; + return E_OOM; + } + auto* acr = new (buf) AlignedChunkReader; + chunk_reader_ = acr; + + uint32_t num_cols = itimeseries_index_->get_value_column_count(); +#ifdef ENABLE_THREADS + // Borrow the single process-wide worker pool (created in init_common()) for + // multi-column decode. Null when libtsfile_init() hasn't run; combined + // with parallel_read_enabled_ this gates the parallel decode path — the + // reader falls back to serial decode otherwise. + if (num_cols > 1 && common::g_config_value_.parallel_read_enabled_ && + common::g_thread_pool_ != nullptr) { + acr->set_decode_pool(common::g_thread_pool_); + } +#endif + + // Per-column chunk lists must align 1:1 with the time chunk list: + // load_by_aligned_meta_multi pairs them by index and the downstream + // reader has no notion of a "missing" value chunk for a CGM. If a + // file evolved its schema and some column has fewer (or more) chunks + // than the time column, naive index pairing would mate chunks from + // different chunk groups, returning garbage and dereferencing past + // end() once the shorter list ran out. Refuse upfront with a clear + // error rather than producing wrong data. + uint32_t time_chunk_count = + itimeseries_index_->get_time_chunk_meta_list()->size(); + for (uint32_t c = 0; c < num_cols; c++) { + if (itimeseries_index_->get_value_chunk_meta_list(c)->size() != + time_chunk_count) { + return E_NOT_SUPPORT; + } + } + + // Init time cursor + time_chunk_meta_cursor_ = + itimeseries_index_->get_time_chunk_meta_list()->begin(); + + // Init all value cursors + value_chunk_meta_cursors_.resize(num_cols); + for (uint32_t c = 0; c < num_cols; c++) { + value_chunk_meta_cursors_[c] = + itimeseries_index_->get_value_chunk_meta_list(c)->begin(); + } + + // Init chunk reader + if (RET_FAIL( + acr->init(read_file_, itimeseries_index_->get_measurement_name(), + itimeseries_index_->get_data_type(), time_filter_))) { + return ret; + } + + // No chunks → nothing to load; iteration short-circuits via + // has_next_chunk() returning false. + if (time_chunk_count == 0) { + return ret; + } + + // Load first chunk set + ChunkMeta* time_cm = time_chunk_meta_cursor_.get(); + std::vector value_cms; + value_cms.reserve(num_cols); + for (uint32_t c = 0; c < num_cols; c++) { + value_cms.push_back(value_chunk_meta_cursors_[c].get()); + } + + if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm, value_cms))) { + return ret; + } + + // Advance cursors + time_chunk_meta_cursor_++; + for (auto& cur : value_chunk_meta_cursors_) cur++; + + return ret; +} + TsBlock* TsFileSeriesScanIterator::alloc_tsblock() { ChunkHeader& ch = chunk_reader_->get_chunk_header(); @@ -225,4 +384,29 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock() { return tsblock_; } -} // end namespace storage \ No newline at end of file +TsBlock* TsFileSeriesScanIterator::alloc_tsblock_multi() { + auto* acr = static_cast(chunk_reader_); + + // Time column + ColumnSchema time_cd("time", common::INT64, common::SNAPPY, + common::TS_2DIFF); + tuple_desc_.push_back(time_cd); + + // Value columns + uint32_t num_cols = acr->get_value_column_count(); + for (uint32_t c = 0; c < num_cols; c++) { + ChunkHeader& ch = acr->get_value_chunk_header(c); + ColumnSchema value_cd(ch.measurement_name_, ch.data_type_, + ch.compression_type_, ch.encoding_type_); + tuple_desc_.push_back(value_cd); + } + + tsblock_ = new TsBlock(&tuple_desc_); + if (E_OK != tsblock_->init()) { + delete tsblock_; + tsblock_ = nullptr; + } + return tsblock_; +} + +} // end namespace storage diff --git a/cpp/src/reader/tsfile_series_scan_iterator.h b/cpp/src/reader/tsfile_series_scan_iterator.h index 9e790a3d1..77037d8e1 100644 --- a/cpp/src/reader/tsfile_series_scan_iterator.h +++ b/cpp/src/reader/tsfile_series_scan_iterator.h @@ -50,6 +50,7 @@ class TsFileSeriesScanIterator { tsblock_(nullptr), time_filter_(nullptr), is_aligned_(false), + is_multi_value_(false), row_offset_(0), row_limit_(-1) {} ~TsFileSeriesScanIterator() { destroy(); } @@ -93,11 +94,42 @@ class TsFileSeriesScanIterator { int64_t min_time_hint = std::numeric_limits::min()); void revert_tsblock(); + // Multi-value: number of value columns in the TsBlock + uint32_t get_value_column_count() const { + if (is_multi_value_ && chunk_reader_) { + auto* acr = static_cast(chunk_reader_); + return acr->get_value_column_count(); + } + return 1; + } + + bool is_multi_value() const { return is_multi_value_; } + friend class TsFileIOReader; private: int init_chunk_reader(); + int init_chunk_reader_multi(); FORCE_INLINE bool has_next_chunk() const { + if (is_multi_value_) { + // Anchor on the time chunk list and require every value column + // to still have a chunk available. Checking only value[0] used + // to read past end() for columns with fewer chunks (e.g. a + // column added after some chunk groups had already been + // flushed), which dereferenced freed memory and paired the + // wrong time/value chunks. + if (time_chunk_meta_cursor_ == + itimeseries_index_->get_time_chunk_meta_list()->end()) { + return false; + } + for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) { + if (value_chunk_meta_cursors_[c] == + itimeseries_index_->get_value_chunk_meta_list(c)->end()) { + return false; + } + } + return true; + } if (is_aligned_) { return value_chunk_meta_cursor_ != itimeseries_index_->get_value_chunk_meta_list()->end(); @@ -107,7 +139,21 @@ class TsFileSeriesScanIterator { } } FORCE_INLINE void advance_to_next_chunk() { - if (is_aligned_) { + if (is_multi_value_) { + // Guard each cursor against advancing past end(). Same defense + // as has_next_chunk(): per-column chunk counts can diverge in + // files with schema evolution. + auto time_end = + itimeseries_index_->get_time_chunk_meta_list()->end(); + if (time_chunk_meta_cursor_ != time_end) time_chunk_meta_cursor_++; + for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) { + auto end = + itimeseries_index_->get_value_chunk_meta_list(c)->end(); + if (value_chunk_meta_cursors_[c] != end) { + value_chunk_meta_cursors_[c]++; + } + } + } else if (is_aligned_) { time_chunk_meta_cursor_++; value_chunk_meta_cursor_++; } else { @@ -119,15 +165,10 @@ class TsFileSeriesScanIterator { } bool should_skip_chunk_by_time(ChunkMeta* cm, int64_t min_time_hint); bool should_skip_chunk_by_offset(ChunkMeta* cm); - /** - * Aligned (VECTOR): whole-chunk skip by row count is only safe when the - * time ChunkMeta and value ChunkMeta agree on statistic count (>0). If - * either side lacks count or counts differ, skip is disabled for this - * chunk; pages are loaded and page/row-level offset handling applies. - */ bool should_skip_aligned_chunk_by_offset(ChunkMeta* time_cm, ChunkMeta* value_cm); common::TsBlock* alloc_tsblock(); + common::TsBlock* alloc_tsblock_multi(); private: ReadFile* read_file_; @@ -140,12 +181,16 @@ class TsFileSeriesScanIterator { common::SimpleList::Iterator chunk_meta_cursor_; common::SimpleList::Iterator time_chunk_meta_cursor_; common::SimpleList::Iterator value_chunk_meta_cursor_; + // Multi-value: one cursor per value column + std::vector::Iterator> + value_chunk_meta_cursors_; IChunkReader* chunk_reader_; common::TupleDesc tuple_desc_; common::TsBlock* tsblock_; Filter* time_filter_; bool is_aligned_ = false; + bool is_multi_value_ = false; int row_offset_; int row_limit_; }; diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h index 4ffc4d138..b3cb1943e 100644 --- a/cpp/src/utils/db_utils.h +++ b/cpp/src/utils/db_utils.h @@ -195,8 +195,6 @@ struct ColumnSchema { }; FORCE_INLINE int64_t get_cur_timestamp() { - // Milliseconds since the Unix epoch. Uses the C++11 standard library so it - // is portable across platforms (gettimeofday is not available on MSVC). return std::chrono::duration_cast( std::chrono::system_clock::now().time_since_epoch()) .count(); diff --git a/cpp/src/writer/chunk_writer.cc b/cpp/src/writer/chunk_writer.cc index da1811336..acdb4951d 100644 --- a/cpp/src/writer/chunk_writer.cc +++ b/cpp/src/writer/chunk_writer.cc @@ -138,6 +138,9 @@ int ChunkWriter::seal_cur_page(bool end_chunk) { void ChunkWriter::save_first_page_data(PageWriter& first_page_writer) { first_page_data_ = first_page_writer.get_cur_page_data(); first_page_statistic_->deep_copy_from(first_page_writer.get_statistic()); + // See ValueChunkWriter::save_first_page_data: avoid double-free on the + // shallow-copied buffer pointers. + first_page_writer.release_cur_page_data(); } int ChunkWriter::write_first_page_data(ByteStream& pages_data, diff --git a/cpp/src/writer/chunk_writer.h b/cpp/src/writer/chunk_writer.h index 6eb3f5418..a65f0537f 100644 --- a/cpp/src/writer/chunk_writer.h +++ b/cpp/src/writer/chunk_writer.h @@ -103,6 +103,68 @@ class ChunkWriter { CW_DO_WRITE_FOR_TYPE(); } + template + int write_batch(const int64_t* timestamps, const T* values, + uint32_t count) { + int ret = common::E_OK; + uint32_t offset = 0; + const uint32_t page_cap = + common::g_config_value_.page_writer_max_point_num_; + while (offset < count) { + uint32_t cur_points = page_writer_.get_point_numer(); + // Seal whenever cur_points is at or past the cap; the counter is + // size_ (rows including the just-written batch) and may exceed + // page_cap, so a plain subtraction would underflow uint32_t. + if (cur_points >= page_cap) { + if (RET_FAIL(seal_cur_page(false))) { + return ret; + } + cur_points = 0; + } + uint32_t page_remaining = page_cap - cur_points; + uint32_t batch_size = std::min(count - offset, page_remaining); + if (RET_FAIL(page_writer_.write_batch( + timestamps + offset, values + offset, batch_size))) { + return ret; + } + offset += batch_size; + if (RET_FAIL(seal_cur_page_if_full())) { + return ret; + } + } + return ret; + } + + int write_string_batch(const int64_t* timestamps, const char* buffer, + const uint32_t* offsets, uint32_t start_idx, + uint32_t count) { + int ret = common::E_OK; + uint32_t offset = 0; + const uint32_t page_cap = + common::g_config_value_.page_writer_max_point_num_; + while (offset < count) { + uint32_t cur_points = page_writer_.get_point_numer(); + if (cur_points >= page_cap) { + if (RET_FAIL(seal_cur_page(false))) { + return ret; + } + cur_points = 0; + } + uint32_t page_remaining = page_cap - cur_points; + uint32_t batch_size = std::min(count - offset, page_remaining); + if (RET_FAIL(page_writer_.write_string_batch( + timestamps + offset, buffer, offsets, start_idx + offset, + batch_size))) { + return ret; + } + offset += batch_size; + if (RET_FAIL(seal_cur_page_if_full())) { + return ret; + } + } + return ret; + } + int end_encode_chunk(); common::ByteStream& get_chunk_data() { return chunk_data_; } Statistic* get_chunk_statistic() { return chunk_statistic_; } diff --git a/cpp/src/writer/page_writer.cc b/cpp/src/writer/page_writer.cc index 7766e14c4..eebe5b400 100644 --- a/cpp/src/writer/page_writer.cc +++ b/cpp/src/writer/page_writer.cc @@ -126,6 +126,11 @@ void PageWriter::reset() { } time_out_stream_.reset(); value_out_stream_.reset(); + // Without this, a page that was poisoned by a mid-batch encode failure + // would stay refused forever even after ChunkWriter calls reset() to + // start a fresh page — `partial_failure_` would still be true and + // write_to_chunk() would return E_DATA_INCONSISTENCY indefinitely. + partial_failure_ = false; } void PageWriter::destroy() { @@ -156,6 +161,14 @@ int PageWriter::write_to_chunk(ByteStream& pages_data, bool write_header, << pages_data.total_size() << " of chunk_data." << std::endl; #endif int ret = E_OK; + // Refuse to seal a page whose time and value streams diverged because of + // a mid-batch encode failure (see PageWriter::write_batch). The higher + // layer (TsFileWriter::unrecoverable_) is the authoritative place to + // surface this to the caller; this guard prevents a misaligned page from + // ever entering the chunk stream. + if (UNLIKELY(partial_failure_)) { + return common::E_DATA_INCONSISTENCY; + } if (RET_FAIL(prepare_end_page())) { return ret; } diff --git a/cpp/src/writer/page_writer.h b/cpp/src/writer/page_writer.h index d3966d865..47c958913 100644 --- a/cpp/src/writer/page_writer.h +++ b/cpp/src/writer/page_writer.h @@ -150,10 +150,63 @@ class PageWriter { PW_DO_WRITE_FOR_TYPE(); } + template + FORCE_INLINE int write_batch(const int64_t* timestamps, const T* values, + uint32_t count) { + int ret = common::E_OK; + if (count == 0) return ret; + if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY; + if (RET_FAIL(time_encoder_->encode_batch(timestamps, count, + time_out_stream_))) { + // Time stream wasn't advanced (encode_batch is atomic w.r.t. the + // stream cursor on failure for these encoders) — leave the page + // intact so the caller can retry. + } else if (RET_FAIL(value_encoder_->encode_batch(values, count, + value_out_stream_))) { + // Time stream already advanced; we can't roll it back here. + // Mark the page poisoned so write_to_chunk() refuses to seal a + // page where time and value rows are out of sync. + partial_failure_ = true; + } else { + statistic_->update_batch(timestamps, values, count); + } + return ret; + } + + // Batch write strings from Arrow-style offset+buffer layout. + FORCE_INLINE int write_string_batch(const int64_t* timestamps, + const char* buffer, + const uint32_t* offsets, + uint32_t start_idx, uint32_t count) { + int ret = common::E_OK; + if (count == 0) return ret; + if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY; + if (RET_FAIL(time_encoder_->encode_batch(timestamps, count, + time_out_stream_))) { + } else if (RET_FAIL(value_encoder_->encode_string_batch( + buffer, offsets, start_idx, count, value_out_stream_))) { + partial_failure_ = true; + } else { + for (uint32_t i = 0; i < count; i++) { + uint32_t idx = start_idx + i; + uint32_t len = offsets[idx + 1] - offsets[idx]; + common::String val(buffer + offsets[idx], len); + statistic_->update(timestamps[i], val); + } + } + return ret; + } + + FORCE_INLINE bool has_partial_failure() const { return partial_failure_; } + FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; } FORCE_INLINE uint32_t get_time_out_stream_size() const { return time_out_stream_.total_size(); } + // Logical bytes written — used by the page-seal-when-full heuristic. + // Memory-pressure accounting should use estimate_max_mem_size() below, + // which reflects the real 64 KiB-page footprint of the underlying + // ByteStreams. FORCE_INLINE uint32_t get_page_memory_size() const { return time_out_stream_.total_size() + value_out_stream_.total_size(); } @@ -162,10 +215,17 @@ class PageWriter { * outputStream and value outputStream, because size outputStream is never * used until flushing. * + * Reports the *allocated* stream footprint (sum of backing 64 KiB pages) + * rather than the logical bytes written. Sparse workloads with many + * measurements would otherwise look like they hold ~0 memory while + * actually pinning a full 64 KiB page per stream, so chunk-group memory + * thresholds couldn't keep peak memory under the configured cap. + * * @return allocated size in time, value and outputStream */ FORCE_INLINE uint32_t estimate_max_mem_size() const { - return time_out_stream_.total_size() + value_out_stream_.total_size() + + return static_cast(time_out_stream_.allocated_bytes() + + value_out_stream_.allocated_bytes()) + time_encoder_->get_max_byte_size() + value_encoder_->get_max_byte_size(); } @@ -179,6 +239,11 @@ class PageWriter { } FORCE_INLINE Statistic* get_statistic() { return statistic_; } PageData get_cur_page_data() { return cur_page_data_; } + // See ValuePageWriter::release_cur_page_data for rationale. + void release_cur_page_data() { + cur_page_data_.uncompressed_buf_ = nullptr; + cur_page_data_.compressed_buf_ = nullptr; + } void destroy_page_data() { cur_page_data_.destroy(); } private: @@ -193,7 +258,6 @@ class PageWriter { common::ByteStream& pages_data); private: - // static const uint32_t OUT_STREAM_PAGE_SIZE = 48; static const uint32_t OUT_STREAM_PAGE_SIZE = 1024; private: @@ -206,6 +270,11 @@ class PageWriter { PageData cur_page_data_; Compressor* compressor_; bool is_inited_; + // Set when write_batch advanced the time stream but value encoding + // failed. We can't unwind the partial time write, so refuse further + // writes and surface the poisoning to the higher layer via + // write_to_chunk(). + bool partial_failure_ = false; }; } // end namespace storage diff --git a/cpp/src/writer/time_chunk_writer.cc b/cpp/src/writer/time_chunk_writer.cc index 0c7e3b212..0a0623686 100644 --- a/cpp/src/writer/time_chunk_writer.cc +++ b/cpp/src/writer/time_chunk_writer.cc @@ -144,6 +144,9 @@ int TimeChunkWriter::seal_cur_page(bool end_chunk) { void TimeChunkWriter::save_first_page_data(TimePageWriter& first_page_writer) { first_page_data_ = first_page_writer.get_cur_page_data(); first_page_statistic_->deep_copy_from(first_page_writer.get_statistic()); + // See ValueChunkWriter::save_first_page_data: avoid double-free on the + // shallow-copied buffer pointers. + first_page_writer.release_cur_page_data(); } int TimeChunkWriter::write_first_page_data(ByteStream& pages_data, @@ -173,9 +176,6 @@ int TimeChunkWriter::end_encode_chunk() { chunk_header_.data_size_ = chunk_data_.total_size(); chunk_header_.num_of_pages_ = num_of_pages_; } - } else if (num_of_pages_ > 0) { - chunk_header_.data_size_ = chunk_data_.total_size(); - chunk_header_.num_of_pages_ = num_of_pages_; } #if DEBUG_SE std::cout << "end_encode_time_chunk: num_of_pages_=" << num_of_pages_ diff --git a/cpp/src/writer/time_chunk_writer.h b/cpp/src/writer/time_chunk_writer.h index c67516ba5..e6b2894e2 100644 --- a/cpp/src/writer/time_chunk_writer.h +++ b/cpp/src/writer/time_chunk_writer.h @@ -42,8 +42,7 @@ class TimeChunkWriter { first_page_data_(), first_page_statistic_(nullptr), chunk_header_(), - num_of_pages_(0), - enable_page_seal_if_full_(true) {} + num_of_pages_(0) {} ~TimeChunkWriter() { destroy(); } int init(const common::ColumnSchema& col_schema); int init(const std::string& measurement_name, common::TSEncoding encoding, @@ -58,9 +57,35 @@ class TimeChunkWriter { if (RET_FAIL(time_page_writer_.write(timestamp))) { return ret; } - if (UNLIKELY(!enable_page_seal_if_full_)) { + if (RET_FAIL(seal_cur_page_if_full())) { return ret; - } else { + } + return ret; + } + + int write_batch(const int64_t* timestamps, uint32_t count) { + int ret = common::E_OK; + uint32_t offset = 0; + const uint32_t page_cap = + common::g_config_value_.page_writer_max_point_num_; + while (offset < count) { + uint32_t cur_points = time_page_writer_.get_point_numer(); + // Seal whenever cur_points is at or past the cap; the counter is + // size_ (rows including the just-written batch) and may exceed + // page_cap, so a plain subtraction would underflow uint32_t. + if (cur_points >= page_cap) { + if (RET_FAIL(seal_cur_page(false))) { + return ret; + } + cur_points = 0; + } + uint32_t page_remaining = page_cap - cur_points; + uint32_t batch_size = std::min(count - offset, page_remaining); + if (RET_FAIL(time_page_writer_.write_batch(timestamps + offset, + batch_size))) { + return ret; + } + offset += batch_size; if (RET_FAIL(seal_cur_page_if_full())) { return ret; } @@ -73,29 +98,25 @@ class TimeChunkWriter { Statistic* get_chunk_statistic() { return chunk_statistic_; } FORCE_INLINE int32_t num_of_pages() const { return num_of_pages_; } + int64_t estimate_max_series_mem_size(); + + bool hasData(); + // Current (unsealed) page point count. FORCE_INLINE uint32_t get_point_numer() const { return time_page_writer_.get_point_numer(); } - int64_t estimate_max_series_mem_size(); - - bool hasData(); - /** True if the current (unsealed) page has at least one point. */ bool has_current_page_data() const { return time_page_writer_.get_point_numer() > 0; } - /** - * Force seal the current page (for aligned model: when any aligned page - * seals due to memory/point threshold, all pages must seal together). - * @return E_OK on success. - */ + /** Force seal the current page. */ int seal_current_page() { return seal_cur_page(false); } - // For aligned writer: allow disabling the automatic page-size/point-number - // check so the caller can seal pages at chosen boundaries. + // Allow disabling the automatic page-size/point-number check so the + // caller can seal pages at chosen boundaries. FORCE_INLINE void set_enable_page_seal_if_full(bool enable) { enable_page_seal_if_full_ = enable; } @@ -109,6 +130,9 @@ class TimeChunkWriter { common::g_config_value_.page_writer_max_memory_bytes_); } FORCE_INLINE int seal_cur_page_if_full() { + if (UNLIKELY(!enable_page_seal_if_full_)) { + return common::E_OK; + } if (UNLIKELY(is_cur_page_full())) { return seal_cur_page(false); } @@ -138,8 +162,7 @@ class TimeChunkWriter { ChunkHeader chunk_header_; int32_t num_of_pages_; - // If false, write() won't auto-seal when the current page becomes full. - bool enable_page_seal_if_full_; + bool enable_page_seal_if_full_ = true; }; } // end namespace storage diff --git a/cpp/src/writer/time_page_writer.h b/cpp/src/writer/time_page_writer.h index d9dcecff1..bda9a5023 100644 --- a/cpp/src/writer/time_page_writer.h +++ b/cpp/src/writer/time_page_writer.h @@ -84,15 +84,40 @@ class TimePageWriter { return ret; } + int write_batch(const int64_t* timestamps, uint32_t count) { + int ret = common::E_OK; + if (count == 0) return ret; + // Check order: first timestamp vs existing end_time + if (statistic_->count_ != 0 && is_inited_ && + timestamps[0] <= statistic_->end_time_) { + return common::E_OUT_OF_ORDER; + } + // Check monotonicity within batch + for (uint32_t i = 1; i < count; i++) { + if (timestamps[i] <= timestamps[i - 1]) { + return common::E_OUT_OF_ORDER; + } + } + if (RET_FAIL(time_encoder_->encode_batch(timestamps, count, + time_out_stream_))) { + } else { + statistic_->update_time_batch(timestamps, count); + } + return ret; + } + FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; } FORCE_INLINE uint32_t get_time_out_stream_size() const { return time_out_stream_.total_size(); } + // Logical bytes written — used by the page-seal-when-full heuristic. FORCE_INLINE uint32_t get_page_memory_size() const { return time_out_stream_.total_size(); } + // Allocated 64 KiB-page footprint — used by chunk-group memory pressure + // accounting. See PageWriter::estimate_max_mem_size. FORCE_INLINE uint32_t estimate_max_mem_size() const { - return time_out_stream_.total_size() + + return static_cast(time_out_stream_.allocated_bytes()) + time_encoder_->get_max_byte_size(); } int write_to_chunk(common::ByteStream& pages_data, bool write_header, @@ -102,6 +127,11 @@ class TimePageWriter { } FORCE_INLINE Statistic* get_statistic() { return statistic_; } TimePageData get_cur_page_data() { return cur_page_data_; } + // See ValuePageWriter::release_cur_page_data for rationale. + void release_cur_page_data() { + cur_page_data_.uncompressed_buf_ = nullptr; + cur_page_data_.compressed_buf_ = nullptr; + } void destroy_page_data() { cur_page_data_.destroy(); } private: diff --git a/cpp/src/writer/tsfile_table_writer.cc b/cpp/src/writer/tsfile_table_writer.cc index eb0319af8..b1b7911bd 100644 --- a/cpp/src/writer/tsfile_table_writer.cc +++ b/cpp/src/writer/tsfile_table_writer.cc @@ -45,7 +45,7 @@ TsFileTableWriter::TsFileTableWriter( } // namespace storage -storage::TsFileTableWriter::~TsFileTableWriter() = default; +storage::TsFileTableWriter::~TsFileTableWriter() { close(); } int storage::TsFileTableWriter::register_table( const std::shared_ptr& table_schema) { @@ -66,21 +66,48 @@ int storage::TsFileTableWriter::write_table(storage::Tablet& tablet) const { tablet.get_table_name() != exclusive_table_name_) { return common::E_TABLE_NOT_EXIST; } + // Always lowercase the incoming tablet's table / column / schema-map + // names: each call may carry a fresh tablet with mixed-case identifiers, + // and the underlying engine expects lowercase. Lowering is idempotent so + // reusing the same tablet across calls remains cheap. tablet.set_table_name(to_lower(tablet.get_table_name())); for (size_t i = 0; i < tablet.get_column_count(); i++) { tablet.set_column_name(i, to_lower(tablet.get_column_name(i))); } auto schema_map = tablet.get_schema_map(); - std::map schema_map_; + std::map new_schema_map; for (auto iter = schema_map.begin(); iter != schema_map.end(); iter++) { - schema_map_[to_lower(iter->first)] = iter->second; + new_schema_map[to_lower(iter->first)] = iter->second; } - tablet.set_schema_map(schema_map_); + tablet.set_schema_map(new_schema_map); return tsfile_writer_->write_table(tablet); } -int storage::TsFileTableWriter::flush() { return tsfile_writer_->flush(); } +int storage::TsFileTableWriter::flush() { + if (closed_) { + return common::E_OK; + } + return tsfile_writer_->flush(); +} -int storage::TsFileTableWriter::close() { return tsfile_writer_->close(); } +int storage::TsFileTableWriter::close() { + if (closed_) { + return common::E_OK; + } + if (!tsfile_writer_) { + closed_ = true; + return common::E_OK; + } + // Don't latch closed_ until the underlying writer reports success: a + // failed footer write / sync / file close should be retryable, and the + // destructor must still be able to drive a final close attempt. The + // previous order returned E_OK on every retry after the first failure, + // potentially leaving the file unfinished and leaking the fd. + int ret = tsfile_writer_->close(); + if (ret == common::E_OK) { + closed_ = true; + } + return ret; +} diff --git a/cpp/src/writer/tsfile_table_writer.h b/cpp/src/writer/tsfile_table_writer.h index ce18bc007..a2d2a5fd9 100644 --- a/cpp/src/writer/tsfile_table_writer.h +++ b/cpp/src/writer/tsfile_table_writer.h @@ -124,6 +124,8 @@ class TsFileTableWriter { // Some errors may not be conveyed during the construction phase, so it's // necessary to maintain an internal error code. int error_number = common::E_OK; + + bool closed_ = false; }; } // namespace storage diff --git a/cpp/src/writer/tsfile_writer.cc b/cpp/src/writer/tsfile_writer.cc index bc3398d98..c469faaec 100644 --- a/cpp/src/writer/tsfile_writer.cc +++ b/cpp/src/writer/tsfile_writer.cc @@ -25,8 +25,12 @@ #include #endif +#include +#include + #include "chunk_writer.h" #include "common/config/config.h" +#include "common/global.h" #ifdef ENABLE_THREADS #include "common/thread_pool.h" #endif @@ -56,23 +60,19 @@ int libtsfile_init() { } void libtsfile_destroy() { + ModStat::get_instance().destroy(); #ifdef ENABLE_THREADS - delete common::g_write_thread_pool_; - common::g_write_thread_pool_ = nullptr; + delete common::g_thread_pool_; + common::g_thread_pool_ = nullptr; #endif - ModStat::get_instance().destroy(); libtsfile::g_s_is_inited = false; } -void set_page_max_point_count(uint32_t page_max_ponint_count) { - config_set_page_max_point_count(page_max_ponint_count); +int set_page_max_point_count(uint32_t page_max_ponint_count) { + return config_set_page_max_point_count(page_max_ponint_count); } -void set_max_degree_of_index_node(uint32_t max_degree_of_index_node) { - config_set_max_degree_of_index_node(max_degree_of_index_node); -} - -void set_strict_page_size(bool strict_page_size) { - config_set_strict_page_size(strict_page_size); +int set_max_degree_of_index_node(uint32_t max_degree_of_index_node) { + return config_set_max_degree_of_index_node(max_degree_of_index_node); } TsFileWriter::TsFileWriter() @@ -84,8 +84,7 @@ TsFileWriter::TsFileWriter() record_count_for_next_mem_check_( g_config_value_.record_count_for_next_mem_check_), write_file_created_(false), - io_writer_owned_(true), - enforce_recovered_last_time_order_(false) {} + io_writer_owned_(true) {} TsFileWriter::~TsFileWriter() { destroy(); } @@ -131,7 +130,19 @@ int TsFileWriter::init(WriteFile* write_file) { write_file_ = write_file; write_file_created_ = false; io_writer_owned_ = true; + // Re-arm per-lifecycle state when the writer is reused after a + // destroy(). enforce_recovered_last_time_order_ may have been set + // true by a previous recovery init; without resetting it we'd refuse + // valid writes whose timestamps don't satisfy a long-stale anchor. + // unrecoverable_ from a previous partial-write failure would otherwise + // make every operation on the new file fail immediately. + // start_file_done_ is true after the previous lifecycle's first flush, + // so without resetting it flush() would skip the magic/version write on + // the new file and produce headerless output. enforce_recovered_last_time_order_ = false; + unrecoverable_ = false; + start_file_done_ = false; + record_count_since_last_flush_ = 0; io_writer_ = new TsFileIOWriter(); io_writer_->init(write_file_); return E_OK; @@ -151,6 +162,10 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) { write_file_ = rw->get_write_file(); write_file_created_ = false; io_writer_owned_ = false; + // Clear any unrecoverable_ latched from a previous lifecycle so the + // re-init isn't immediately poisoned. + unrecoverable_ = false; + // Reject new writes whose timestamps fall back into the recovered range. enforce_recovered_last_time_order_ = true; io_writer_ = rw; @@ -188,6 +203,8 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) { if (cm == nullptr) { continue; } + // Track the highest end_time across recovered chunks so that + // appending writes can refuse out-of-order timestamps. if (cm->statistic_ != nullptr && cm->statistic_->count_ > 0) { group->last_time_ = std::max(group->last_time_, cm->statistic_->end_time_); @@ -682,6 +699,10 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() { return mem_total_size; } +int64_t TsFileWriter::calculate_meta_mem_size() const { + return io_writer_->get_meta_size(); +} + /** * check occupied memory size, if it exceeds the chunkGroupSize threshold, flush * them to given OutputStream. @@ -689,7 +710,15 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() { int TsFileWriter::check_memory_size_and_may_flush_chunks() { int ret = E_OK; if (record_count_since_last_flush_ >= record_count_for_next_mem_check_) { - int64_t mem_size = calculate_mem_size_for_all_group(); + // chunk-writer memory drops to ~0 after flush, but chunk metadata + // (ChunkMeta / ChunkGroupMeta / per-statistic PageArenas) keeps + // accumulating until end_file(). Wide-schema or many-flush + // workloads can pile up tens of MB of metadata that the old + // threshold check ignored entirely — flush would never fire even + // though total writer memory was well past chunk_group_size_threshold_. + int64_t chunk_size = calculate_mem_size_for_all_group(); + int64_t meta_size = calculate_meta_mem_size(); + int64_t mem_size = chunk_size + meta_size; record_count_for_next_mem_check_ = record_count_since_last_flush_ * common::g_config_value_.chunk_group_size_threshold_ / mem_size; @@ -701,16 +730,17 @@ int TsFileWriter::check_memory_size_and_may_flush_chunks() { } int TsFileWriter::write_record(const TsRecord& record) { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; auto device_id = std::make_shared(record.device_id_); - auto schema_it = schemas_.find(device_id); - if (schema_it == schemas_.end() || schema_it->second == nullptr) { - return E_DEVICE_NOT_EXIST; - } - MeasurementSchemaGroup* device_schema = schema_it->second; - if (enforce_recovered_last_time_order_ && - record.timestamp_ <= device_schema->last_time_) { - return E_OUT_OF_ORDER; + // After recovery, refuse writes whose timestamp would land at or before + // any already-flushed chunk's end_time for this device. + if (enforce_recovered_last_time_order_) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr && + record.timestamp_ <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; + } } // std::vector chunk_writers; SimpleVector chunk_writers; @@ -732,24 +762,28 @@ int TsFileWriter::write_record(const TsRecord& record) { record.points_[c]); } - device_schema->last_time_ = - std::max(device_schema->last_time_, record.timestamp_); + if (enforce_recovered_last_time_order_) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr) { + schema_it->second->last_time_ = + std::max(schema_it->second->last_time_, record.timestamp_); + } + } record_count_since_last_flush_++; ret = check_memory_size_and_may_flush_chunks(); return ret; } int TsFileWriter::write_record_aligned(const TsRecord& record) { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; auto device_id = std::make_shared(record.device_id_); - auto schema_it = schemas_.find(device_id); - if (schema_it == schemas_.end() || schema_it->second == nullptr) { - return E_DEVICE_NOT_EXIST; - } - MeasurementSchemaGroup* device_schema = schema_it->second; - if (enforce_recovered_last_time_order_ && - record.timestamp_ <= device_schema->last_time_) { - return E_OUT_OF_ORDER; + if (enforce_recovered_last_time_order_) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr && + record.timestamp_ <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; + } } SimpleVector value_chunk_writers; SimpleVector data_types; @@ -763,6 +797,8 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) { if (value_chunk_writers.size() != record.points_.size()) { return E_INVALID_ARG; } + // Snapshot page counters before the write so we can detect any column + // that crossed a page boundary and seal the rest in lockstep. int32_t time_pages_before = time_chunk_writer->num_of_pages(); std::vector value_pages_before(value_chunk_writers.size(), 0); for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { @@ -771,22 +807,40 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) { value_pages_before[c] = value_chunk_writer->num_of_pages(); } } - time_chunk_writer->write(record.timestamp_); + // Time first: a rejected timestamp (E_OUT_OF_ORDER, OOM, etc.) must + // not silently advance the value writers — that would leave the time + // chunk one row behind every value chunk for the rest of the file. + if (RET_FAIL(time_chunk_writer->write(record.timestamp_))) { + return ret; + } for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; if (IS_NULL(value_chunk_writer)) { continue; } - write_point_aligned(value_chunk_writer, record.timestamp_, - data_types[c], record.points_[c]); + if (RET_FAIL(write_point_aligned(value_chunk_writer, record.timestamp_, + data_types[c], record.points_[c]))) { + // Time wrote the row but at least one value column failed + // mid-record; the per-column row counts no longer agree. + // Mark the writer unrecoverable so flush/close refuses to + // seal a misaligned chunk group. + unrecoverable_ = true; + return ret; + } } if (RET_FAIL(maybe_seal_aligned_pages_together( time_chunk_writer, value_chunk_writers, time_pages_before, value_pages_before))) { + unrecoverable_ = true; return ret; } - device_schema->last_time_ = - std::max(device_schema->last_time_, record.timestamp_); + if (enforce_recovered_last_time_order_) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr) { + schema_it->second->last_time_ = + std::max(schema_it->second->last_time_, record.timestamp_); + } + } return ret; } @@ -815,39 +869,10 @@ int TsFileWriter::write_point(ChunkWriter* chunk_writer, int64_t timestamp, } } -int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer, - int64_t timestamp, - common::TSDataType data_type, - const DataPoint& point) { - bool isnull = point.isnull; - switch (data_type) { - case common::BOOLEAN: - return value_chunk_writer->write(timestamp, point.u_.bool_val_, - isnull); - case common::INT32: - case common::DATE: - return value_chunk_writer->write(timestamp, point.u_.i32_val_, - isnull); - case common::TIMESTAMP: - case common::INT64: - return value_chunk_writer->write(timestamp, point.u_.i64_val_, - isnull); - case common::FLOAT: - return value_chunk_writer->write(timestamp, point.u_.float_val_, - isnull); - case common::DOUBLE: - return value_chunk_writer->write(timestamp, point.u_.double_val_, - isnull); - case common::BLOB: - case common::TEXT: - case common::STRING: - return value_chunk_writer->write(timestamp, point.text_val_, - isnull); - default: - return E_INVALID_DATA_POINT; - } -} - +// After writing one record / batch to the time chunk and every value chunk, +// keep their page boundaries aligned: if any of them autosealed a page on +// memory pressure, seal the rest of the open pages too so an aligned reader +// can still pair position N across time + every value column. int TsFileWriter::maybe_seal_aligned_pages_together( TimeChunkWriter* time_chunk_writer, common::SimpleVector& value_chunk_writers, @@ -883,19 +908,52 @@ int TsFileWriter::maybe_seal_aligned_pages_together( return ret; } +int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer, + int64_t timestamp, + common::TSDataType data_type, + const DataPoint& point) { + bool isnull = point.isnull; + switch (data_type) { + case common::BOOLEAN: + return value_chunk_writer->write(timestamp, point.u_.bool_val_, + isnull); + case common::INT32: + case common::DATE: + return value_chunk_writer->write(timestamp, point.u_.i32_val_, + isnull); + case common::TIMESTAMP: + case common::INT64: + return value_chunk_writer->write(timestamp, point.u_.i64_val_, + isnull); + case common::FLOAT: + return value_chunk_writer->write(timestamp, point.u_.float_val_, + isnull); + case common::DOUBLE: + return value_chunk_writer->write(timestamp, point.u_.double_val_, + isnull); + case common::BLOB: + case common::TEXT: + case common::STRING: + return value_chunk_writer->write(timestamp, point.text_val_, + isnull); + default: + return E_INVALID_DATA_POINT; + } +} + int TsFileWriter::write_tablet_aligned(const Tablet& tablet) { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; auto device_id = std::make_shared(tablet.insert_target_name_); - auto schema_it = schemas_.find(device_id); - if (schema_it == schemas_.end() || schema_it->second == nullptr) { - return E_DEVICE_NOT_EXIST; - } - MeasurementSchemaGroup* device_schema = schema_it->second; const uint32_t total_rows = tablet.get_cur_row_size(); if (enforce_recovered_last_time_order_ && total_rows > 0 && - tablet.timestamps_[0] <= device_schema->last_time_) { - return E_OUT_OF_ORDER; + tablet.timestamps_ != nullptr) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr && + tablet.timestamps_[0] <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; + } } SimpleVector value_chunk_writers; TimeChunkWriter* time_chunk_writer = nullptr; @@ -906,247 +964,109 @@ int TsFileWriter::write_tablet_aligned(const Tablet& tablet) { data_types))) { return ret; } - const bool strict_page_size = common::g_config_value_.strict_page_size_; - - // Decide whether we have string/blob/text columns. - bool has_varlen_column = false; - for (uint32_t i = 0; i < data_types.size(); i++) { - if (data_types[i] == common::STRING || data_types[i] == common::TEXT || - data_types[i] == common::BLOB) { - has_varlen_column = true; - break; - } - } - - // Keep writers' seal-check behavior consistent across calls. - time_chunk_writer->set_enable_page_seal_if_full(strict_page_size); - for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - if (!IS_NULL(value_chunk_writers[c])) { - value_chunk_writers[c]->set_enable_page_seal_if_full( - strict_page_size); - } - } - - if (strict_page_size) { - // Strict mode: keep the original row-based insertion to ensure aligned - // pages seal together when either side becomes full. - for (uint32_t row = 0; row < total_rows; row++) { - int32_t time_pages_before = time_chunk_writer->num_of_pages(); - std::vector value_pages_before(value_chunk_writers.size(), - 0); - for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; - if (!IS_NULL(value_chunk_writer)) { - value_pages_before[c] = value_chunk_writer->num_of_pages(); - } - } - - if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) { - return ret; - } - ASSERT(value_chunk_writers.size() == tablet.get_column_count()); - for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; - if (IS_NULL(value_chunk_writer)) { - continue; - } - if (RET_FAIL(value_write_column(value_chunk_writer, tablet, c, - row, row + 1))) { - return ret; - } - } - if (RET_FAIL(maybe_seal_aligned_pages_together( - time_chunk_writer, value_chunk_writers, time_pages_before, - value_pages_before))) { - return ret; - } + ASSERT(data_types.size() == tablet.get_column_count()); + for (uint32_t c = 0; c < data_types.size(); c++) { + if (data_types[c] == common::NULL_TYPE) { + continue; } - if (total_rows > 0) { - device_schema->last_time_ = std::max( - device_schema->last_time_, tablet.timestamps_[total_rows - 1]); + if (data_types[c] != tablet.schema_vec_->at(c).data_type_) { + return E_TYPE_NOT_MATCH; } - return ret; } - - // Non-strict mode: switch to column-based insertion. - if (!has_varlen_column) { - // Optimization: when there is no string/blob/text column, we only need - // to split by point-number so that each split will trigger a page - // seal (and avoid the per-row page-size check). - const uint32_t points_per_page = - common::g_config_value_.page_writer_max_point_num_; - - // Disable auto page sealing. We will seal pages at split boundaries. - time_chunk_writer->set_enable_page_seal_if_full(false); - for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - if (!IS_NULL(value_chunk_writers[c])) { - value_chunk_writers[c]->set_enable_page_seal_if_full(false); - } - } - - // Determine how many points we need to fill the current unsealed time - // page (it may already contain data from previous tablets). - uint32_t time_cur_points = time_chunk_writer->get_point_numer(); - if (time_cur_points >= points_per_page && - time_chunk_writer->has_current_page_data()) { - // Close the already-full page together with all aligned value - // pages. - if (RET_FAIL(time_chunk_writer->seal_current_page())) { - return ret; - } - for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; - if (!IS_NULL(value_chunk_writer) && - value_chunk_writer->has_current_page_data()) { - if (RET_FAIL(value_chunk_writer->seal_current_page())) { - return ret; - } - } - } - time_cur_points = 0; - } - const uint32_t first_seg_len = - (time_cur_points > 0 && time_cur_points < points_per_page) - ? (points_per_page - time_cur_points) - : points_per_page; - - // 1) Write time in segments and seal all full segments (except the - // last remaining segment). - uint32_t seg_start = 0; - uint32_t seg_len = first_seg_len; - while (seg_start < total_rows) { - const uint32_t seg_end = std::min(seg_start + seg_len, total_rows); - if (RET_FAIL(time_write_column(time_chunk_writer, tablet, seg_start, - seg_end))) { - return ret; - } - seg_start = seg_end; - if (seg_start < total_rows) { - if (RET_FAIL(time_chunk_writer->seal_current_page())) { - return ret; - } - } - seg_len = points_per_page; - } - - // 2) Write each value column in the same segments. - ASSERT(value_chunk_writers.size() == tablet.get_column_count()); - for (uint32_t col = 0; col < value_chunk_writers.size(); col++) { - ValueChunkWriter* value_chunk_writer = value_chunk_writers[col]; - if (IS_NULL(value_chunk_writer)) { - continue; - } - - seg_start = 0; - seg_len = first_seg_len; - while (seg_start < total_rows) { - const uint32_t seg_end = - std::min(seg_start + seg_len, total_rows); - if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col, - seg_start, seg_end))) { - return ret; - } - seg_start = seg_end; - if (seg_start < total_rows) { - if (value_chunk_writer->has_current_page_data() && - RET_FAIL(value_chunk_writer->seal_current_page())) { - return ret; - } - } - seg_len = points_per_page; - } - } - if (total_rows > 0) { - device_schema->last_time_ = std::max( - device_schema->last_time_, tablet.timestamps_[total_rows - 1]); + // Snapshot page counters before the batch so we can detect any column + // that crossed a page boundary mid-tablet and seal the rest in lockstep. + int32_t time_pages_before = time_chunk_writer->num_of_pages(); + std::vector value_pages_before(value_chunk_writers.size(), 0); + for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { + ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; + if (!IS_NULL(value_chunk_writer)) { + value_pages_before[c] = value_chunk_writer->num_of_pages(); } - return ret; } - - // General non-strict (may have varlen STRING/TEXT/BLOB columns): - // time auto-seals to provide aligned page boundaries; value writers - // skip auto page sealing and are sealed manually at time boundaries. - // Attention: since value-side auto-seal is disabled, if a varlen value - // page hits the memory threshold earlier, it may not seal immediately - // and instead will be sealed later at the recorded time-page boundaries - // (this may sacrifice the strict page size limit for performance). - time_chunk_writer->set_enable_page_seal_if_full(true); + // Suppress memory-driven page sealing on every column for the duration of + // the batch. The count-driven seals inside write_batch still fire at the + // same `page_writer_max_point_num_` boundary on every writer (time + + // values), which keeps aligned page boundaries in lock-step. Re-enable + // both before returning so subsequent record-by-record writes restore the + // normal memory-pressure behavior, and let the final + // maybe_seal_aligned_pages_together pick up any count-driven divergence + // (e.g. when a sealed value column ended a page that the time column did + // not). + time_chunk_writer->set_enable_page_seal_if_full(false); for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { - if (!IS_NULL(value_chunk_writers[c])) { - value_chunk_writers[c]->set_enable_page_seal_if_full(false); + ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; + if (!IS_NULL(value_chunk_writer)) { + value_chunk_writer->set_enable_page_seal_if_full(false); } } - - std::vector time_page_row_ends; - const uint32_t page_max_points = std::max( - 1, common::g_config_value_.page_writer_max_point_num_); - time_page_row_ends.reserve(total_rows / page_max_points + 1); - - // Write time and record where a time page is sealed. - for (uint32_t row = 0; row < total_rows; row++) { - const int32_t pages_before = time_chunk_writer->num_of_pages(); - if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) { - return ret; - } - const int32_t pages_after = time_chunk_writer->num_of_pages(); - if (pages_after > pages_before) { - const uint32_t boundary_end = row + 1; - if (time_page_row_ends.empty() || - time_page_row_ends.back() != boundary_end) { - time_page_row_ends.push_back(boundary_end); + auto restore_seal = [&]() { + time_chunk_writer->set_enable_page_seal_if_full(true); + for (uint32_t k = 0; k < value_chunk_writers.size(); k++) { + if (!IS_NULL(value_chunk_writers[k])) { + value_chunk_writers[k]->set_enable_page_seal_if_full(true); } } + }; + // Any failure (out-of-order timestamps, OOM, etc.) must abort before we + // write a single value column — otherwise the time chunk would record + // fewer rows than each value chunk and the chunk-group would deserialize + // as misaligned data. + if (RET_FAIL(time_write_column_batch(time_chunk_writer, tablet, 0, + total_rows))) { + restore_seal(); + return ret; } - - // Write values column-by-column and seal at recorded boundaries. ASSERT(value_chunk_writers.size() == tablet.get_column_count()); - for (uint32_t col = 0; col < value_chunk_writers.size(); col++) { - ValueChunkWriter* value_chunk_writer = value_chunk_writers[col]; + for (uint32_t c = 0; c < value_chunk_writers.size(); c++) { + ValueChunkWriter* value_chunk_writer = value_chunk_writers[c]; if (IS_NULL(value_chunk_writer)) { continue; } - uint32_t seg_start = 0; - for (uint32_t boundary_end : time_page_row_ends) { - if (boundary_end <= seg_start) { - continue; - } - if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col, - seg_start, boundary_end))) { - return ret; - } - if (value_chunk_writer->has_current_page_data() && - RET_FAIL(value_chunk_writer->seal_current_page())) { - return ret; - } - seg_start = boundary_end; - } - if (seg_start < total_rows) { - if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col, - seg_start, total_rows))) { - return ret; - } + if (RET_FAIL(value_write_column_batch(value_chunk_writer, tablet, c, 0, + total_rows))) { + restore_seal(); + // Time chunk has the full row count but at least one value + // column stopped early. Mark the writer unrecoverable so no + // later flush/close seals the divergent state. + unrecoverable_ = true; + return ret; } } - if (total_rows > 0) { - device_schema->last_time_ = std::max( - device_schema->last_time_, tablet.timestamps_[total_rows - 1]); + restore_seal(); + if (RET_FAIL(maybe_seal_aligned_pages_together( + time_chunk_writer, value_chunk_writers, time_pages_before, + value_pages_before))) { + unrecoverable_ = true; + return ret; + } + if (enforce_recovered_last_time_order_ && total_rows > 0 && + tablet.timestamps_ != nullptr) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr) { + schema_it->second->last_time_ = + std::max(schema_it->second->last_time_, + tablet.timestamps_[total_rows - 1]); + } } return ret; } int TsFileWriter::write_tablet(const Tablet& tablet) { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; auto device_id = std::make_shared(tablet.insert_target_name_); - auto schema_it = schemas_.find(device_id); - if (schema_it == schemas_.end() || schema_it->second == nullptr) { - return E_DEVICE_NOT_EXIST; - } - MeasurementSchemaGroup* device_schema = schema_it->second; + // Use the actual filled row count — max_row_num_ is the buffer capacity + // and would let uninitialized timestamps/values past the live range leak + // into the chunk. const uint32_t total_rows = tablet.get_cur_row_size(); if (enforce_recovered_last_time_order_ && total_rows > 0 && - tablet.timestamps_[0] <= device_schema->last_time_) { - return E_OUT_OF_ORDER; + tablet.timestamps_ != nullptr) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr && + tablet.timestamps_[0] <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; + } } SimpleVector chunk_writers; SimpleVector data_types; @@ -1155,22 +1075,44 @@ int TsFileWriter::write_tablet(const Tablet& tablet) { data_types))) { return ret; } + ASSERT(data_types.size() == tablet.get_column_count()); + for (uint32_t c = 0; c < data_types.size(); c++) { + if (data_types[c] == common::NULL_TYPE) { + continue; + } + if (data_types[c] != tablet.schema_vec_->at(c).data_type_) { + return E_TYPE_NOT_MATCH; + } + } ASSERT(chunk_writers.size() == tablet.get_column_count()); + uint32_t columns_written = 0; for (uint32_t c = 0; c < chunk_writers.size(); c++) { ChunkWriter* chunk_writer = chunk_writers[c]; if (IS_NULL(chunk_writer)) { continue; } - if (RET_FAIL(write_column(chunk_writer, tablet, c))) { + if (RET_FAIL( + write_column_batch(chunk_writer, tablet, c, 0, total_rows))) { + // Earlier columns already advanced their chunk writers; this + // column failed mid-write, so per-column row counts diverge. + // Mark unrecoverable so flush/close refuse to seal the + // misaligned tree chunk group. + if (columns_written > 0) unrecoverable_ = true; return ret; } + columns_written++; } - if (total_rows > 0) { - device_schema->last_time_ = std::max( - device_schema->last_time_, tablet.timestamps_[total_rows - 1]); + if (enforce_recovered_last_time_order_ && total_rows > 0 && + tablet.timestamps_ != nullptr) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && schema_it->second != nullptr) { + schema_it->second->last_time_ = + std::max(schema_it->second->last_time_, + tablet.timestamps_[total_rows - 1]); + } } - record_count_since_last_flush_ += tablet.max_row_num_; + record_count_since_last_flush_ += total_rows; ret = check_memory_size_and_may_flush_chunks(); return ret; } @@ -1201,6 +1143,7 @@ int TsFileWriter::write_tree(const TsRecord& record) { } int TsFileWriter::write_table(Tablet& tablet) { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; if (io_writer_->get_schema()->table_schema_map_.find( tablet.insert_target_name_) == @@ -1213,175 +1156,332 @@ int TsFileWriter::write_table(Tablet& tablet) { } auto device_id_end_index_pairs = split_tablet_by_device(tablet); - int start_idx = 0; - for (auto& device_id_end_index_pair : device_id_end_index_pairs) { - auto device_id = device_id_end_index_pair.first; - int end_idx = device_id_end_index_pair.second; - if (end_idx == 0) continue; - - SimpleVector value_chunk_writers; - TimeChunkWriter* time_chunk_writer = nullptr; - if (RET_FAIL(do_check_schema_table(device_id, tablet, time_chunk_writer, - value_chunk_writers))) { - return ret; - } - auto schema_it = schemas_.find(device_id); - MeasurementSchemaGroup* device_schema = - (schema_it == schemas_.end()) ? nullptr : schema_it->second; - std::vector field_columns; - field_columns.reserve(tablet.get_column_count()); - for (uint32_t col = 0; col < tablet.get_column_count(); ++col) { - if (tablet.column_categories_[col] == - common::ColumnCategory::FIELD) { - field_columns.push_back(col); - } - } - ASSERT(field_columns.size() == value_chunk_writers.size()); - - // Precompute page boundaries from point counts — no serial write - // needed. The first segment may be shorter if the time page already - // holds data from a previous write_table call. - const uint32_t page_max_points = std::max( - 1, common::g_config_value_.page_writer_max_point_num_); - const uint32_t si = static_cast(start_idx); - const uint32_t ei = static_cast(end_idx); - if (enforce_recovered_last_time_order_ && device_schema != nullptr && - si < ei && tablet.timestamps_[si] <= device_schema->last_time_) { - return E_OUT_OF_ORDER; - } + if (table_aligned_) { + struct ValueTask { + ValueChunkWriter* vcw; + uint32_t col_idx; + }; + struct SegmentRange { + uint32_t si; + uint32_t ei; + }; + struct DeviceWriteCtx { + TimeChunkWriter* tcw; + std::vector value_tasks; + std::vector segments; + uint32_t initial_page_points; + }; - // If the current unsealed page is already at or past capacity (from - // a previous write_table call), seal it before starting new segments. - uint32_t time_cur_points = time_chunk_writer->get_point_numer(); - if (time_cur_points >= page_max_points) { - if (time_chunk_writer->has_current_page_data()) { - if (RET_FAIL(time_chunk_writer->seal_current_page())) { - return ret; + const uint32_t page_max_points = + std::max(1, g_config_value_.page_writer_max_point_num_); + + std::vector device_ctxs; + std::map, size_t, IDeviceIDComparator> + device_ctx_index; + int start_idx = 0; + for (auto& pair : device_id_end_index_pairs) { + auto device_id = pair.first; + int end_idx = pair.second; + if (end_idx == 0) continue; + + const uint32_t si = static_cast(start_idx); + const uint32_t ei = static_cast(end_idx); + // Recovery: refuse any segment whose first timestamp would land + // at or before a flushed chunk's end_time for this device. This + // mirrors the per-record / per-tablet check on the tree path. + if (enforce_recovered_last_time_order_ && tablet.timestamps_ && + ei > si) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && + schema_it->second != nullptr && + tablet.timestamps_[si] <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; } } - for (uint32_t k = 0; k < value_chunk_writers.size(); k++) { - if (!IS_NULL(value_chunk_writers[k]) && - value_chunk_writers[k]->has_current_page_data()) { - if (RET_FAIL(value_chunk_writers[k]->seal_current_page())) { - return ret; + auto idx_it = device_ctx_index.find(device_id); + if (idx_it == device_ctx_index.end()) { + SimpleVector value_chunk_writers; + TimeChunkWriter* time_chunk_writer = nullptr; + if (RET_FAIL(do_check_schema_table(device_id, tablet, + time_chunk_writer, + value_chunk_writers))) { + return ret; + } + + uint32_t time_cur_points = time_chunk_writer->get_point_numer(); + if (time_cur_points >= page_max_points) { + // Seal the time page first, then every value page in + // lockstep. Any failure leaves columns at different + // page boundaries and the chunk group can no longer be + // sealed coherently — mark the writer unrecoverable. + if (time_chunk_writer->has_current_page_data()) { + if (RET_FAIL(time_chunk_writer->seal_current_page())) { + unrecoverable_ = true; + return ret; + } + } + for (uint32_t k = 0; k < value_chunk_writers.size(); k++) { + if (!IS_NULL(value_chunk_writers[k]) && + value_chunk_writers[k]->has_current_page_data()) { + if (RET_FAIL(value_chunk_writers[k] + ->seal_current_page())) { + unrecoverable_ = true; + return ret; + } + } } + time_cur_points = 0; } - } - time_cur_points = 0; - } - const uint32_t first_seg_cap = - (time_cur_points > 0 && time_cur_points < page_max_points) - ? (page_max_points - time_cur_points) - : page_max_points; - std::vector page_boundaries; // row indices where a page - // should seal - { - uint32_t pos = si; - uint32_t seg_cap = first_seg_cap; - while (pos < ei) { - uint32_t seg_end = std::min(pos + seg_cap, ei); - if (seg_end < ei) { - page_boundaries.push_back(seg_end); + DeviceWriteCtx ctx; + ctx.tcw = time_chunk_writer; + ctx.initial_page_points = time_cur_points; + uint32_t field_col_count = 0; + for (uint32_t i = 0; i < tablet.get_column_count(); ++i) { + if (tablet.column_categories_[i] == + common::ColumnCategory::FIELD) { + ValueChunkWriter* vcw = + value_chunk_writers[field_col_count]; + if (!IS_NULL(vcw)) { + ctx.value_tasks.push_back({vcw, i}); + } + field_col_count++; + } } - pos = seg_end; - seg_cap = page_max_points; + device_ctxs.push_back(std::move(ctx)); + idx_it = device_ctx_index + .insert(std::make_pair(device_id, + device_ctxs.size() - 1)) + .first; } + + device_ctxs[idx_it->second].segments.push_back({si, ei}); + start_idx = end_idx; } - // We control page sealing explicitly at precomputed boundaries, so - // auto-seal must be disabled during segmented writes — otherwise a - // segment of exactly page_max_points would trigger auto-seal AND - // our explicit seal, double-sealing (sealing an empty page → crash). - // Note: with auto-seal off, the memory-based threshold - // (page_writer_max_memory_bytes_) is not enforced within a segment. - // For varlen columns (STRING/TEXT/BLOB), individual pages may exceed - // the memory limit. Each segment is still bounded by - // page_max_points rows, keeping pages within a reasonable size. - auto write_time_in_segments = [this, &tablet, &page_boundaries, si, - ei](TimeChunkWriter* tcw) -> int { + auto write_time_segments = + [this, &tablet, page_max_points]( + TimeChunkWriter* tcw, const std::vector& segments, + uint32_t initial_page_points) -> int { int r = E_OK; tcw->set_enable_page_seal_if_full(false); - uint32_t seg_start = si; - for (uint32_t boundary : page_boundaries) { - if ((r = time_write_column(tcw, tablet, seg_start, boundary)) != - E_OK) - return r; - if ((r = tcw->seal_current_page()) != E_OK) return r; - seg_start = boundary; - } - if (seg_start < ei) { - r = time_write_column(tcw, tablet, seg_start, ei); + uint32_t page_remaining = + (initial_page_points > 0 && + initial_page_points < page_max_points) + ? (page_max_points - initial_page_points) + : page_max_points; + for (const auto& segment : segments) { + uint32_t seg_pos = segment.si; + while (seg_pos < segment.ei) { + uint32_t batch = + std::min(page_remaining, segment.ei - seg_pos); + if ((r = time_write_column_batch( + tcw, tablet, seg_pos, seg_pos + batch)) != E_OK) { + tcw->set_enable_page_seal_if_full(true); + return r; + } + seg_pos += batch; + page_remaining -= batch; + if (page_remaining == 0) { + if ((r = tcw->seal_current_page()) != E_OK) { + tcw->set_enable_page_seal_if_full(true); + return r; + } + page_remaining = page_max_points; + } + } } tcw->set_enable_page_seal_if_full(true); return r; }; - auto write_value_in_segments = [this, &tablet, &page_boundaries, si, - ei](ValueChunkWriter* vcw, - uint32_t col_idx) -> int { + auto write_value_segments = + [this, &tablet, page_max_points]( + ValueChunkWriter* vcw, uint32_t col_idx, + const std::vector& segments, + uint32_t initial_page_points) -> int { int r = E_OK; vcw->set_enable_page_seal_if_full(false); - uint32_t seg_start = si; - for (uint32_t boundary : page_boundaries) { - if ((r = value_write_column(vcw, tablet, col_idx, seg_start, - boundary)) != E_OK) - return r; - if (vcw->has_current_page_data() && - (r = vcw->seal_current_page()) != E_OK) - return r; - seg_start = boundary; - } - if (seg_start < ei) { - r = value_write_column(vcw, tablet, col_idx, seg_start, ei); + uint32_t page_remaining = + (initial_page_points > 0 && + initial_page_points < page_max_points) + ? (page_max_points - initial_page_points) + : page_max_points; + for (const auto& segment : segments) { + uint32_t seg_pos = segment.si; + while (seg_pos < segment.ei) { + uint32_t batch = + std::min(page_remaining, segment.ei - seg_pos); + if ((r = value_write_column_batch( + vcw, tablet, col_idx, seg_pos, seg_pos + batch)) != + E_OK) { + vcw->set_enable_page_seal_if_full(true); + return r; + } + seg_pos += batch; + page_remaining -= batch; + if (page_remaining == 0) { + if (vcw->has_current_page_data() && + (r = vcw->seal_current_page()) != E_OK) { + vcw->set_enable_page_seal_if_full(true); + return r; + } + page_remaining = page_max_points; + } + } } vcw->set_enable_page_seal_if_full(true); return r; }; - // All columns (time + values) write the same row segments and seal - // at the same boundaries — fully parallel. #ifdef ENABLE_THREADS - if (g_config_value_.parallel_write_enabled_) { + if (g_config_value_.parallel_write_enabled_ && + common::g_thread_pool_ != nullptr) { std::vector> futures; - futures.push_back(g_write_thread_pool_->submit( - [&write_time_in_segments, time_chunk_writer]() { - return write_time_in_segments(time_chunk_writer); - })); - for (uint32_t k = 0; k < value_chunk_writers.size(); k++) { - ValueChunkWriter* vcw = value_chunk_writers[k]; - if (IS_NULL(vcw)) continue; - uint32_t col_idx = field_columns[k]; - futures.push_back(g_write_thread_pool_->submit( - [&write_value_in_segments, vcw, col_idx]() { - return write_value_in_segments(vcw, col_idx); + for (auto& ctx : device_ctxs) { + futures.push_back(common::g_thread_pool_->submit( + [&write_time_segments, &ctx]() { + return write_time_segments(ctx.tcw, ctx.segments, + ctx.initial_page_points); })); + for (auto& vt : ctx.value_tasks) { + futures.push_back(common::g_thread_pool_->submit( + [&write_value_segments, &vt, &ctx]() { + return write_value_segments( + vt.vcw, vt.col_idx, ctx.segments, + ctx.initial_page_points); + })); + } } for (auto& f : futures) { int r = f.get(); if (r != E_OK && ret == E_OK) ret = r; } - if (ret != E_OK) return ret; + if (ret != E_OK) { + // One task aborted mid-batch while others may have written + // all of their rows; the per-column row counts no longer + // line up. Mark the writer unrecoverable so flush/close + // can't seal a corrupt aligned chunk group. + unrecoverable_ = true; + return ret; + } } else #endif { - if (RET_FAIL(write_time_in_segments(time_chunk_writer))) { + for (auto& ctx : device_ctxs) { + if (RET_FAIL(write_time_segments(ctx.tcw, ctx.segments, + ctx.initial_page_points))) { + // Time wrote partial rows before failing; value columns + // still hold the prior count. Same column-alignment + // hazard as the parallel path. + unrecoverable_ = true; + return ret; + } + for (auto& vt : ctx.value_tasks) { + if (RET_FAIL(write_value_segments( + vt.vcw, vt.col_idx, ctx.segments, + ctx.initial_page_points))) { + unrecoverable_ = true; + return ret; + } + } + } + } + } else { + int start_idx = 0; + for (auto& device_id_end_index_pair : device_id_end_index_pairs) { + auto device_id = device_id_end_index_pair.first; + int end_idx = device_id_end_index_pair.second; + if (end_idx == 0) continue; + + const uint32_t si = static_cast(start_idx); + if (enforce_recovered_last_time_order_ && tablet.timestamps_ && + end_idx > start_idx) { + auto schema_it = schemas_.find(device_id); + if (schema_it != schemas_.end() && + schema_it->second != nullptr && + tablet.timestamps_[si] <= schema_it->second->last_time_) { + return E_OUT_OF_ORDER; + } + } + MeasurementNamesFromTablet mnames_getter(tablet); + SimpleVector chunk_writers; + SimpleVector data_types; + if (RET_FAIL(do_check_schema(device_id, mnames_getter, + chunk_writers, data_types))) { return ret; } - for (uint32_t k = 0; k < value_chunk_writers.size(); k++) { - ValueChunkWriter* vcw = value_chunk_writers[k]; - if (IS_NULL(vcw)) continue; - if (RET_FAIL(write_value_in_segments(vcw, field_columns[k]))) { + ASSERT(chunk_writers.size() == tablet.get_column_count()); + +#ifdef ENABLE_THREADS + if (chunk_writers.size() >= 2 && + g_config_value_.parallel_write_enabled_ && + common::g_thread_pool_ != nullptr) { + const uint32_t si = start_idx; + const uint32_t ei = device_id_end_index_pair.second; + std::vector> futures; + for (uint32_t c = 0; c < chunk_writers.size(); c++) { + ChunkWriter* cw = chunk_writers[c]; + if (IS_NULL(cw)) continue; + futures.push_back(common::g_thread_pool_->submit( + [this, cw, &tablet, c, si, ei]() { + return write_column_batch(cw, tablet, c, si, ei); + })); + } + for (auto& f : futures) { + int r = f.get(); + if (r != E_OK && ret == E_OK) ret = r; + } + if (ret != E_OK) { + // One column aborted partway while sibling columns + // may have written all of their rows. The per-column + // chunk writers now disagree on row count, so subsequent + // flush/close would seal a corrupt non-aligned chunk + // group. Same hazard as the aligned parallel path — + // mark the writer unrecoverable so future ops refuse. + unrecoverable_ = true; return ret; } + } else +#endif + { + for (uint32_t c = 0; c < chunk_writers.size(); c++) { + ChunkWriter* chunk_writer = chunk_writers[c]; + if (IS_NULL(chunk_writer)) continue; + if (RET_FAIL(write_column_batch( + chunk_writer, tablet, c, start_idx, + device_id_end_index_pair.second))) { + // Sequential path: earlier columns already wrote + // their batch, this column failed → divergent row + // counts. Same unrecoverable contract. + if (c > 0) unrecoverable_ = true; + return ret; + } + } } + start_idx = device_id_end_index_pair.second; } - if (device_schema != nullptr && si < ei) { - device_schema->last_time_ = - std::max(device_schema->last_time_, tablet.timestamps_[ei - 1]); + } + // After all device segments wrote successfully, advance recovery's + // per-device last_time_ floor to the highest timestamp this tablet + // contributed for each device. + if (enforce_recovered_last_time_order_ && tablet.timestamps_) { + int update_start = 0; + for (auto& pair : device_id_end_index_pairs) { + int end_idx = pair.second; + if (end_idx == 0) continue; + if (end_idx > update_start) { + auto schema_it = schemas_.find(pair.first); + if (schema_it != schemas_.end() && + schema_it->second != nullptr) { + schema_it->second->last_time_ = + std::max(schema_it->second->last_time_, + tablet.timestamps_[end_idx - 1]); + } + } + update_start = end_idx; } - start_idx = end_idx; } record_count_since_last_flush_ += tablet.cur_row_size_; // Reset string column buffers so the tablet can be reused for the next @@ -1395,14 +1495,13 @@ std::vector, int>> TsFileWriter::split_tablet_by_device(const Tablet& tablet) { std::vector, int>> result; - if (tablet.id_column_indexes_.empty()) { + if (tablet.id_column_indexes_.empty() || tablet.single_device_) { + // No tag columns or caller guarantees single device — skip boundary + // detection entirely. auto sentinel = std::make_shared("last_device_id"); result.emplace_back(std::move(sentinel), 0); - std::vector id_array; - id_array.push_back(new std::string(tablet.insert_target_name_)); - auto res = std::make_shared(id_array); - delete id_array[0]; - result.emplace_back(std::move(res), tablet.get_cur_row_size()); + std::shared_ptr dev_id(tablet.get_device_id(0)); + result.emplace_back(std::move(dev_id), tablet.get_cur_row_size()); return result; } @@ -1428,41 +1527,49 @@ TsFileWriter::split_tablet_by_device(const Tablet& tablet) { int TsFileWriter::write_column(ChunkWriter* chunk_writer, const Tablet& tablet, int col_idx, uint32_t start_idx, uint32_t end_idx) { - int ret = E_OK; - common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_; int64_t* timestamps = tablet.timestamps_; Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx]; BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx]; end_idx = std::min(end_idx, tablet.max_row_num_); - if (data_type == common::BOOLEAN) { - ret = write_typed_column(chunk_writer, timestamps, col_values.bool_data, - col_notnull_bitmap, start_idx, end_idx); - } else if (data_type == common::INT32) { - ret = - write_typed_column(chunk_writer, timestamps, col_values.int32_data, - col_notnull_bitmap, start_idx, end_idx); - } else if (data_type == common::INT64) { - ret = - write_typed_column(chunk_writer, timestamps, col_values.int64_data, - col_notnull_bitmap, start_idx, end_idx); - } else if (data_type == common::FLOAT) { - ret = - write_typed_column(chunk_writer, timestamps, col_values.float_data, - col_notnull_bitmap, start_idx, end_idx); - } else if (data_type == common::DOUBLE) { - ret = - write_typed_column(chunk_writer, timestamps, col_values.double_data, - col_notnull_bitmap, start_idx, end_idx); - } else if (data_type == common::STRING) { - ret = - write_typed_column(chunk_writer, timestamps, col_values.string_col, - col_notnull_bitmap, start_idx, end_idx); - } else { - ASSERT(false); + // Cover every storage type (DATE->int32, TIMESTAMP->int64, TEXT/BLOB-> + // string). This is the null fallback for the non-aligned batch path, so a + // column of any type that contains a null lands here; the old if/else only + // handled 6 types and ASSERT(false)'d (silently no-op in NDEBUG) on + // DATE/TIMESTAMP/TEXT/BLOB, dropping those rows. + switch (data_type) { + case common::BOOLEAN: + return write_typed_column(chunk_writer, timestamps, + col_values.bool_data, col_notnull_bitmap, + start_idx, end_idx); + case common::INT32: + case common::DATE: + return write_typed_column(chunk_writer, timestamps, + col_values.int32_data, col_notnull_bitmap, + start_idx, end_idx); + case common::INT64: + case common::TIMESTAMP: + return write_typed_column(chunk_writer, timestamps, + col_values.int64_data, col_notnull_bitmap, + start_idx, end_idx); + case common::FLOAT: + return write_typed_column(chunk_writer, timestamps, + col_values.float_data, col_notnull_bitmap, + start_idx, end_idx); + case common::DOUBLE: + return write_typed_column(chunk_writer, timestamps, + col_values.double_data, + col_notnull_bitmap, start_idx, end_idx); + case common::STRING: + case common::TEXT: + case common::BLOB: + return write_typed_column(chunk_writer, timestamps, + col_values.string_col, col_notnull_bitmap, + start_idx, end_idx); + default: + return E_NOT_SUPPORT; } - return ret; } int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer, @@ -1481,124 +1588,25 @@ int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer, return ret; } -int TsFileWriter::value_write_column(ValueChunkWriter* value_chunk_writer, - const Tablet& tablet, int col_idx, +// Non-aligned numeric column: a null row contributes no point, so null rows +// are skipped. Covers bool/int32/int64/float/double; instantiated only from +// write_column in this translation unit. +template +int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, + int64_t* timestamps, T* col_values, + BitMap& col_notnull_bitmap, uint32_t start_idx, uint32_t end_idx) { int ret = E_OK; - - TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_; - int64_t* timestamps = tablet.timestamps_; - Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx]; - BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx]; - switch (data_type) { - case common::BOOLEAN: - ret = write_typed_column(value_chunk_writer, timestamps, - (bool*)col_values.bool_data, - col_notnull_bitmap, start_idx, end_idx); - break; - case common::DATE: - case common::INT32: - ret = write_typed_column(value_chunk_writer, timestamps, - (int32_t*)col_values.int32_data, - col_notnull_bitmap, start_idx, end_idx); - break; - case common::TIMESTAMP: - case common::INT64: - ret = write_typed_column(value_chunk_writer, timestamps, - (int64_t*)col_values.int64_data, - col_notnull_bitmap, start_idx, end_idx); - break; - case common::FLOAT: - ret = write_typed_column(value_chunk_writer, timestamps, - (float*)col_values.float_data, - col_notnull_bitmap, start_idx, end_idx); - break; - case common::DOUBLE: - ret = write_typed_column(value_chunk_writer, timestamps, - (double*)col_values.double_data, - col_notnull_bitmap, start_idx, end_idx); - break; - case common::STRING: - case common::TEXT: - case common::BLOB: - ret = write_typed_column(value_chunk_writer, timestamps, - col_values.string_col, col_notnull_bitmap, - start_idx, end_idx); - break; - default: - ret = E_NOT_SUPPORT; + for (uint32_t r = start_idx; r < end_idx; r++) { + if (LIKELY(!col_notnull_bitmap.test(r))) { + if (RET_FAIL(chunk_writer->write(timestamps[r], col_values[r]))) { + return ret; + } + } } return ret; } -#define DO_WRITE_TYPED_COLUMN() \ - do { \ - int ret = E_OK; \ - for (uint32_t r = start_idx; r < end_idx; r++) { \ - if (LIKELY(!col_notnull_bitmap.test(r))) { \ - if (RET_FAIL( \ - chunk_writer->write(timestamps[r], col_values[r]))) { \ - return ret; \ - } \ - } \ - } \ - return ret; \ - } while (false) - -#define DO_VALUE_WRITE_TYPED_COLUMN() \ - do { \ - int ret = E_OK; \ - for (uint32_t r = start_idx; r < end_idx; r++) { \ - if (LIKELY(col_notnull_bitmap.test(r))) { \ - if (RET_FAIL(value_chunk_writer->write( \ - timestamps[r], col_values[r], true))) { \ - return ret; \ - } \ - } else { \ - if (RET_FAIL(value_chunk_writer->write( \ - timestamps[r], col_values[r], false))) { \ - return ret; \ - } \ - } \ - } \ - return ret; \ - } while (false) - -int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, - int64_t* timestamps, bool* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, - int64_t* timestamps, int32_t* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, - int64_t* timestamps, int64_t* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, - int64_t* timestamps, float* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, - int64_t* timestamps, double* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_WRITE_TYPED_COLUMN(); -} - int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, int64_t* timestamps, Tablet::StringColumn* string_col, @@ -1609,8 +1617,7 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, if (LIKELY(!col_notnull_bitmap.test(r))) { common::String val( string_col->buffer + string_col->offsets[r], - static_cast(string_col->offsets[r + 1] - - string_col->offsets[r])); + string_col->offsets[r + 1] - string_col->offsets[r]); if (RET_FAIL(chunk_writer->write(timestamps[r], val))) { return ret; } @@ -1619,67 +1626,161 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer, return ret; } -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, bool* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_VALUE_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, int32_t* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_VALUE_WRITE_TYPED_COLUMN(); -} - -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, int64_t* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_VALUE_WRITE_TYPED_COLUMN(); +int TsFileWriter::time_write_column_batch(TimeChunkWriter* time_chunk_writer, + const Tablet& tablet, + uint32_t start_idx, + uint32_t end_idx) { + int64_t* timestamps = tablet.timestamps_; + int ret = E_OK; + if (IS_NULL(time_chunk_writer) || IS_NULL(timestamps)) { + return E_INVALID_ARG; + } + end_idx = std::min(end_idx, tablet.max_row_num_); + uint32_t count = end_idx - start_idx; + if (count == 0) return ret; + return time_chunk_writer->write_batch(timestamps + start_idx, count); } -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, float* col_values, - BitMap& col_notnull_bitmap, +int TsFileWriter::write_column_batch(ChunkWriter* chunk_writer, + const Tablet& tablet, int col_idx, uint32_t start_idx, uint32_t end_idx) { - DO_VALUE_WRITE_TYPED_COLUMN(); -} + int ret = E_OK; + common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_; + int64_t* timestamps = tablet.timestamps_; + Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx]; + BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx]; + end_idx = std::min(end_idx, tablet.max_row_num_); + uint32_t count = end_idx - start_idx; + if (count == 0) return ret; + + bool has_null = false; + if (col_notnull_bitmap.may_have_set_bits()) { + for (uint32_t r = start_idx; r < end_idx; r++) { + if (col_notnull_bitmap.test(r)) { + has_null = true; + break; + } + } + } -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, double* col_values, - BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { - DO_VALUE_WRITE_TYPED_COLUMN(); + if (!has_null) { + switch (data_type) { + case common::BOOLEAN: + ret = chunk_writer->write_batch( + timestamps + start_idx, col_values.bool_data + start_idx, + count); + break; + case common::INT32: + case common::DATE: + ret = chunk_writer->write_batch( + timestamps + start_idx, col_values.int32_data + start_idx, + count); + break; + case common::INT64: + case common::TIMESTAMP: + ret = chunk_writer->write_batch( + timestamps + start_idx, col_values.int64_data + start_idx, + count); + break; + case common::FLOAT: + ret = chunk_writer->write_batch( + timestamps + start_idx, col_values.float_data + start_idx, + count); + break; + case common::DOUBLE: + ret = chunk_writer->write_batch( + timestamps + start_idx, col_values.double_data + start_idx, + count); + break; + case common::STRING: + case common::TEXT: + case common::BLOB: { + auto* sc = col_values.string_col; + // sc->offsets is int32_t* (Arrow Utf8/Binary spec); + // write_string_batch still takes const uint32_t* through the + // page/encoder stack. Offsets are non-negative by + // construction so the bit pattern is identical — cast at the + // boundary until the downstream chain is converted in a + // follow-up. + ret = chunk_writer->write_string_batch( + timestamps + start_idx, sc->buffer, + reinterpret_cast(sc->offsets), start_idx, + count); + break; + } + default: + ret = write_column(chunk_writer, tablet, col_idx, start_idx, + end_idx); + break; + } + } else { + ret = write_column(chunk_writer, tablet, col_idx, start_idx, end_idx); + } + return ret; } -int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, - Tablet::StringColumn* string_col, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx) { +int TsFileWriter::value_write_column_batch(ValueChunkWriter* value_chunk_writer, + const Tablet& tablet, int col_idx, + uint32_t start_idx, + uint32_t end_idx) { int ret = E_OK; - for (uint32_t r = start_idx; r < end_idx; r++) { - common::String val(string_col->buffer + string_col->offsets[r], - static_cast(string_col->offsets[r + 1] - - string_col->offsets[r])); - if (LIKELY(col_notnull_bitmap.test(r))) { - if (RET_FAIL(value_chunk_writer->write(timestamps[r], val, true))) { - return ret; - } - } else { - if (RET_FAIL( - value_chunk_writer->write(timestamps[r], val, false))) { - return ret; - } + common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_; + int64_t* timestamps = tablet.timestamps_; + Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx]; + BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx]; + end_idx = std::min(end_idx, tablet.max_row_num_); + uint32_t count = end_idx - start_idx; + if (count == 0) return ret; + + switch (data_type) { + case common::BOOLEAN: + ret = value_chunk_writer->write_batch( + timestamps, col_values.bool_data, col_notnull_bitmap, start_idx, + count); + break; + case common::DATE: + case common::INT32: + ret = value_chunk_writer->write_batch( + timestamps, col_values.int32_data, col_notnull_bitmap, + start_idx, count); + break; + case common::TIMESTAMP: + case common::INT64: + ret = value_chunk_writer->write_batch( + timestamps, col_values.int64_data, col_notnull_bitmap, + start_idx, count); + break; + case common::FLOAT: + ret = value_chunk_writer->write_batch( + timestamps, col_values.float_data, col_notnull_bitmap, + start_idx, count); + break; + case common::DOUBLE: + ret = value_chunk_writer->write_batch( + timestamps, col_values.double_data, col_notnull_bitmap, + start_idx, count); + break; + case common::STRING: + case common::TEXT: + case common::BLOB: { + auto* sc = col_values.string_col; + // See above: sc->offsets is int32_t*, downstream still uint32_t*. + ret = value_chunk_writer->write_string_batch( + timestamps, sc->buffer, + reinterpret_cast(sc->offsets), + col_notnull_bitmap, start_idx, count); + break; } + default: + ret = E_NOT_SUPPORT; + break; } return ret; } // TODO make sure ret is meaningful to SDK user int TsFileWriter::flush() { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; int ret = E_OK; if (!start_file_done_) { if (RET_FAIL(io_writer_->start_file())) { @@ -1690,9 +1791,10 @@ int TsFileWriter::flush() { /* since @schemas_ used std::map which is rbtree underlying, so map itself is ordered by device name. */ + DeviceSchemasMapIter device_iter; for (device_iter = schemas_.begin(); device_iter != schemas_.end(); - device_iter++) { // cppcheck-suppress postfixOperator + device_iter++) { if (check_chunk_group_empty(device_iter->second, device_iter->second->is_aligned_)) { continue; @@ -1706,6 +1808,7 @@ int TsFileWriter::flush() { } else if (RET_FAIL(io_writer_->end_flush_chunk_group(is_aligned))) { } } + record_count_since_last_flush_ = 0; return ret; } @@ -1751,6 +1854,56 @@ bool TsFileWriter::check_chunk_group_empty(MeasurementSchemaGroup* chunk_group, writer->reset(); \ } +// Write already-encoded chunk data to stream (no compression — done earlier). +#define FLUSH_CHUNK_ENCODED(writer, io_writer, name, data_type, encoding, \ + compression, num_pages) \ + if (RET_FAIL(io_writer->start_flush_chunk(writer->get_chunk_data(), name, \ + data_type, encoding, \ + compression, num_pages))) { \ + } else if (RET_FAIL(io_writer->flush_chunk(writer->get_chunk_data()))) { \ + } else if (RET_FAIL(io_writer->end_flush_chunk( \ + writer->get_chunk_statistic()))) { \ + } else { \ + writer->reset(); \ + } + +int TsFileWriter::flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group, + bool is_aligned) { + int ret = E_OK; + MeasurementSchemaMap& map = chunk_group->measurement_schema_map_; + + if (chunk_group->is_aligned_) { + TimeChunkWriter*& time_chunk_writer = chunk_group->time_chunk_writer_; + ChunkHeader chunk_header = time_chunk_writer->get_chunk_header(); + FLUSH_CHUNK_ENCODED( + time_chunk_writer, io_writer_, chunk_header.measurement_name_, + chunk_header.data_type_, chunk_header.encoding_type_, + chunk_header.compression_type_, time_chunk_writer->num_of_pages()) + } + + for (MeasurementSchemaMapIter ms_iter = map.begin(); ms_iter != map.end(); + ms_iter++) { + MeasurementSchema* m_schema = ms_iter->second; + if (!chunk_group->is_aligned_ && m_schema->chunk_writer_ != nullptr) { + ChunkWriter*& chunk_writer = m_schema->chunk_writer_; + FLUSH_CHUNK_ENCODED( + chunk_writer, io_writer_, m_schema->measurement_name_, + m_schema->data_type_, m_schema->encoding_, + m_schema->compression_type_, chunk_writer->num_of_pages()) + } else if (m_schema->value_chunk_writer_ != nullptr && + m_schema->value_chunk_writer_->hasData()) { + ValueChunkWriter*& value_chunk_writer = + m_schema->value_chunk_writer_; + FLUSH_CHUNK_ENCODED( + value_chunk_writer, io_writer_, m_schema->measurement_name_, + m_schema->data_type_, m_schema->encoding_, + m_schema->compression_type_, value_chunk_writer->num_of_pages()) + } + } + + return ret; +} + int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group, bool is_aligned) { int ret = E_OK; @@ -1774,7 +1927,8 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group, m_schema->data_type_, m_schema->encoding_, m_schema->compression_type_, chunk_writer->num_of_pages()) - } else if (m_schema->value_chunk_writer_ != nullptr) { + } else if (m_schema->value_chunk_writer_ != nullptr && + m_schema->value_chunk_writer_->hasData()) { ValueChunkWriter*& value_chunk_writer = m_schema->value_chunk_writer_; FLUSH_CHUNK(value_chunk_writer, io_writer_, @@ -1787,6 +1941,9 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group, return ret; } -int TsFileWriter::close() { return io_writer_->end_file(); } +int TsFileWriter::close() { + if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY; + return io_writer_->end_file(); +} } // end namespace storage diff --git a/cpp/src/writer/tsfile_writer.h b/cpp/src/writer/tsfile_writer.h index a2c8f2842..e0b102c97 100644 --- a/cpp/src/writer/tsfile_writer.h +++ b/cpp/src/writer/tsfile_writer.h @@ -33,7 +33,6 @@ #include "common/record.h" #include "common/schema.h" #include "common/tablet.h" -#include "utils/util_define.h" // mode_t and other platform-compat shims namespace storage { class WriteFile; @@ -46,9 +45,12 @@ namespace storage { extern int libtsfile_init(); extern void libtsfile_destroy(); -extern void set_page_max_point_count(uint32_t page_max_ponint_count); -extern void set_max_degree_of_index_node(uint32_t max_degree_of_index_node); -extern void set_strict_page_size(bool strict_page_size); +// Returns common::E_INVALID_ARG when count would freeze the chunk writers +// (i.e. less than 1); leaves the field untouched on rejection. +extern int set_page_max_point_count(uint32_t page_max_ponint_count); +// Returns common::E_INVALID_ARG when degree < 2 (which collapses the index +// tree); leaves the field untouched on rejection. +extern int set_max_degree_of_index_node(uint32_t max_degree_of_index_node); class TsFileWriter { public: @@ -98,6 +100,7 @@ class TsFileWriter { std::shared_ptr get_table_schema( const std::string& table_name) const; int64_t calculate_mem_size_for_all_group(); + int64_t calculate_meta_mem_size() const; int check_memory_size_and_may_flush_chunks(); /* * Flush buffer to disk file, but do not writer file index part. @@ -125,25 +128,15 @@ class TsFileWriter { int32_t time_pages_before, const std::vector& value_pages_before); int flush_chunk_group(MeasurementSchemaGroup* chunk_group, bool is_aligned); + int flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group, + bool is_aligned); + // Numeric columns (bool/int32/int64/float/double) share one body: + // non-aligned ChunkWriter skips null rows entirely. Defined in the .cc; + // every instantiation lives in that translation unit. + template int write_typed_column(storage::ChunkWriter* chunk_writer, - int64_t* timestamps, bool* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - int write_typed_column(storage::ChunkWriter* chunk_writer, - int64_t* timestamps, int32_t* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - int write_typed_column(storage::ChunkWriter* chunk_writer, - int64_t* timestamps, int64_t* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - int write_typed_column(storage::ChunkWriter* chunk_writer, - int64_t* timestamps, float* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - int write_typed_column(storage::ChunkWriter* chunk_writer, - int64_t* timestamps, double* col_values, + int64_t* timestamps, T* col_values, common::BitMap& col_notnull_bitmap, uint32_t start_idx, uint32_t end_idx); int write_typed_column(ChunkWriter* chunk_writer, int64_t* timestamps, @@ -196,41 +189,33 @@ class TsFileWriter { int64_t record_count_for_next_mem_check_; bool write_file_created_; bool io_writer_owned_; // false when init(RestorableTsFileIOWriter*) - bool enforce_recovered_last_time_order_; - - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, bool* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, double* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, - Tablet::StringColumn* string_col, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, float* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, int32_t* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - - int write_typed_column(ValueChunkWriter* value_chunk_writer, - int64_t* timestamps, int64_t* col_values, - common::BitMap& col_notnull_bitmap, - uint32_t start_idx, uint32_t end_idx); - - int value_write_column(ValueChunkWriter* value_chunk_writer, + // Only the recovery init path sets this true: subsequent writes must + // refuse timestamps <= the recovered per-device last_time_ so the chunk + // ordering invariants preserved by RestorableTsFileIOWriter are not + // broken by appending older data. + bool enforce_recovered_last_time_order_ = false; + bool table_aligned_ = true; + // Set once a partial-write failure leaves the per-column chunk writers + // out of sync (e.g. parallel aligned tablet write where one task fails + // mid-way while others succeed). Subsequent write/flush/close calls + // refuse to operate so that the on-disk file isn't sealed with row + // counts that disagree between time and value columns. + bool unrecoverable_ = false; + // Test-only accessor for the unrecoverable contract: real triggers + // (parallel task failure, out-of-order timestamps across multiple chunk + // writers) are hard to drive deterministically, but the contract — + // flush/close refuse — can be unit-tested directly. + friend class TsFileWriterUnrecoverableTest; + + int write_column_batch(storage::ChunkWriter* chunk_writer, const Tablet& tablet, int col_idx, uint32_t start_idx, uint32_t end_idx); + int time_write_column_batch(TimeChunkWriter* time_chunk_writer, + const Tablet& tablet, uint32_t start_idx, + uint32_t end_idx); + int value_write_column_batch(ValueChunkWriter* value_chunk_writer, + const Tablet& tablet, int col_idx, + uint32_t start_idx, uint32_t end_idx); }; } // end namespace storage diff --git a/cpp/src/writer/value_chunk_writer.cc b/cpp/src/writer/value_chunk_writer.cc index a59cf8d3f..182b0762b 100644 --- a/cpp/src/writer/value_chunk_writer.cc +++ b/cpp/src/writer/value_chunk_writer.cc @@ -110,7 +110,7 @@ int ValueChunkWriter::seal_cur_page(bool end_chunk) { /*stat*/ false, /*data*/ false); if (IS_SUCC(ret)) { save_first_page_data(value_page_writer_); - value_page_writer_.clear_page_data(); + // value_page_writer_.destroy_page_data(); value_page_writer_.reset(); } } @@ -145,6 +145,11 @@ void ValueChunkWriter::save_first_page_data( ValuePageWriter& first_page_writer) { first_page_data_ = first_page_writer.get_cur_page_data(); first_page_statistic_->deep_copy_from(first_page_writer.get_statistic()); + // Take ownership of the heap buffers: get_cur_page_data() returned a + // shallow copy, so without this we'd alias compressed_buf_ / + // uncompressed_buf_ between cur_page_data_ and first_page_data_ and + // double-free at destroy() time. + first_page_writer.release_cur_page_data(); } int ValueChunkWriter::write_first_page_data(ByteStream& pages_data, @@ -161,8 +166,7 @@ int ValueChunkWriter::write_first_page_data(ByteStream& pages_data, int ValueChunkWriter::end_encode_chunk() { int ret = E_OK; - if (value_page_writer_.get_point_numer() > 0 || - (has_current_page_data() && num_of_pages_ == 0)) { + if (has_current_page_data()) { ret = seal_cur_page(/*end_chunk*/ true); if (E_OK == ret) { chunk_header_.data_size_ = chunk_data_.total_size(); @@ -175,9 +179,6 @@ int ValueChunkWriter::end_encode_chunk() { chunk_header_.data_size_ = chunk_data_.total_size(); chunk_header_.num_of_pages_ = num_of_pages_; } - } else if (num_of_pages_ > 0) { - chunk_header_.data_size_ = chunk_data_.total_size(); - chunk_header_.num_of_pages_ = num_of_pages_; } #if DEBUG_SE std::cout << "end_encode_chunk: num_of_pages_=" << num_of_pages_ diff --git a/cpp/src/writer/value_chunk_writer.h b/cpp/src/writer/value_chunk_writer.h index 64eb4cc50..cd7c75a54 100644 --- a/cpp/src/writer/value_chunk_writer.h +++ b/cpp/src/writer/value_chunk_writer.h @@ -53,8 +53,7 @@ class ValueChunkWriter { first_page_data_(), first_page_statistic_(nullptr), chunk_header_(), - num_of_pages_(0), - enable_page_seal_if_full_(true) {} + num_of_pages_(0) {} ~ValueChunkWriter() { destroy(); } int init(const common::ColumnSchema& col_schema); int init(const std::string& measurement_name, common::TSDataType data_type, @@ -110,6 +109,71 @@ class ValueChunkWriter { VCW_DO_WRITE_FOR_TYPE(isnull); } + template + int write_batch(const int64_t* timestamps, const T* values, + const common::BitMap& col_notnull_bitmap, + uint32_t start_idx, uint32_t count) { + int ret = common::E_OK; + uint32_t offset = 0; + const uint32_t page_cap = + common::g_config_value_.page_writer_max_point_num_; + while (offset < count) { + uint32_t cur_points = value_page_writer_.get_point_numer(); + // get_point_numer() now returns size_ (rows including nulls and + // the just-written batch), so it can momentarily exceed page_cap; + // seal whenever we are at or past the cap to avoid uint32 wrap. + if (cur_points >= page_cap) { + if (RET_FAIL(seal_cur_page(false))) { + return ret; + } + cur_points = 0; + } + uint32_t page_remaining = page_cap - cur_points; + uint32_t batch_size = std::min(count - offset, page_remaining); + if (RET_FAIL(value_page_writer_.write_batch( + timestamps, values, col_notnull_bitmap, start_idx + offset, + batch_size))) { + return ret; + } + offset += batch_size; + if (RET_FAIL(seal_cur_page_if_full())) { + return ret; + } + } + return ret; + } + + int write_string_batch(const int64_t* timestamps, const char* buffer, + const uint32_t* offsets, + const common::BitMap& col_notnull_bitmap, + uint32_t start_idx, uint32_t count) { + int ret = common::E_OK; + uint32_t offset = 0; + const uint32_t page_cap = + common::g_config_value_.page_writer_max_point_num_; + while (offset < count) { + uint32_t cur_points = value_page_writer_.get_point_numer(); + if (cur_points >= page_cap) { + if (RET_FAIL(seal_cur_page(false))) { + return ret; + } + cur_points = 0; + } + uint32_t page_remaining = page_cap - cur_points; + uint32_t batch_size = std::min(count - offset, page_remaining); + if (RET_FAIL(value_page_writer_.write_string_batch( + timestamps, buffer, offsets, col_notnull_bitmap, + start_idx + offset, batch_size))) { + return ret; + } + offset += batch_size; + if (RET_FAIL(seal_cur_page_if_full())) { + return ret; + } + } + return ret; + } + int end_encode_chunk(); common::ByteStream& get_chunk_data() { return chunk_data_; } Statistic* get_chunk_statistic() { return chunk_statistic_; } @@ -119,25 +183,21 @@ class ValueChunkWriter { bool hasData(); - /** True if the current (unsealed) page has at least one write (including - * nulls). */ + /** True if the current (unsealed) page has at least one write + * (including NULLs). */ bool has_current_page_data() const { - return value_page_writer_.get_total_write_count() > 0; + return value_page_writer_.get_point_numer() > 0; } FORCE_INLINE uint32_t get_point_numer() const { return value_page_writer_.get_point_numer(); } - /** - * Force seal the current page (for aligned table model: when time page - * seals due to memory/point threshold, all value pages must seal together). - * @return E_OK on success. - */ + /** Force seal the current page. */ int seal_current_page() { return seal_cur_page(false); } - // For aligned writer: allow disabling the automatic page-size/point-number - // check so the caller can seal pages at chosen boundaries. + // Allow disabling the automatic page-size/point-number check so the + // caller can seal pages at chosen boundaries. FORCE_INLINE void set_enable_page_seal_if_full(bool enable) { enable_page_seal_if_full_ = enable; } @@ -183,8 +243,7 @@ class ValueChunkWriter { ChunkHeader chunk_header_; int32_t num_of_pages_; - // If false, write() won't auto-seal when the current page becomes full. - bool enable_page_seal_if_full_; + bool enable_page_seal_if_full_ = true; }; } // end namespace storage diff --git a/cpp/src/writer/value_page_writer.cc b/cpp/src/writer/value_page_writer.cc index a7bcd89c4..c538ea2fa 100644 --- a/cpp/src/writer/value_page_writer.cc +++ b/cpp/src/writer/value_page_writer.cc @@ -59,6 +59,10 @@ int ValuePageData::init(ByteStream& col_notnull_bitmap_bs, ByteStream& value_bs, uncompressed_buf_ + sizeof(size) + col_notnull_bitmap_buf_size_, value_buf_size_))) { + // value_buf_size_ == 0 is a fully-null value page: only the bitmap is + // written, value_out_stream_ is empty. Skip the copy — feeding an + // empty stream to copy_bs_to_buf trips ASSERT(b.len_ > 0) in the + // buffer iterator. (Restores the #734 aligned-page-seal fix.) } else { // TODO // NOTE: different compressor may have different compress API @@ -119,6 +123,8 @@ void ValuePageWriter::reset() { } col_notnull_bitmap_out_stream_.reset(); value_out_stream_.reset(); + col_notnull_bitmap_.clear(); + size_ = 0; } void ValuePageWriter::destroy() { diff --git a/cpp/src/writer/value_page_writer.h b/cpp/src/writer/value_page_writer.h index 97f8a5f0d..92c39b9b2 100644 --- a/cpp/src/writer/value_page_writer.h +++ b/cpp/src/writer/value_page_writer.h @@ -59,19 +59,6 @@ struct ValuePageData { compressor_->after_compress(compressed_buf_); compressed_buf_ = nullptr; } - compressor_ = nullptr; - } - - /** Clear pointers without freeing (transfer ownership to another holder). - */ - void clear() { - col_notnull_bitmap_buf_size_ = 0; - value_buf_size_ = 0; - uncompressed_size_ = 0; - compressed_size_ = 0; - uncompressed_buf_ = nullptr; - compressed_buf_ = nullptr; - compressor_ = nullptr; } }; @@ -163,11 +150,170 @@ class ValuePageWriter { VPW_DO_WRITE_FOR_TYPE(isnull); } - FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; } - FORCE_INLINE uint32_t get_total_write_count() const { return size_; } + // Batch write for aligned/table model. + // In the tablet bitmap: bit=1 means null, bit=0 means not null. + // In VPW_DO_WRITE_FOR_TYPE: ISNULL=true skips encoding. + // So: tablet bitmap.test(r)=true -> isnull=true (null value) + // tablet bitmap.test(r)=false -> isnull=false (valid value) + template + int write_batch(const int64_t* timestamps, const T* values, + const common::BitMap& col_notnull_bitmap, + uint32_t start_idx, uint32_t count) { + int ret = common::E_OK; + if (count == 0) return ret; + + // Count the not-null rows but defer mutating size_ / + // col_notnull_bitmap_ until the value encode finishes successfully. + // Previously the bitmap and size_ were bumped first, so a half-failed + // encode_batch left the page claiming `count` rows had been written + // when only a prefix made it into value_out_stream_ — a subsequent + // re-encode would interleave with the stale stream and produce a + // misaligned page on disk. + uint32_t valid_count = 0; + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + // bit=1 in tablet bitmap means null; bit=0 means not null + if (!const_cast(col_notnull_bitmap).test(row)) { + valid_count++; + } + } + + if (valid_count == 0) { + // Still need to advance size_ so trailing null rows are tracked. + for (uint32_t i = 0; i < count; i++) { + if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) { + col_notnull_bitmap_.push_back(0); + } + size_++; + } + return ret; + } + + // If all values are valid, we can encode the batch directly + if (valid_count == count) { + if (RET_FAIL(value_encoder_->encode_batch(values + start_idx, count, + value_out_stream_))) { + // Don't bump size_/bitmap on encode failure. + return ret; + } + statistic_->update_batch(timestamps + start_idx, values + start_idx, + count); + } else { + // Encode only non-null values one by one + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if (!const_cast(col_notnull_bitmap) + .test(row)) { + if (RET_FAIL(value_encoder_->encode(values[row], + value_out_stream_))) { + return ret; + } + statistic_->update(timestamps[row], values[row]); + } + } + } + + // Commit size_ + page bitmap now that all encoding succeeded. + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) { + col_notnull_bitmap_.push_back(0); + } + if (!const_cast(col_notnull_bitmap).test(row)) { + col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8)); + } + size_++; + } + return ret; + } + + // Batch write strings from Arrow-style offset+buffer layout with null + // bitmap. See write_batch above for the encode-before-commit rationale. + int write_string_batch(const int64_t* timestamps, const char* buffer, + const uint32_t* offsets, + const common::BitMap& col_notnull_bitmap, + uint32_t start_idx, uint32_t count) { + int ret = common::E_OK; + if (count == 0) return ret; + + // Count valid rows up-front without mutating size_ / page bitmap. + uint32_t valid_count = 0; + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if (!const_cast(col_notnull_bitmap).test(row)) { + valid_count++; + } + } + + if (valid_count == 0) { + // Advance size_ so the trailing null rows still count. + for (uint32_t i = 0; i < count; i++) { + if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) { + col_notnull_bitmap_.push_back(0); + } + size_++; + } + return ret; + } + + // Phase 2: encode non-null strings (no page-state mutation yet). + if (valid_count == count) { + // All valid — batch encode directly + if (RET_FAIL(value_encoder_->encode_string_batch( + buffer, offsets, start_idx, count, value_out_stream_))) { + return ret; + } + } else { + // Mixed — encode only non-null strings one by one + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if (!const_cast(col_notnull_bitmap) + .test(row)) { + uint32_t len = offsets[row + 1] - offsets[row]; + common::String val(buffer + offsets[row], len); + if (RET_FAIL( + value_encoder_->encode(val, value_out_stream_))) { + return ret; + } + } + } + } + + // Phase 3: update statistics for non-null rows. + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if (!const_cast(col_notnull_bitmap).test(row)) { + uint32_t len = offsets[row + 1] - offsets[row]; + common::String val(buffer + offsets[row], len); + statistic_->update(timestamps[row], val); + } + } + + // Phase 4: commit page-level state (bitmap + size_) only after the + // encoder calls all succeeded. + for (uint32_t i = 0; i < count; i++) { + uint32_t row = start_idx + i; + if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) { + col_notnull_bitmap_.push_back(0); + } + if (!const_cast(col_notnull_bitmap).test(row)) { + col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8)); + } + size_++; + } + return ret; + } + + // Rows in the current page including NULLs (NULLs advance size_ but not + // statistic_->count_). This is the count the page-seal logic uses so + // value-column page boundaries stay aligned with the time column. + FORCE_INLINE uint32_t get_point_numer() const { return size_; } FORCE_INLINE uint32_t get_col_notnull_bitmap_out_stream_size() const { return col_notnull_bitmap_out_stream_.total_size(); } + // Logical bytes written — used by the page-seal-when-full heuristic. + // Memory-pressure accounting uses estimate_max_mem_size() below, which + // counts the real 64 KiB-page footprint. FORCE_INLINE uint32_t get_page_memory_size() const { return col_notnull_bitmap_out_stream_.total_size() + value_out_stream_.total_size(); @@ -177,12 +323,16 @@ class ValuePageWriter { * outputStream and value outputStream, because size outputStream is never * used until flushing. * + * Reports the *allocated* stream footprint — see PageWriter:: + * estimate_max_mem_size for rationale. + * * @return allocated size in time, value and outputStream */ FORCE_INLINE uint32_t estimate_max_mem_size() const { return sizeof(int32_t) + 1 + - col_notnull_bitmap_out_stream_.total_size() + - value_out_stream_.total_size() + + static_cast( + col_notnull_bitmap_out_stream_.allocated_bytes() + + value_out_stream_.allocated_bytes()) + value_encoder_->get_max_byte_size(); } int write_to_chunk(common::ByteStream& pages_data, bool write_header, @@ -195,9 +345,16 @@ class ValuePageWriter { } FORCE_INLINE Statistic* get_statistic() { return statistic_; } ValuePageData get_cur_page_data() { return cur_page_data_; } + // Transfer ownership of cur_page_data_'s heap buffers (uncompressed_buf_ + // and compressed_buf_) out of this writer. Callers use this together with + // get_cur_page_data() to keep a long-lived copy of the data (e.g. as the + // first-page snapshot) without leaving an alias here that would cause a + // double free on destroy. + void release_cur_page_data() { + cur_page_data_.uncompressed_buf_ = nullptr; + cur_page_data_.compressed_buf_ = nullptr; + } void destroy_page_data() { cur_page_data_.destroy(); } - /** Clear cur_page_data_ without freeing (after ownership transferred). */ - void clear_page_data() { cur_page_data_.clear(); } private: FORCE_INLINE int prepare_end_page() { diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index 513cbd5ca..066e5accb 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -169,7 +169,7 @@ if (ENABLE_LZOKAY) endif() if (ENABLE_ZLIB) - include_directories(${CMAKE_SOURCE_DIR}/third_party/zlib-1.2.13) + include_directories(${THIRD_PARTY_INCLUDE}/zlib-1.3.1) endif() if (ENABLE_ANTLR4) @@ -186,6 +186,7 @@ file(GLOB_RECURSE TEST_SRCS "reader/*_test.cc" "writer/*_test.cc" "cwrapper/*_test.cc" + "compress/*_test.cc" ) # Parser tests depend on the ANTLR4 runtime; only build them when it is enabled. diff --git a/cpp/test/common/allocator/byte_stream_test.cc b/cpp/test/common/allocator/byte_stream_test.cc index b211803c3..3f57cbf84 100644 --- a/cpp/test/common/allocator/byte_stream_test.cc +++ b/cpp/test/common/allocator/byte_stream_test.cc @@ -87,7 +87,6 @@ TEST_F(ByteStreamTest, WriteReadLargeQuantities) { write_to_stream(&data, 1); } - // 1 MiB buffer: keep it off the stack (MSVC's default stack is only 1 MiB). static uint8_t read_buffer[1024 * 1024]; for (int i = 0; i < 1024 * 1024; i++) { uint32_t read_len = 0; @@ -186,6 +185,42 @@ TEST_F(ByteStreamTest, ReadMoreThanAvailableTest) { ASSERT_EQ(read_len, data_size); } +// Regression: the ctor used to take page_size verbatim, but hot read/write +// paths use `& (page_size-1)` as a bitmask. A non-power-of-2 page_size +// would cause page-crossing logic to misfire, corrupting written data. +// Constructing with 1000 should still round-trip cleanly across many pages. +// Regression: round_up_pow2 used `while (ps < n) ps <<= 1`, which overflows +// to 0 once ps passes 2^31 and never matches, looping forever. Verify the +// clamped helper returns the largest representable power of two instead. +TEST(ByteStreamCtorTest, RoundUpPow2ClampsHugeInput) { + EXPECT_EQ(round_up_pow2(0u), 1u); + EXPECT_EQ(round_up_pow2(1u), 1u); + EXPECT_EQ(round_up_pow2(1000u), 1024u); + EXPECT_EQ(round_up_pow2(1024u), 1024u); + EXPECT_EQ(round_up_pow2(0x80000000u), 0x80000000u); + EXPECT_EQ(round_up_pow2(0x80000001u), 0x80000000u); + EXPECT_EQ(round_up_pow2(0xFFFFFFFFu), 0x80000000u); +} + +TEST(ByteStreamCtorTest, NonPowerOfTwoPageSizeRoundTrip) { + ByteStream bs(1000, MOD_DEFAULT, false); + // Span ~5 pages: 1024 * 5 = 5120 bytes. + const uint32_t N = 5120; + std::vector data(N); + for (uint32_t i = 0; i < N; i++) { + data[i] = static_cast((i * 31 + 7) & 0xff); + } + ASSERT_EQ(bs.write_buf(data.data(), N), common::E_OK); + + std::vector out(N, 0); + uint32_t read_len = 0; + ASSERT_EQ(bs.read_buf(out.data(), N, read_len), common::E_OK); + ASSERT_EQ(read_len, N); + for (uint32_t i = 0; i < N; i++) { + ASSERT_EQ(out[i], data[i]) << "mismatch at idx " << i; + } +} + TEST_F(ByteStreamTest, WrapAndClearTest) { const char externalBuffer[] = "Hello, World!"; const int32_t bufferSize = sizeof(externalBuffer); @@ -316,4 +351,70 @@ TEST_F(SerializationUtilTest, WriteReadIntLEPaddedBitWidthBoundaryValue) { } } -} // namespace common \ No newline at end of file +// Regression: total_size_ was widened to uint64_t but the read-cursor APIs +// stayed uint32_t. A stream that legitimately reaches >4 GiB would have +// remaining_size() / read_pos() / set_read_pos() truncating to the low 32 +// bits and silently mis-positioning later reads. Lock the widened type at +// compile time so a partial revert can't reintroduce truncation, and +// round-trip a moderate value via the API to catch arithmetic mistakes. +TEST(ByteStreamWidthTest, ReadCursorApisAre64Bit) { + ByteStream s(64, common::MOD_DEFAULT); + static_assert(sizeof(decltype(s.read_pos())) >= sizeof(uint64_t), + "ByteStream::read_pos() must return a 64-bit type"); + static_assert(sizeof(decltype(s.remaining_size())) >= sizeof(uint64_t), + "ByteStream::remaining_size() must return a 64-bit type"); + static_assert(sizeof(decltype(s.get_mark_len())) >= sizeof(uint64_t), + "ByteStream::get_mark_len() must return a 64-bit type"); + + // Round-trip a position via set_read_pos / read_pos on a small wrapped + // buffer. Combined with the static_asserts above this guards the path + // arithmetic: a partial revert that kept the signature 64-bit but + // truncated read_pos_ to uint32_t internally would fail set_read_pos → + // read_pos on values near a 32-bit boundary. + constexpr int32_t kLen = 256; + std::vector backing(kLen, 0); + ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from(backing.data(), kLen); + wrapped.set_read_pos(static_cast(kLen - 7)); + EXPECT_EQ(wrapped.read_pos(), static_cast(kLen - 7)); + EXPECT_EQ(wrapped.remaining_size(), 7u); +} + +// Regression for the 64 KiB page memory-pressure account: ByteStream pages +// are allocated up to OUT_STREAM_PAGE_SIZE bytes even when only a handful of +// bytes have been written, so a chunk-group with many sparse measurements +// can pin tens of megabytes that total_size() can't see. allocated_bytes() +// must reflect the real allocated footprint. +TEST(ByteStreamAllocatedBytesTest, ReportsPageAllocationsNotLogicalSize) { + constexpr uint32_t kPageSize = 4096; + ByteStream s(kPageSize, common::MOD_DEFAULT); + EXPECT_EQ(s.allocated_bytes(), 0u); + + // First write triggers one page allocation; logical size is 4 bytes but + // the real footprint should be the rounded page size. + uint8_t payload[4] = {1, 2, 3, 4}; + ASSERT_EQ(s.write_buf(payload, 4), common::E_OK); + EXPECT_EQ(s.total_size(), 4u); + EXPECT_GE(s.allocated_bytes(), kPageSize); + EXPECT_EQ(s.allocated_bytes() % kPageSize, 0u); +} + +// Regression for finding 21 (MSVC reinterpret_cast*> UB): the +// OptionalAtomic storage is now a real std::atomic, so atomic ops never +// observe a non-atomic backing object. Lock the storage type at compile +// time so a future refactor can't reintroduce the bare T fallback. +TEST(OptionalAtomicStorageTest, BackingStorageIsRealAtomic) { + OptionalAtomic oa(0, /*enable_atomic=*/true); + static_assert(!std::is_copy_constructible>::value, + "OptionalAtomic must not be copyable — the std::atomic " + "storage forces explicit load/store"); + EXPECT_EQ(oa.load(), 0u); + oa.store(42); + EXPECT_EQ(oa.load(), 42u); + EXPECT_EQ(oa.atomic_aaf(8), 50u); + EXPECT_EQ(oa.load(), 50u); + EXPECT_EQ(oa.atomic_faa(1), 50u); + EXPECT_EQ(oa.load(), 51u); +} + +} // namespace common diff --git a/cpp/test/common/tablet_test.cc b/cpp/test/common/tablet_test.cc index 71863f0c7..11dfa485f 100644 --- a/cpp/test/common/tablet_test.cc +++ b/cpp/test/common/tablet_test.cc @@ -46,6 +46,144 @@ TEST(TabletTest, BasicFunctionality) { EXPECT_EQ(tablet.add_value(1, 1, true), common::E_OK); } +// Regression: reset() must restore each column's bitmap to all-null. If the +// previous batch left some cells with non-null bits cleared and the next batch +// does not re-fill those cells, get_value() must report them as null so the +// writer does not emit stale leftover values. +TEST(TabletTest, ResetClearsBitmap) { + std::vector schema_vec; + schema_vec.push_back(MeasurementSchema( + "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + schema_vec.push_back(MeasurementSchema( + "m_double", common::TSDataType::DOUBLE, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + Tablet tablet("dev", + std::make_shared>(schema_vec)); + + // First batch fills row 5 in both columns. + ASSERT_EQ(tablet.add_value(5u, 0u, static_cast(42)), common::E_OK); + ASSERT_EQ(tablet.add_value(5u, 1u, 3.14), common::E_OK); + + common::TSDataType ty; + EXPECT_NE(tablet.get_value(5, 0u, ty), nullptr); + EXPECT_NE(tablet.get_value(5, 1u, ty), nullptr); + + // Reuse the tablet: reset and write a fresh, smaller batch that does not + // touch row 5 at all. Row 5 must come back as null, not as the stale 42. + tablet.reset(); + ASSERT_EQ(tablet.add_value(0u, 0u, static_cast(7)), common::E_OK); + EXPECT_NE(tablet.get_value(0, 0u, ty), nullptr); + EXPECT_EQ(tablet.get_value(5, 0u, ty), nullptr); + EXPECT_EQ(tablet.get_value(5, 1u, ty), nullptr); +} + +// Regression: set_column_values() with a non-null bitmap must update +// has_set_bits_, otherwise downstream may_have_set_bits() shortcuts treat the +// column as having no nulls and the writer emits stale/garbage values for the +// rows the bitmap was meant to mark null. +TEST(TabletTest, SetColumnValuesBitmapPreservesNullFlag) { + std::vector schema_vec; + schema_vec.push_back(MeasurementSchema( + "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + Tablet tablet("dev", + std::make_shared>(schema_vec)); + + int32_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + + // Step 1: write all 8 rows with no nulls -> clear_all() inside the tablet + // sets has_set_bits_=false, matching the state a real workload leaves + // behind for a fully-populated column. + ASSERT_EQ(tablet.set_column_values(0u, buf, /*bitmap=*/nullptr, 8u), + common::E_OK); + + // Step 2: rewrite with a bitmap that marks rows 0 and 7 as NULL. Tablet's + // BitMap layout is LSB-first within each byte (row i -> bit 1<<(i%8)). + uint8_t external_bitmap[] = {0x81}; // bit 0 (row 0) + bit 7 (row 7) set + ASSERT_EQ(tablet.set_column_values(0u, buf, external_bitmap, 8u), + common::E_OK); + + common::TSDataType ty; + EXPECT_EQ(tablet.get_value(0, 0u, ty), nullptr); + EXPECT_NE(tablet.get_value(1, 0u, ty), nullptr); + EXPECT_EQ(tablet.get_value(7, 0u, ty), nullptr); +} + +// Regression: set_column_string_values / set_column_string_repeated used to +// reinterpret value_matrix_[c].string_col without checking the schema type. +// Calling them on a numeric column would corrupt that column's numeric +// buffer. Verify both reject non-string columns with E_TYPE_NOT_MATCH. +TEST(TabletTest, StringApisRejectNonStringColumn) { + std::vector schema_vec; + schema_vec.push_back(MeasurementSchema( + "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + Tablet tablet("dev", + std::make_shared>(schema_vec)); + + const char data[] = "hello"; + int32_t offsets[2] = {0, 5}; + EXPECT_EQ(tablet.set_column_string_values(0u, offsets, data, nullptr, 1u), + common::E_TYPE_NOT_MATCH); + EXPECT_EQ(tablet.set_column_string_repeated(0u, "x", 1u, 4u), + common::E_TYPE_NOT_MATCH); +} + +// Regression: str_len * count used to be computed in uint32_t and would wrap +// silently, leaving the loop to write past the truncated allocation. +// 65536 * 65537 = 4295032832 → wraps to 65536 in uint32_t. +TEST(TabletTest, StringRepeatedTotalBytesOverflowRejected) { + std::vector schema_vec; + schema_vec.push_back(MeasurementSchema( + "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + Tablet tablet("dev", + std::make_shared>(schema_vec), + 100000u); + std::string big_str(65536, 'a'); + EXPECT_EQ(tablet.set_column_string_repeated(0u, big_str.c_str(), + /*str_len=*/65536u, + /*count=*/65537u), + common::E_OVERFLOW); +} + +// Regression: set_column_string_values only checked offsets[count] before; +// non-monotonic / negative / non-zero-start offsets would underflow the +// downstream `offsets[i+1] - offsets[i]` length calc and trigger wild +// memcpy. Verify each malformed input is rejected with E_INVALID_ARG. +TEST(TabletTest, StringValuesRejectsMalformedOffsets) { + std::vector schema_vec; + schema_vec.push_back(MeasurementSchema( + "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + Tablet tablet("dev", + std::make_shared>(schema_vec)); + const char data[] = "abcdefghij"; + + // Non-zero start offset. + int32_t off_bad_start[3] = {1, 5, 10}; + EXPECT_EQ( + tablet.set_column_string_values(0u, off_bad_start, data, nullptr, 2u), + common::E_INVALID_ARG); + + // Non-monotonic: {0, 10, 5}. + int32_t off_non_mono[3] = {0, 10, 5}; + EXPECT_EQ( + tablet.set_column_string_values(0u, off_non_mono, data, nullptr, 2u), + common::E_INVALID_ARG); + + // Negative offset somewhere in the middle. + int32_t off_neg[3] = {0, -1, 5}; + EXPECT_EQ(tablet.set_column_string_values(0u, off_neg, data, nullptr, 2u), + common::E_INVALID_ARG); + + // Sanity: well-formed offsets succeed. + int32_t off_ok[3] = {0, 3, 7}; + EXPECT_EQ(tablet.set_column_string_values(0u, off_ok, data, nullptr, 2u), + common::E_OK); +} + TEST(TabletTest, LargeQuantities) { std::string device_name = "test_device"; std::vector schema_vec; diff --git a/cpp/test/common/thread_pool_test.cc b/cpp/test/common/thread_pool_test.cc new file mode 100644 index 000000000..1fe7465cf --- /dev/null +++ b/cpp/test/common/thread_pool_test.cc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifdef ENABLE_THREADS + +#include "common/thread_pool.h" + +#include + +#include +#include +#include +#include + +// Regression: a zero-sized ThreadPool used to silently accept submit() but +// block wait_all() forever (no worker thread, so active_ never reaches 0). +// init_common() clamps thread_count_ to >= 1 before building the global pool, +// but the ctor normalizes zero to a single worker as a defensive backstop so +// any direct ThreadPool(0) still makes progress instead of hanging. +TEST(ThreadPoolTest, ZeroThreadPoolStillExecutesAndDrains) { + common::ThreadPool pool(0); + EXPECT_GE(pool.num_threads(), static_cast(1)); + + std::atomic ran{0}; + pool.submit([&ran]() { ran.fetch_add(1); }); + auto fut = pool.submit([]() { return 42; }); + + auto wait_with_timeout = [&pool]() { + // wait_all has no timeout; run it in a helper thread we can join(). + std::promise done; + auto fut = done.get_future(); + std::thread t([&pool, &done]() { + pool.wait_all(); + done.set_value(); + }); + auto status = fut.wait_for(std::chrono::seconds(2)); + if (status != std::future_status::ready) { + // Detach so a hung pool doesn't terminate the test process. + t.detach(); + return false; + } + t.join(); + return true; + }; + + ASSERT_TRUE(wait_with_timeout()) << "wait_all hung — zero-thread pool"; + EXPECT_EQ(ran.load(), 1); + EXPECT_EQ(fut.get(), 42); +} + +#endif // ENABLE_THREADS diff --git a/cpp/test/common/tsfile_common_test.cc b/cpp/test/common/tsfile_common_test.cc index 01e193f79..c451a8136 100644 --- a/cpp/test/common/tsfile_common_test.cc +++ b/cpp/test/common/tsfile_common_test.cc @@ -21,6 +21,9 @@ #include #include +#include "common/global.h" +#include "compress/compressor_factory.h" + namespace storage { TEST(PageHeaderTest, DefaultConstructor) { PageHeader header; @@ -471,4 +474,26 @@ TEST_F(TsFileMetaTest, SerializeDeserialize) { ASSERT_EQ(*new_meta.tsfile_properties_["key"], std::string("value")); ASSERT_EQ(new_meta.tsfile_properties_["null_key"], nullptr); } + +// Regression: the default-compression configuration must name a compressor +// that the build actually provides; otherwise CompressorFactory returns +// nullptr at write time. init_config_value() previously gated SNAPPY on +// ENABLE_LZ4, which broke --disable-snappy --enable-lz4 builds. +TEST(DefaultCompressorTest, DefaultIsAllocatable) { + common::init_config_value(); + Compressor* c = CompressorFactory::alloc_compressor( + common::g_config_value_.default_compression_type_); + ASSERT_NE(c, nullptr); +#ifdef ENABLE_SNAPPY + EXPECT_EQ(common::g_config_value_.default_compression_type_, + common::CompressionType::SNAPPY); +#elif defined(ENABLE_LZ4) + EXPECT_EQ(common::g_config_value_.default_compression_type_, + common::CompressionType::LZ4); +#else + EXPECT_EQ(common::g_config_value_.default_compression_type_, + common::CompressionType::UNCOMPRESSED); +#endif + CompressorFactory::free(c); +} } // namespace storage diff --git a/cpp/test/compress/lz4_compressor_test.cc b/cpp/test/compress/lz4_compressor_test.cc index c57ec0caf..0b2249f8d 100644 --- a/cpp/test/compress/lz4_compressor_test.cc +++ b/cpp/test/compress/lz4_compressor_test.cc @@ -126,4 +126,40 @@ TEST_F(LZ4Test, TestBytes2) { compressor.after_compress(compressed_buf); compressor.after_uncompress(decompressed_buf); } + +TEST_F(LZ4Test, AfterUncompressFreesParamNotMember) { + storage::LZ4Compressor compressor; + std::string input_a(1024, 'A'); + std::string input_b(2048, 'B'); + char* compressed_a = nullptr; + char* compressed_b = nullptr; + uint32_t compressed_a_len = 0; + uint32_t compressed_b_len = 0; + + ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a, + compressed_a_len), + common::E_OK); + ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b, + compressed_b_len), + common::E_OK); + + char* uncompressed_a = nullptr; + char* uncompressed_b = nullptr; + uint32_t uncompressed_a_len = 0; + uint32_t uncompressed_b_len = 0; + ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len, + uncompressed_a, uncompressed_a_len), + common::E_OK); + ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len, + uncompressed_b, uncompressed_b_len), + common::E_OK); + + compressor.after_uncompress(uncompressed_a); + EXPECT_EQ(uncompressed_b_len, input_b.size()); + EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0); + + compressor.after_uncompress(uncompressed_b); + compressor.after_compress(compressed_a); + compressor.after_compress(compressed_b); +} } // namespace diff --git a/cpp/test/compress/snappy_compressor_test.cc b/cpp/test/compress/snappy_compressor_test.cc index d24915d70..249200cce 100644 --- a/cpp/test/compress/snappy_compressor_test.cc +++ b/cpp/test/compress/snappy_compressor_test.cc @@ -126,4 +126,40 @@ TEST_F(SnappyTest, TestBytes2) { compressor.after_compress(compressed_buf); compressor.after_uncompress(decompressed_buf); } + +TEST_F(SnappyTest, AfterUncompressFreesParamNotMember) { + storage::SnappyCompressor compressor; + std::string input_a(1024, 'A'); + std::string input_b(2048, 'B'); + char* compressed_a = nullptr; + char* compressed_b = nullptr; + uint32_t compressed_a_len = 0; + uint32_t compressed_b_len = 0; + + ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a, + compressed_a_len), + common::E_OK); + ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b, + compressed_b_len), + common::E_OK); + + char* uncompressed_a = nullptr; + char* uncompressed_b = nullptr; + uint32_t uncompressed_a_len = 0; + uint32_t uncompressed_b_len = 0; + ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len, + uncompressed_a, uncompressed_a_len), + common::E_OK); + ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len, + uncompressed_b, uncompressed_b_len), + common::E_OK); + + compressor.after_uncompress(uncompressed_a); + EXPECT_EQ(uncompressed_b_len, input_b.size()); + EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0); + + compressor.after_uncompress(uncompressed_b); + compressor.after_compress(compressed_a); + compressor.after_compress(compressed_b); +} } // namespace diff --git a/cpp/test/compress/uncompressed_compressor_test.cc b/cpp/test/compress/uncompressed_compressor_test.cc new file mode 100644 index 000000000..c4f1e8ced --- /dev/null +++ b/cpp/test/compress/uncompressed_compressor_test.cc @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "compress/uncompressed_compressor.h" + +#include + +#include + +namespace storage { + +// Regression: after_uncompress() used to free the cached uncompressed_buf_ +// member regardless of which buffer the caller actually passed in. Two +// successive uncompress() calls would cache only the second buffer; calling +// after_uncompress(first) then freed the still-live second buffer (UAF) and +// leaked the first. The fix frees the parameter and only clears the +// member when it matches. We can't directly observe UAF in a unit test, +// but we can verify the contract: a buffer the caller is releasing is no +// longer used after the call, and the second buffer's contents stay +// readable until its own after_uncompress() runs. +TEST(UncompressedCompressorTest, AfterUncompressFreesParamNotMember) { + UncompressedCompressor c; + + const char src_a[] = "AAAA-payload-A"; + const char src_b[] = "BBBB-payload-B-longer"; + + char* uA = nullptr; + uint32_t lenA = 0; + ASSERT_EQ( + c.uncompress(const_cast(src_a), sizeof(src_a) - 1, uA, lenA), + common::E_OK); + ASSERT_NE(uA, nullptr); + ASSERT_EQ(lenA, sizeof(src_a) - 1); + EXPECT_EQ(memcmp(uA, src_a, lenA), 0); + + char* uB = nullptr; + uint32_t lenB = 0; + ASSERT_EQ( + c.uncompress(const_cast(src_b), sizeof(src_b) - 1, uB, lenB), + common::E_OK); + ASSERT_NE(uB, nullptr); + EXPECT_NE(uA, uB); + EXPECT_EQ(memcmp(uB, src_b, lenB), 0); + + // Release the FIRST buffer. Under the old bug this would free uB + // (the member-cached pointer) and leak uA. Under the fix it frees uA + // and leaves uB intact for the next read. + c.after_uncompress(uA); + // uB must still be readable — if we had freed it above, the cached + // member pointer would now point into freed memory and most + // allocators would either return the byte back to the free list or + // poison it. Validate via the original content. + EXPECT_EQ(memcmp(uB, src_b, lenB), 0); + + // Releasing uB should be a clean no-op-after on the member. + c.after_uncompress(uB); +} + +} // namespace storage diff --git a/cpp/test/cwrapper/c_release_test.cc b/cpp/test/cwrapper/c_release_test.cc index 375c7e115..bb21483f7 100644 --- a/cpp/test/cwrapper/c_release_test.cc +++ b/cpp/test/cwrapper/c_release_test.cc @@ -40,6 +40,7 @@ class CReleaseTest : public testing::Test {}; TEST_F(CReleaseTest, TestCreateFile) { ERRNO error_no = RET_OK; + remove("create_file1.tsfile"); // Create File and Get RET_OK WriteFile file = write_file_new("create_file1.tsfile", &error_no); ASSERT_EQ(RET_OK, error_no); @@ -50,7 +51,8 @@ TEST_F(CReleaseTest, TestCreateFile) { ASSERT_EQ(RET_ALREADY_EXIST, error_no); ASSERT_EQ(nullptr, file); - // Folder + // Folder: rejected either as an open error (POSIX) or as already-existing + // (Windows / filesystems where the directory already exists). file = write_file_new("test/", &error_no); ASSERT_TRUE(error_no == RET_FILRET_OPEN_ERR || error_no == RET_ALREADY_EXIST); @@ -112,6 +114,17 @@ TEST_F(CReleaseTest, TsFileWriterNew) { free_write_file(&file); remove("test_empty_writer.tsfile"); + // Normal schema with memory threshold + file = write_file_new("test_memory_threshold_writer.tsfile", &error_code); + ASSERT_EQ(RET_OK, error_code); + writer = tsfile_writer_new_with_memory_threshold(file, &table_schema, 100, + &error_code); + ASSERT_NE(nullptr, writer); + ASSERT_EQ(RET_OK, error_code); + ASSERT_EQ(RET_OK, tsfile_writer_close(writer)); + free_write_file(&file); + remove("test_memory_threshold_writer.tsfile"); + free_table_schema(table_schema); free_table_schema(test_schema); } @@ -142,6 +155,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) { TsFileWriter writer = tsfile_writer_new(file, &abnormal_schema, &error_code); ASSERT_EQ(RET_INVALID_SCHEMA, error_code); + writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema, + 100, &error_code); + ASSERT_EQ(nullptr, writer); + ASSERT_EQ(RET_INVALID_SCHEMA, error_code); free(abnormal_schema.column_schemas[2].column_name); abnormal_schema.column_schemas[2] = @@ -150,6 +167,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) { // datatype conflict writer = tsfile_writer_new(file, &abnormal_schema, &error_code); ASSERT_EQ(RET_INVALID_SCHEMA, error_code); + writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema, + 100, &error_code); + ASSERT_EQ(nullptr, writer); + ASSERT_EQ(RET_INVALID_SCHEMA, error_code); free(abnormal_schema.column_schemas[1].column_name); abnormal_schema.column_schemas[1] = @@ -388,4 +409,4 @@ TEST_F(CReleaseTest, TsFileWriterConfTest) { remove("plain_file.tsfile"); } -} // namespace CReleaseTest \ No newline at end of file +} // namespace CReleaseTest diff --git a/cpp/test/cwrapper/cwrapper_test.cc b/cpp/test/cwrapper/cwrapper_test.cc index 9cf06d2f8..2ac6cad21 100644 --- a/cpp/test/cwrapper/cwrapper_test.cc +++ b/cpp/test/cwrapper/cwrapper_test.cc @@ -314,4 +314,155 @@ TEST_F(CWrapperTest, WriterFlushTabletAndReadData) { free(data_types); free_write_file(&file); } -} // namespace cwrapper \ No newline at end of file + +// Regression: tsfile_writer_new_with_memory_threshold() had its duplicate- +// column check inverted (`==` instead of `!=`), so the very first column +// always looked like a duplicate and the constructor returned +// E_INVALID_SCHEMA before any legitimate schema could be used. Compare to +// tsfile_writer_new() in the same file which had the correct check. +TEST(TsFileWriterCApiTest, NewWithMemoryThresholdAcceptsValidSchema) { + const char* path = "cwrapper_writer_with_threshold_smoke.tsfile"; + remove(path); + ERRNO code = 0; + WriteFile file = write_file_new(path, &code); + ASSERT_EQ(code, RET_OK); + + const int column_num = 3; + TableSchema schema; + schema.table_name = strdup("t"); + schema.column_num = column_num; + schema.column_schemas = + static_cast(malloc(sizeof(ColumnSchema) * column_num)); + schema.column_schemas[0] = + ColumnSchema{strdup("id1"), TS_DATATYPE_STRING, TAG}; + schema.column_schemas[1] = + ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD}; + schema.column_schemas[2] = + ColumnSchema{strdup("s2"), TS_DATATYPE_DOUBLE, FIELD}; + + TsFileWriter writer = tsfile_writer_new_with_memory_threshold( + file, &schema, 1024 * 1024, &code); + EXPECT_NE(writer, nullptr) << "constructor refused a valid 3-column schema"; + EXPECT_EQ(code, RET_OK); + + // Duplicate column triggers the now-correct path. + TableSchema dup; + dup.table_name = strdup("t"); + dup.column_num = 2; + dup.column_schemas = + static_cast(malloc(sizeof(ColumnSchema) * 2)); + dup.column_schemas[0] = + ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD}; + dup.column_schemas[1] = + ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD}; + ERRNO dup_code = 0; + TsFileWriter dup_writer = tsfile_writer_new_with_memory_threshold( + file, &dup, 1024 * 1024, &dup_code); + EXPECT_EQ(dup_writer, nullptr); + EXPECT_EQ(dup_code, common::E_INVALID_SCHEMA); + + if (writer != nullptr) { + tsfile_writer_close(writer); + } + free_table_schema(schema); + free_table_schema(dup); + free_write_file(&file); + remove(path); +} + +// Regression: tsfile_writer_new / tsfile_writer_new_with_memory_threshold / +// _tsfile_writer_register_table used to dereference null inputs directly, +// crashing the host process. Each now reports E_INVALID_ARG (or returns +// nullptr when err_code itself is null) instead of segfaulting. +TEST(TsFileWriterCApiTest, RejectsNullInputs) { + ERRNO err = 0; + + // tsfile_writer_new: null file + EXPECT_EQ( + tsfile_writer_new(nullptr, reinterpret_cast(1), &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + // tsfile_writer_new: null schema + err = 0; + EXPECT_EQ(tsfile_writer_new(reinterpret_cast(1), nullptr, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + // tsfile_writer_new: null err_code + EXPECT_EQ(tsfile_writer_new(nullptr, nullptr, nullptr), nullptr); + + // tsfile_writer_new_with_memory_threshold: same checks + err = 0; + EXPECT_EQ(tsfile_writer_new_with_memory_threshold( + nullptr, reinterpret_cast(1), 1024, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + // _tsfile_writer_register_table: nulls + EXPECT_EQ(_tsfile_writer_register_table(nullptr, + reinterpret_cast(1)), + common::E_INVALID_ARG); + EXPECT_EQ(_tsfile_writer_register_table(reinterpret_cast(1), + nullptr), + common::E_INVALID_ARG); +} + +// Regression: the tag-filter C API used to dereference a null reader and +// pass null char pointers straight to std::string(), crashing the host +// process. Each entry point must now return nullptr / E_INVALID_ARG on +// missing inputs instead of segfaulting. This test only checks the guards +// are in place — it deliberately never touches a real reader. +TEST(TagFilterCApiTest, RejectsNullInputs) { + const char* table = "t"; + const char* col = "c"; + const char* val = "v"; + + EXPECT_EQ(tsfile_tag_filter_eq(nullptr, table, col, val), nullptr); + EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast(1), nullptr, + col, val), + nullptr); + EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast(1), table, + nullptr, val), + nullptr); + EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast(1), table, + col, nullptr), + nullptr); + + EXPECT_EQ(tsfile_tag_filter_neq(nullptr, table, col, val), nullptr); + EXPECT_EQ(tsfile_tag_filter_lt(nullptr, table, col, val), nullptr); + EXPECT_EQ(tsfile_tag_filter_lteq(nullptr, table, col, val), nullptr); + EXPECT_EQ(tsfile_tag_filter_gt(nullptr, table, col, val), nullptr); + EXPECT_EQ(tsfile_tag_filter_gteq(nullptr, table, col, val), nullptr); + + ERRNO err = common::E_OK; + EXPECT_EQ( + tsfile_tag_filter_create(nullptr, table, col, val, TAG_FILTER_EQ, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + err = common::E_OK; + EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast(1), + nullptr, col, val, TAG_FILTER_EQ, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + err = common::E_OK; + EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast(1), table, + nullptr, val, TAG_FILTER_EQ, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + err = common::E_OK; + EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast(1), table, + col, nullptr, TAG_FILTER_EQ, &err), + nullptr); + EXPECT_EQ(err, common::E_INVALID_ARG); + + // err_code itself is null — must not crash, must return null. + EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast(1), table, + col, val, TAG_FILTER_EQ, nullptr), + nullptr); +} + +} // namespace cwrapper diff --git a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc index 3de447ffd..4983c57ea 100644 --- a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc +++ b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc @@ -217,7 +217,7 @@ TEST_F(CWrapperQueryByRowTest, TableByRowOffsetLimit) { const int limit = 5; ResultSet rs = tsfile_reader_query_table_by_row(reader, table_name.c_str(), column_names_c, 2, offset, - limit, NULL, 0, &code); + limit, nullptr, 0, &code); ASSERT_EQ(code, RET_OK); ASSERT_NE(rs, nullptr); diff --git a/cpp/test/encoding/encoding_coverage_test.cc b/cpp/test/encoding/encoding_coverage_test.cc new file mode 100644 index 000000000..6970b9387 --- /dev/null +++ b/cpp/test/encoding/encoding_coverage_test.cc @@ -0,0 +1,406 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Targeted coverage tests that exercise paths missed by the per-codec +// roundtrip tests: type-mismatch error returns, has_remaining variants, +// SIMD/scalar batch branches, floating-point special values, dictionary +// decoder/encoder, and reset cycles. + +#include +#include +#include + +#include "common/allocator/byte_stream.h" +#include "encoding/dictionary_decoder.h" +#include "encoding/dictionary_encoder.h" +#include "encoding/gorilla_decoder.h" +#include "encoding/gorilla_encoder.h" +#include "encoding/int32_rle_decoder.h" +#include "encoding/int32_rle_encoder.h" +#include "encoding/int64_rle_decoder.h" +#include "encoding/int64_rle_encoder.h" +#include "encoding/plain_decoder.h" +#include "encoding/plain_encoder.h" +#include "encoding/ts2diff_decoder.h" +#include "encoding/ts2diff_encoder.h" +#include "encoding/zigzag_decoder.h" +#include "encoding/zigzag_encoder.h" +#include "gtest/gtest.h" + +namespace storage { + +// ── Type-mismatch returns ──────────────────────────────────────────────── +// +// Every codec exposes read_boolean / read_int32 / read_int64 / read_float / +// read_double / read_String. Most of them only implement one or two and +// return E_TYPE_NOT_MATCH for the rest, but those return paths were never +// hit by the existing per-codec tests (which only call the one supported +// method per codec). +TEST(EncodingCoverage, TypeMismatchReturnsAreReachable) { + common::ByteStream s(64, common::MOD_DEFAULT); + common::PageArena pa; + pa.init(512, common::MOD_DEFAULT); + bool b; + float f; + double d; + int64_t i64; + common::String str; + + // Each decoder returns an error sentinel (E_TYPE_NOT_MATCH or + // E_NOT_SUPPORT depending on codec) for the read_* variants it + // doesn't implement. We only care that the unsupported path returns + // an error rather than a corrupted value. Note that GorillaDecoder + // implements its unsupported paths with `ASSERT(false)`; calling + // those in Debug builds aborts, so we exercise only the codecs that + // return cleanly (Zigzag, RLE). + auto NE_OK = [](int r) { EXPECT_NE(r, common::E_OK); }; + IntZigzagDecoder zz; + NE_OK(zz.read_boolean(b, s)); + NE_OK(zz.read_float(f, s)); + NE_OK(zz.read_double(d, s)); + NE_OK(zz.read_String(str, pa, s)); + + Int32RleDecoder rle32; + NE_OK(rle32.read_int64(i64, s)); + NE_OK(rle32.read_float(f, s)); + NE_OK(rle32.read_double(d, s)); + NE_OK(rle32.read_String(str, pa, s)); + + Int64RleDecoder rle64; + int32_t i32; + NE_OK(rle64.read_boolean(b, s)); + NE_OK(rle64.read_int32(i32, s)); + NE_OK(rle64.read_float(f, s)); + NE_OK(rle64.read_double(d, s)); + NE_OK(rle64.read_String(str, pa, s)); + (void)i32; + (void)i64; +} + +// ── Reset cycles ──────────────────────────────────────────────────────── +// +// Each codec defines a reset() that resets internal state; nothing in the +// roundtrip tests calls it. Encode → reset → re-encode should still +// produce a stream that decodes to the second batch's values. +TEST(EncodingCoverage, ResetClearsState) { + { + IntZigzagEncoder enc; + IntZigzagDecoder dec; + common::ByteStream s(64, common::MOD_DEFAULT); + EXPECT_EQ(enc.encode(123, s), common::E_OK); + enc.flush(s); + EXPECT_EQ(dec.decode(s), 123); + dec.reset(); + common::ByteStream s2(64, common::MOD_DEFAULT); + EXPECT_EQ(enc.encode(-456, s2), common::E_OK); + enc.flush(s2); + EXPECT_EQ(dec.decode(s2), -456); + } + { + IntGorillaEncoder enc; + IntGorillaDecoder dec; + common::ByteStream s(64, common::MOD_DEFAULT); + EXPECT_EQ(enc.encode(7, s), common::E_OK); + EXPECT_EQ(enc.encode(7, s), common::E_OK); + enc.flush(s); + int32_t v; + EXPECT_EQ(dec.read_int32(v, s), common::E_OK); + EXPECT_EQ(v, 7); + dec.reset(); + enc.reset(); + common::ByteStream s2(64, common::MOD_DEFAULT); + EXPECT_EQ(enc.encode(42, s2), common::E_OK); + EXPECT_EQ(enc.encode(42, s2), common::E_OK); + enc.flush(s2); + EXPECT_EQ(dec.read_int32(v, s2), common::E_OK); + EXPECT_EQ(v, 42); + } +} + +// ── has_remaining variants ────────────────────────────────────────────── +TEST(EncodingCoverage, HasRemainingOnEmptyAndAfterDrain) { + common::ByteStream empty(64, common::MOD_DEFAULT); + { + IntZigzagDecoder zz; + EXPECT_FALSE(zz.has_remaining(empty)); + } + { + IntGorillaDecoder g; + EXPECT_FALSE(g.has_remaining(empty)); + } + { + Int32RleDecoder rle; + EXPECT_FALSE(rle.has_remaining(empty)); + } + { + TS2DIFFDecoder t; + EXPECT_FALSE(t.has_remaining(empty)); + } + { + PlainDecoder p; + EXPECT_FALSE(p.has_remaining(empty)); + } +} + +// ── Gorilla floating-point special values ────────────────────────────── +// +// FloatGorillaDecoder / DoubleGorillaDecoder run different VALUE_BITS and +// ending-sentinel paths. Verify they round-trip NaN, infinity, -0.0 and +// denormals — none of which the existing happy-path roundtrip exercises. +TEST(EncodingCoverage, GorillaFloatSpecialValues) { + FloatGorillaEncoder enc; + common::ByteStream s(256, common::MOD_DEFAULT); + std::vector values = { + 0.0f, + -0.0f, + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + std::numeric_limits::min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::epsilon(), + 1.0f, + -1.0f, + std::numeric_limits::max(), + std::numeric_limits::lowest(), + }; + for (float v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK); + enc.flush(s); + + FloatGorillaDecoder dec; + float out; + for (size_t i = 0; i < values.size(); i++) { + ASSERT_EQ(dec.read_float(out, s), common::E_OK) << "i=" << i; + if (std::isnan(values[i])) { + EXPECT_TRUE(std::isnan(out)); + } else { + // Bitwise compare to catch -0.0 vs 0.0 etc. + uint32_t a, b; + memcpy(&a, &values[i], sizeof(float)); + memcpy(&b, &out, sizeof(float)); + EXPECT_EQ(a, b) << "i=" << i; + } + } +} + +TEST(EncodingCoverage, GorillaDoubleSpecialValues) { + DoubleGorillaEncoder enc; + common::ByteStream s(256, common::MOD_DEFAULT); + std::vector values = { + 0.0, + -0.0, + std::numeric_limits::infinity(), + -std::numeric_limits::infinity(), + std::numeric_limits::min(), + std::numeric_limits::denorm_min(), + std::numeric_limits::epsilon(), + 1.0, + -1.0, + std::numeric_limits::max(), + std::numeric_limits::lowest(), + }; + for (double v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK); + enc.flush(s); + + DoubleGorillaDecoder dec; + double out; + for (size_t i = 0; i < values.size(); i++) { + ASSERT_EQ(dec.read_double(out, s), common::E_OK) << "i=" << i; + uint64_t a, b; + memcpy(&a, &values[i], sizeof(double)); + memcpy(&b, &out, sizeof(double)); + EXPECT_EQ(a, b) << "i=" << i; + } +} + +// ── Gorilla skip path ─────────────────────────────────────────────────── +TEST(EncodingCoverage, GorillaSkipInt32Roundtrip) { + IntGorillaEncoder enc; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 200; + std::vector values(N); + for (int i = 0; i < N; i++) { + values[i] = i * 11 - 5; + ASSERT_EQ(enc.encode(values[i], stream), common::E_OK); + } + enc.flush(stream); + + // Wrap into contiguous buffer for batch_skip_raw. + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + IntGorillaDecoder dec; + int skipped = 0; + ASSERT_EQ(dec.skip_int32(50, skipped, wrapped), common::E_OK); + EXPECT_EQ(skipped, 50); + int32_t out[N]; + int actual = 0; + ASSERT_EQ(dec.read_batch_int32(out, N - 50, actual, wrapped), common::E_OK); + EXPECT_EQ(actual, N - 50); + for (int i = 0; i < N - 50; i++) { + EXPECT_EQ(out[i], values[50 + i]) << "i=" << i; + } +} + +// ── TS2DIFF batch decode hits SIMD block + scalar tail ───────────────── +TEST(EncodingCoverage, TS2DIFFBatchInt32MultipleBlocks) { + TS2DIFFEncoder enc; + common::ByteStream s(8192, common::MOD_DEFAULT); + // Encode 500 values to span ~4 blocks (default block size 128). + const int N = 500; + std::vector values(N); + for (int i = 0; i < N; i++) { + values[i] = i * 7 + 3; + ASSERT_EQ(enc.encode(values[i], s), common::E_OK); + } + ASSERT_EQ(enc.flush(s), common::E_OK); + + // Wrap-from for the SIMD/scalar block fast path. + uint32_t total = s.total_size(); + std::vector buf(total); + uint32_t got = 0; + s.read_buf(buf.data(), total, got); + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + TS2DIFFDecoder dec; + std::vector out(N); + int total_decoded = 0; + while (dec.has_remaining(wrapped) && total_decoded < N) { + int actual = 0; + ASSERT_EQ(dec.read_batch_int32(out.data() + total_decoded, + N - total_decoded, actual, wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + EXPECT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i; +} + +TEST(EncodingCoverage, TS2DIFFBatchInt64MultipleBlocks) { + TS2DIFFEncoder enc; + common::ByteStream s(8192, common::MOD_DEFAULT); + const int N = 500; + std::vector values(N); + for (int i = 0; i < N; i++) { + values[i] = static_cast(i) * 17 + 41; + ASSERT_EQ(enc.encode(values[i], s), common::E_OK); + } + ASSERT_EQ(enc.flush(s), common::E_OK); + + uint32_t total = s.total_size(); + std::vector buf(total); + uint32_t got = 0; + s.read_buf(buf.data(), total, got); + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + TS2DIFFDecoder dec; + std::vector out(N); + int total_decoded = 0; + while (dec.has_remaining(wrapped) && total_decoded < N) { + int actual = 0; + ASSERT_EQ(dec.read_batch_int64(out.data() + total_decoded, + N - total_decoded, actual, wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + EXPECT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i; +} + +// ── Plain encoder: encode_batch fast paths for each type ─────────────── +TEST(EncodingCoverage, PlainEncoderBatchAllTypes) { + PlainEncoder enc; + PlainDecoder dec; + + // Float batch. + { + common::ByteStream s(1024, common::MOD_DEFAULT); + const uint32_t N = 100; + float v[N]; + for (uint32_t i = 0; i < N; i++) v[i] = i * 0.5f - 1.0f; + ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK); + float out[N]; + int actual = 0; + ASSERT_EQ(dec.read_batch_float(out, N, actual, s), common::E_OK); + EXPECT_EQ(actual, static_cast(N)); + for (uint32_t i = 0; i < N; i++) EXPECT_FLOAT_EQ(out[i], v[i]); + } + // Int64 batch. + { + common::ByteStream s(1024, common::MOD_DEFAULT); + const uint32_t N = 100; + int64_t v[N]; + for (uint32_t i = 0; i < N; i++) v[i] = i * 1000 - 50; + ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK); + int64_t out[N]; + int actual = 0; + ASSERT_EQ(dec.read_batch_int64(out, N, actual, s), common::E_OK); + EXPECT_EQ(actual, static_cast(N)); + for (uint32_t i = 0; i < N; i++) EXPECT_EQ(out[i], v[i]); + } +} + +// ── PlainDecoder skip paths (wrapped + paged) ────────────────────────── +TEST(EncodingCoverage, PlainSkipPagedStream) { + PlainEncoder enc; + PlainDecoder dec; + // Paged ByteStream (tiny page) forces the fallback path. + common::ByteStream s(16, common::MOD_DEFAULT); + for (int i = 0; i < 32; i++) + ASSERT_EQ(enc.encode((int64_t)i, s), common::E_OK); + int skipped = 0; + ASSERT_EQ(dec.skip_int64(10, skipped, s), common::E_OK); + EXPECT_EQ(skipped, 10); + int64_t out; + ASSERT_EQ(dec.read_int64(out, s), common::E_OK); + EXPECT_EQ(out, 10); +} + +// ── Dictionary codec roundtrip ───────────────────────────────────────── +TEST(EncodingCoverage, DictionaryStringRoundTrip) { + DictionaryEncoder enc; + common::ByteStream s(1024, common::MOD_DEFAULT); + + std::vector raw = {"apple", "banana", "apple", + "cherry", "banana", "apple"}; + for (const auto& r : raw) { + common::String str(const_cast(r.c_str()), r.size()); + ASSERT_EQ(enc.encode(str, s), common::E_OK); + } + enc.flush(s); + + DictionaryDecoder dec; + common::PageArena pa; + pa.init(512, common::MOD_DEFAULT); + for (const auto& r : raw) { + common::String out; + ASSERT_EQ(dec.read_String(out, pa, s), common::E_OK); + ASSERT_EQ(out.len_, r.size()); + EXPECT_EQ(std::string(out.buf_, out.len_), r); + } +} + +} // namespace storage diff --git a/cpp/test/encoding/gorilla_codec_test.cc b/cpp/test/encoding/gorilla_codec_test.cc index 47056a6db..945451088 100644 --- a/cpp/test/encoding/gorilla_codec_test.cc +++ b/cpp/test/encoding/gorilla_codec_test.cc @@ -207,4 +207,319 @@ TEST_F(GorillaCodecTest, DoubleEncodingDecodingBoundaryValues) { } } +// ── Batch decode tests (exercises the raw-pointer GorillaBitReader path) ── + +TEST_F(GorillaCodecTest, Int32BatchDecode) { + storage::IntGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 500; + int32_t expected[N]; + for (int i = 0; i < N; i++) { + expected[i] = i * 7 - 100; + EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK); + } + encoder.flush(stream); + + // Copy to a contiguous buffer and wrap (simulates production path) + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + ASSERT_EQ(got, total); + + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::IntGorillaDecoder decoder; + int32_t out[N]; + int total_decoded = 0; + while (decoder.has_remaining(wrapped) && total_decoded < N) { + int batch = std::min(129, N - total_decoded); + int actual = 0; + EXPECT_EQ(decoder.read_batch_int32(out + total_decoded, batch, actual, + wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + ASSERT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) { + EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i; + } +} + +TEST_F(GorillaCodecTest, Int64BatchDecode) { + storage::LongGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 500; + int64_t expected[N]; + for (int i = 0; i < N; i++) { + expected[i] = (int64_t)i * 13 - 200; + EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::LongGorillaDecoder decoder; + int64_t out[N]; + int total_decoded = 0; + while (decoder.has_remaining(wrapped) && total_decoded < N) { + int batch = std::min(129, N - total_decoded); + int actual = 0; + EXPECT_EQ(decoder.read_batch_int64(out + total_decoded, batch, actual, + wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + ASSERT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) { + EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i; + } +} + +TEST_F(GorillaCodecTest, FloatBatchDecode) { + storage::FloatGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 300; + std::vector expected(N); + for (int i = 0; i < N; i++) { + expected[i] = (float)i * 1.5f - 50.0f; + EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::FloatGorillaDecoder decoder; + std::vector out(N); + int total_decoded = 0; + while (decoder.has_remaining(wrapped) && total_decoded < N) { + int batch = std::min(129, N - total_decoded); + int actual = 0; + EXPECT_EQ(decoder.read_batch_float(out.data() + total_decoded, batch, + actual, wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + ASSERT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) { + EXPECT_FLOAT_EQ(out[i], expected[i]) << "mismatch at index " << i; + } +} + +TEST_F(GorillaCodecTest, DoubleBatchDecode) { + storage::DoubleGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 300; + std::vector expected(N); + for (int i = 0; i < N; i++) { + expected[i] = (double)i * 2.7 - 100.0; + EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::DoubleGorillaDecoder decoder; + std::vector out(N); + int total_decoded = 0; + while (decoder.has_remaining(wrapped) && total_decoded < N) { + int batch = std::min(129, N - total_decoded); + int actual = 0; + EXPECT_EQ(decoder.read_batch_double(out.data() + total_decoded, batch, + actual, wrapped), + common::E_OK); + if (actual == 0) break; + total_decoded += actual; + } + ASSERT_EQ(total_decoded, N); + for (int i = 0; i < N; i++) { + EXPECT_DOUBLE_EQ(out[i], expected[i]) << "mismatch at index " << i; + } +} + +TEST_F(GorillaCodecTest, Int32BatchSkip) { + storage::IntGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 200; + int32_t expected[N]; + for (int i = 0; i < N; i++) { + expected[i] = i * 3; + EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::IntGorillaDecoder decoder; + // Skip first 50 values + int skipped = 0; + EXPECT_EQ(decoder.skip_int32(50, skipped, wrapped), common::E_OK); + EXPECT_EQ(skipped, 50); + // Read next 50 values + int32_t out[50]; + int actual = 0; + EXPECT_EQ(decoder.read_batch_int32(out, 50, actual, wrapped), common::E_OK); + EXPECT_EQ(actual, 50); + for (int i = 0; i < 50; i++) { + EXPECT_EQ(out[i], expected[50 + i]) << "mismatch at index " << i; + } +} + +// Regression: batch_decode_raw used to write out[0] unconditionally in the +// bootstrap branch, even when capacity was 0. Verify the entry path early +// returns and leaves the stream + state untouched. +TEST_F(GorillaCodecTest, Int32BatchDecodeZeroCapacity) { + storage::IntGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 8; + for (int i = 0; i < N; i++) { + ASSERT_EQ(encoder.encode(i, stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::IntGorillaDecoder decoder; + int32_t sentinel[1] = {0x7fffffff}; + int actual = 42; + EXPECT_EQ(decoder.read_batch_int32(sentinel, 0, actual, wrapped), + common::E_OK); + EXPECT_EQ(actual, 0); + EXPECT_EQ(sentinel[0], 0x7fffffff); // not written + + // Followup decode should still read the first value 0. + int32_t out[N]; + int got_actual = 0; + EXPECT_EQ(decoder.read_batch_int32(out, N, got_actual, wrapped), + common::E_OK); + EXPECT_EQ(got_actual, N); + for (int i = 0; i < N; i++) EXPECT_EQ(out[i], i); +} + +TEST_F(GorillaCodecTest, Int64BatchDecodeZeroCapacity) { + storage::LongGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + for (int i = 0; i < 8; i++) { + ASSERT_EQ(encoder.encode(static_cast(i), stream), + common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + common::ByteStream wrapped(common::MOD_DEFAULT); + wrapped.wrap_from((const char*)buf.data(), total); + + storage::LongGorillaDecoder decoder; + int64_t sentinel[1] = {0x7fffffffffffffffLL}; + int actual = 42; + EXPECT_EQ(decoder.read_batch_int64(sentinel, 0, actual, wrapped), + common::E_OK); + EXPECT_EQ(actual, 0); + EXPECT_EQ(sentinel[0], 0x7fffffffffffffffLL); // not written +} + +// Regression: a truncated Gorilla page used to spin GorillaBitReader::read_long +// forever (bits stays 0, n -= 0 never decreases) and GorillaBitReader::read_bit +// would compute (cur_byte >> -1). batch_decode_raw must now surface +// E_BUF_NOT_ENOUGH instead of looping. +TEST_F(GorillaCodecTest, Int32BatchDecodeTruncatedInputReturnsError) { + // Encode enough values to fill several bits, then chop the buffer down to + // a small prefix so the decoder runs out of bits mid-value. + storage::IntGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 32; + for (int i = 0; i < N; i++) { + ASSERT_EQ(encoder.encode(i * 11 + 3, stream), common::E_OK); + } + encoder.flush(stream); + + uint32_t total = stream.total_size(); + ASSERT_GT(total, 4u); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + ASSERT_EQ(got, total); + + // 3 bytes is large enough to bootstrap the first value (depending on + // VALUE_BITS_LENGTH_32BIT) but typically too short for the full batch. + common::ByteStream truncated(common::MOD_DEFAULT); + truncated.wrap_from((const char*)buf.data(), 3); + + storage::IntGorillaDecoder decoder; + int32_t out[N]; + int actual = -1; + int ret = decoder.read_batch_int32(out, N, actual, truncated); + // Either the decoder reports the truncation, or it stops early without + // looping forever; both are acceptable. What MUST NOT happen is a hang + // or a full-batch return — the test will time out on a hang via the + // GoogleTest harness. + EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH) + << "unexpected ret=" << ret; + EXPECT_LT(actual, N); +} + +TEST_F(GorillaCodecTest, Int64BatchDecodeTruncatedInputReturnsError) { + storage::LongGorillaEncoder encoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const int N = 32; + for (int i = 0; i < N; i++) { + ASSERT_EQ(encoder.encode(static_cast(i) * 17 + 5, stream), + common::E_OK); + } + encoder.flush(stream); + uint32_t total = stream.total_size(); + ASSERT_GT(total, 4u); + std::vector buf(total); + uint32_t got = 0; + stream.read_buf(buf.data(), total, got); + ASSERT_EQ(got, total); + + common::ByteStream truncated(common::MOD_DEFAULT); + truncated.wrap_from((const char*)buf.data(), 3); + + storage::LongGorillaDecoder decoder; + int64_t out[N]; + int actual = -1; + int ret = decoder.read_batch_int64(out, N, actual, truncated); + EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH) + << "unexpected ret=" << ret; + EXPECT_LT(actual, N); +} + } // namespace storage diff --git a/cpp/test/encoding/plain_codec_test.cc b/cpp/test/encoding/plain_codec_test.cc index a51fa9261..6372469e6 100644 --- a/cpp/test/encoding/plain_codec_test.cc +++ b/cpp/test/encoding/plain_codec_test.cc @@ -110,4 +110,90 @@ TEST(PlainEncoderDecoderTest, EncodeDecodeDouble) { EXPECT_DOUBLE_EQ(original, decoded); } +// Regression: read_batch_int64/float/double used to dereference +// in.get_wrapped_buf() unconditionally, which is null for a normal paged +// ByteStream. Verify the fallback path produces correct results. +TEST(PlainEncoderDecoderTest, ReadBatchInt64PagedStream) { + PlainEncoder encoder; + PlainDecoder decoder; + // Tiny page size forces multi-page write so the stream is paged, not + // wrapped. + common::ByteStream stream(16, common::MOD_DEFAULT); + const int N = 32; + int64_t values[N]; + for (int i = 0; i < N; i++) { + values[i] = static_cast(i) * 7 - 3; + encoder.encode(values[i], stream); + } + int64_t out[N]; + int actual = 0; + EXPECT_EQ(decoder.read_batch_int64(out, N, actual, stream), common::E_OK); + EXPECT_EQ(actual, N); + for (int i = 0; i < N; i++) { + EXPECT_EQ(out[i], values[i]) << "mismatch at " << i; + } +} + +TEST(PlainEncoderDecoderTest, ReadBatchFloatPagedStream) { + PlainEncoder encoder; + PlainDecoder decoder; + common::ByteStream stream(16, common::MOD_DEFAULT); + const int N = 32; + float values[N]; + for (int i = 0; i < N; i++) { + values[i] = static_cast(i) * 0.5f - 1.25f; + encoder.encode(values[i], stream); + } + float out[N]; + int actual = 0; + EXPECT_EQ(decoder.read_batch_float(out, N, actual, stream), common::E_OK); + EXPECT_EQ(actual, N); + for (int i = 0; i < N; i++) { + EXPECT_FLOAT_EQ(out[i], values[i]); + } +} + +// Regression: encode_batch(const double*) used to reinterpret_cast to +// int64_t* and dispatch into the int64 path, which read the doubles through +// an int64_t pointer — a strict-aliasing violation under -O. The dedicated +// double path now memcpys per element; verify a full round-trip through it. +TEST(PlainEncoderDecoderTest, EncodeBatchDoubleRoundTrip) { + PlainEncoder encoder; + PlainDecoder decoder; + common::ByteStream stream(1024, common::MOD_DEFAULT); + const uint32_t N = 64; + double values[N]; + for (uint32_t i = 0; i < N; i++) { + values[i] = static_cast(i) * 0.125 - 3.14; + } + ASSERT_EQ(encoder.encode_batch(values, N, stream), common::E_OK); + + double out[N]; + int actual = 0; + EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK); + EXPECT_EQ(actual, static_cast(N)); + for (uint32_t i = 0; i < N; i++) { + EXPECT_DOUBLE_EQ(out[i], values[i]) << "mismatch at " << i; + } +} + +TEST(PlainEncoderDecoderTest, ReadBatchDoublePagedStream) { + PlainEncoder encoder; + PlainDecoder decoder; + common::ByteStream stream(16, common::MOD_DEFAULT); + const int N = 32; + double values[N]; + for (int i = 0; i < N; i++) { + values[i] = static_cast(i) * 1.25 + 3.14; + encoder.encode(values[i], stream); + } + double out[N]; + int actual = 0; + EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK); + EXPECT_EQ(actual, N); + for (int i = 0; i < N; i++) { + EXPECT_DOUBLE_EQ(out[i], values[i]); + } +} + } // end namespace storage \ No newline at end of file diff --git a/cpp/test/encoding/ts2diff_codec_test.cc b/cpp/test/encoding/ts2diff_codec_test.cc index 3164edafb..fb997103c 100644 --- a/cpp/test/encoding/ts2diff_codec_test.cc +++ b/cpp/test/encoding/ts2diff_codec_test.cc @@ -364,4 +364,120 @@ TEST_F(TS2DIFFCodecTest, TestEncodingLast) { EXPECT_FALSE(decoder_int_->has_remaining(out_stream_int32)); } +// Regression: skip_int32/skip_int64 used to advance the stream by the full +// block size even when the requested skip count fell short of the block, +// which silently dropped values from the next read in aligned nullable +// columns. Verify that skipping a count smaller than the first block leaves +// the remainder of that block intact and decodable. +TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt32PreservesRemainder) { + common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false); + const int row_num = 1024; + std::vector data(row_num); + for (int i = 0; i < row_num; i++) { + data[i] = i * 3 + 7; + } + for (int i = 0; i < row_num; i++) { + ASSERT_EQ(encoder_int_->encode(data[i], out_stream), common::E_OK); + } + ASSERT_EQ(encoder_int_->flush(out_stream), common::E_OK); + + const int skip_count = 5; + int skipped = 0; + ASSERT_EQ(decoder_int_->skip_int32(skip_count, skipped, out_stream), + common::E_OK); + EXPECT_EQ(skipped, skip_count); + + int32_t v; + for (int i = skip_count; i < row_num; i++) { + ASSERT_EQ(decoder_int_->read_int32(v, out_stream), common::E_OK); + EXPECT_EQ(v, data[i]) << "mismatch at idx " << i; + } +} + +TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt64PreservesRemainder) { + common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false); + const int row_num = 1024; + std::vector data(row_num); + for (int i = 0; i < row_num; i++) { + data[i] = static_cast(i) * 13 + 11; + } + for (int i = 0; i < row_num; i++) { + ASSERT_EQ(encoder_long_->encode(data[i], out_stream), common::E_OK); + } + ASSERT_EQ(encoder_long_->flush(out_stream), common::E_OK); + + const int skip_count = 7; + int skipped = 0; + ASSERT_EQ(decoder_long_->skip_int64(skip_count, skipped, out_stream), + common::E_OK); + EXPECT_EQ(skipped, skip_count); + + int64_t v; + for (int i = skip_count; i < row_num; i++) { + ASSERT_EQ(decoder_long_->read_int64(v, out_stream), common::E_OK); + EXPECT_EQ(v, data[i]) << "mismatch at idx " << i; + } +} + +// Regression: pack_bits_msb used to drop ByteStream::write_buf's return value +// on the floor and unconditionally return 0 (success). flush() then reported +// E_OK and reset() wiped encoder state even when the actual data never made +// it onto the stream. The fix surfaces the underlying error code via the +// helper's return value. +// +// We can't easily inject a real write failure without a custom allocator +// (ByteStream::write_buf only fails on OOM), so this test pins down the +// contract on the visible boundary: a wide bit_width must return the +// dedicated "fallback" sentinel (-1) so flush() knows to take the per-bit +// path, and the helper's return type must be the error code from write_buf +// otherwise. Future refactors that swallow the write error would either +// stop returning -1 for fallback (caught here) or break round-trip in the +// happy-path test below. +TEST_F(TS2DIFFCodecTest, PackBitsMsbFallbackSentinelStillReported) { + common::ByteStream out(1024, common::MOD_TS2DIFF_OBJ, false); + int64_t values[4] = {1, 2, 3, 4}; + EXPECT_EQ(TS2DIFFEncoder::pack_bits_msb(values, 4, 57, out), -1); + // Healthy small bit_width writes succeed. + int32_t small_values[4] = {1, 2, 3, 4}; + EXPECT_EQ(TS2DIFFEncoder::pack_bits_msb(small_values, 4, 3, out), + common::E_OK); +} + +// Regression: FloatTS2DIFFEncoder / DoubleTS2DIFFEncoder kept the previous +// page's overflow markers in underflow_flags_ when reset() was called +// directly (PageWriter drops a partial page that way). The next page would +// then read the stale flags and emit a wrong overflow bitmap. reset() now +// clears underflow_flags_; verify a reset between pages doesn't leak the +// first page's overflow state into the second. +TEST(FloatTS2DIFFEncoderResetTest, ResetClearsUnderflowFlags) { + storage::FloatTS2DIFFEncoder enc; + common::ByteStream out1(1024, common::MOD_TS2DIFF_OBJ, false); + // Encode a value that overflows the scale factor so the encoder records + // an underflow flag. + const float overflow_value = 1e30f; // scaled > INT32_MAX + ASSERT_EQ(enc.encode(0.0f, out1), common::E_OK); + ASSERT_EQ(enc.encode(overflow_value, out1), common::E_OK); + + // Drop the page without flushing. PageWriter does exactly this when + // discarding a half-built page. + enc.reset(); + + // Encode a clean page that should not have any overflow markers. + common::ByteStream out2(1024, common::MOD_TS2DIFF_OBJ, false); + ASSERT_EQ(enc.encode(0.0f, out2), common::E_OK); + ASSERT_EQ(enc.encode(1.0f, out2), common::E_OK); + ASSERT_EQ(enc.encode(2.0f, out2), common::E_OK); + ASSERT_EQ(enc.flush(out2), common::E_OK); + + // Round-trip the clean page; if reset() leaked the stale overflow flags + // the decoder would misinterpret the leading bytes as an overflow + // bitmap header and fail to recover the original values. + storage::FloatTS2DIFFDecoder dec; + float v = 0.0f; + for (int i = 0; i < 3; i++) { + ASSERT_EQ(dec.read_float(v, out2), common::E_OK); + EXPECT_NEAR(v, static_cast(i), 1e-5f); + } +} + } // namespace storage diff --git a/cpp/test/file/restorable_tsfile_io_writer_test.cc b/cpp/test/file/restorable_tsfile_io_writer_test.cc index 8f723e056..c60a855c5 100644 --- a/cpp/test/file/restorable_tsfile_io_writer_test.cc +++ b/cpp/test/file/restorable_tsfile_io_writer_test.cc @@ -994,4 +994,70 @@ TEST_F(RestorableTsFileIOWriterTest, } ASSERT_EQ(table_writer2.close(), E_OK); } -} \ No newline at end of file +} + +// Regression: recovery of an aligned single-page value chunk must consult the +// page's not-null bitmap to bind each decoded value to its real timestamp. +// The bug paired non-null values densely with times[0..N-1], so a column whose +// only non-null entry sat at the tail surfaced start_time/end_time equal to +// the head of the time chunk, which then leaked through chunk-level time +// filters. +TEST_F(RestorableTsFileIOWriterTest, RecoveryAlignedSparseStatRespectsBitmap) { + const int64_t kBase = 100; + const int kRowCount = 10; + const int kNonNullRow = 7; + const std::string table_name = "sparse_aligned_t"; + std::vector ms_vec; + ms_vec.push_back(new MeasurementSchema("device", STRING)); + ms_vec.push_back(new MeasurementSchema("s1", INT64)); + std::vector cats = {ColumnCategory::TAG, + ColumnCategory::FIELD}; + TableSchema table_schema(table_name, ms_vec, cats); + { + WriteFile wf; + ASSERT_EQ(wf.create(file_name_, GetWriteCreateFlags(), 0666), E_OK); + TsFileTableWriter tw(&wf, &table_schema); + Tablet tablet(table_schema.get_measurement_names(), + table_schema.get_data_types(), kRowCount); + tablet.set_table_name(table_name); + for (int i = 0; i < kRowCount; i++) { + tablet.add_timestamp(i, kBase + i); + tablet.add_value(i, "device", "d0"); + // Only row kNonNullRow gets a value; the rest stay null. + if (i == kNonNullRow) { + tablet.add_value(i, "s1", static_cast(999)); + } + } + ASSERT_EQ(tw.write_table(tablet), E_OK); + ASSERT_EQ(tw.flush(), E_OK); + ASSERT_EQ(tw.close(), E_OK); + wf.close(); + } + + CorruptCurrentFileTail(3); + + RestorableTsFileIOWriter rw; + ASSERT_EQ(rw.open(file_name_, true), E_OK); + + const std::vector& cgms = + rw.get_recovered_chunk_group_metas(); + ASSERT_FALSE(cgms.empty()); + + bool found_value_chunk = false; + for (ChunkGroupMeta* cgm : cgms) { + if (cgm == nullptr) continue; + for (auto it = cgm->chunk_meta_list_.begin(); + it != cgm->chunk_meta_list_.end(); it++) { + ChunkMeta* cm = it.get(); + if (cm == nullptr) continue; + if (cm->measurement_name_.to_std_string() != "s1") continue; + ASSERT_NE(cm->statistic_, nullptr); + // Exactly one non-null row at timestamp kBase + kNonNullRow. + EXPECT_EQ(cm->statistic_->count_, 1); + EXPECT_EQ(cm->statistic_->start_time_, kBase + kNonNullRow); + EXPECT_EQ(cm->statistic_->end_time_, kBase + kNonNullRow); + found_value_chunk = true; + } + } + EXPECT_TRUE(found_value_chunk); +} diff --git a/cpp/test/file/write_file_test.cc b/cpp/test/file/write_file_test.cc index 3cb9edd25..615f069e8 100644 --- a/cpp/test/file/write_file_test.cc +++ b/cpp/test/file/write_file_test.cc @@ -141,3 +141,47 @@ TEST_F(WriteFileTest, TruncateFile) { EXPECT_EQ(file_content, "Hello, "); remove(file_name.c_str()); } + +#include "file/tsfile_io_writer.h" + +// Regression: TsFileIOWriter::init() used to leave destroyed_=true after a +// previous destroy(), so the second destroy() (during ~TsFileIOWriter()) +// short-circuited and skipped meta_allocator_.destroy() / +// write_stream_.destroy() / file_ cleanup, leaking everything from the +// new lifecycle. Verify init() rearms the lifecycle by checking destroy() +// runs again cleanly. +TEST(TsFileIOWriterLifecycle, DestroyInitDestroyIsClean) { + std::string fn = "tsfile_iowriter_lifecycle.dat"; + remove(fn.c_str()); + + WriteFile wf1; + int flags = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + flags |= O_BINARY; +#endif + ASSERT_EQ(wf1.create(fn, flags, 0666), E_OK); + + TsFileIOWriter w; + ASSERT_EQ(w.init(&wf1), E_OK); + w.destroy(); + + // Re-init against a fresh WriteFile (same writer object). Under the + // old bug, destroyed_ stays true here. + remove(fn.c_str()); + WriteFile wf2; + ASSERT_EQ(wf2.create(fn, flags, 0666), E_OK); + ASSERT_EQ(w.init(&wf2), E_OK); + + // get_meta_size() reads meta_allocator_.get_total_used_bytes(); on a + // fresh init() this should be 0 (the allocator was reinitialised). + // If destroyed_ had been left true the allocator pages from before + // would still be there. + EXPECT_EQ(w.get_meta_size(), 0); + + // Trigger second destroy() — must not crash on the re-initialised + // resources. + w.destroy(); + + wf2.close(); + remove(fn.c_str()); +} diff --git a/cpp/test/reader/filter/time_in_filter_test.cc b/cpp/test/reader/filter/time_in_filter_test.cc new file mode 100644 index 000000000..9eceaaaa5 --- /dev/null +++ b/cpp/test/reader/filter/time_in_filter_test.cc @@ -0,0 +1,84 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * License); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include + +#include "reader/filter/time_operator.h" + +using namespace storage; + +// Regression: TimeIn::satisfy_start_end_time / contain_start_end_time used to +// return true unconditionally. In the aligned batch/multi paths the +// contain_start_end_time=true branch flips block_all_pass on, the per-row +// satisfy_batch_time check is skipped, and the reader emits every row in the +// block — making `WHERE time IN (2, 8)` look identical to "no time filter" +// whenever the block's time range overlapped the IN list at all. + +TEST(TimeInFilterTest, ContainStartEndTimeIsFalseForSparseRange) { + TimeIn in({2, 8}, /*not_in=*/false); + // Range [0,10] contains many times not in {2,8}; the block cannot + // unconditionally pass. + EXPECT_FALSE(in.contain_start_end_time(0, 10)); + // Range that is a single matching point passes. + EXPECT_TRUE(in.contain_start_end_time(2, 2)); + // Single non-matching point: doesn't pass. + EXPECT_FALSE(in.contain_start_end_time(5, 5)); +} + +TEST(TimeInFilterTest, SatisfyStartEndTimeTracksOverlap) { + TimeIn in({2, 8}, /*not_in=*/false); + // Some value in range → block may have matching rows. + EXPECT_TRUE(in.satisfy_start_end_time(0, 10)); + EXPECT_TRUE(in.satisfy_start_end_time(2, 2)); + EXPECT_TRUE(in.satisfy_start_end_time(8, 8)); + // No value in range → block can be skipped. + EXPECT_FALSE(in.satisfy_start_end_time(3, 7)); + EXPECT_FALSE(in.satisfy_start_end_time(9, 100)); +} + +TEST(TimeInFilterTest, NotInContainSemantics) { + TimeIn not_in({2, 8}, /*not_in=*/true); + // Range [3,7] has no excluded value → every row passes NOT IN. + EXPECT_TRUE(not_in.contain_start_end_time(3, 7)); + // Range [0,10] includes 2 and 8 → cannot blanket-pass. + EXPECT_FALSE(not_in.contain_start_end_time(0, 10)); +} + +TEST(TimeInFilterTest, NotInSatisfyStartEndTimeSemantics) { + TimeIn not_in({2, 8}, /*not_in=*/true); + // Single excluded point: filter rejects it. + EXPECT_FALSE(not_in.satisfy_start_end_time(2, 2)); + // Single non-excluded point: filter accepts it. + EXPECT_TRUE(not_in.satisfy_start_end_time(5, 5)); + // A wider range always has at least one non-excluded time. + EXPECT_TRUE(not_in.satisfy_start_end_time(0, 10)); +} + +TEST(TimeInFilterTest, BatchTimeFallbackUsesScalarSemantics) { + TimeIn in({2, 8}, /*not_in=*/false); + int64_t times[] = {1, 2, 3, 7, 8, 9}; + bool mask[6]; + int pass = in.satisfy_batch_time(times, 6, mask); + EXPECT_EQ(pass, 2); + EXPECT_FALSE(mask[0]); + EXPECT_TRUE(mask[1]); + EXPECT_FALSE(mask[2]); + EXPECT_FALSE(mask[3]); + EXPECT_TRUE(mask[4]); + EXPECT_FALSE(mask[5]); +} diff --git a/cpp/test/reader/query_by_row_performance_test.cc b/cpp/test/reader/query_by_row_performance_test.cc index 4caf26f71..051c15d87 100644 --- a/cpp/test/reader/query_by_row_performance_test.cc +++ b/cpp/test/reader/query_by_row_performance_test.cc @@ -60,6 +60,7 @@ #include "file/write_file.h" #include "reader/tsfile_reader.h" #include "reader/tsfile_tree_reader.h" +#include "utils/util_define.h" #include "writer/tsfile_table_writer.h" #include "writer/tsfile_tree_writer.h" @@ -86,7 +87,8 @@ static int query_by_row_perf_iters() { return n; } -static int compute_offset_with_env(int num_rows, int default_offset) { +MAYBE_UNUSED static int compute_offset_with_env(int num_rows, + int default_offset) { int offset = default_offset; int abs = 0; if (get_env_int("QUERY_BY_ROW_PERF_OFFSET", abs)) { diff --git a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc index e115552ec..6e2da1c40 100644 --- a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc +++ b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc @@ -133,6 +133,25 @@ class TsFileTableReaderBatchTest : public ::testing::Test { column_categories); } + static TableSchema* gen_table_schema_with_string_field() { + std::vector measurement_schemas; + std::vector column_categories; + measurement_schemas.emplace_back( + new MeasurementSchema("id0", TSDataType::STRING, TSEncoding::PLAIN, + CompressionType::UNCOMPRESSED)); + column_categories.emplace_back(ColumnCategory::TAG); + measurement_schemas.emplace_back(new MeasurementSchema( + "s_text", TSDataType::STRING, TSEncoding::PLAIN, + CompressionType::UNCOMPRESSED)); + column_categories.emplace_back(ColumnCategory::FIELD); + measurement_schemas.emplace_back( + new MeasurementSchema("s_num", TSDataType::INT64, TSEncoding::PLAIN, + CompressionType::UNCOMPRESSED)); + column_categories.emplace_back(ColumnCategory::FIELD); + return new TableSchema("testTableString", measurement_schemas, + column_categories); + } + static storage::Tablet gen_tablet(TableSchema* table_schema, int offset, int device_num, int num_timestamp_per_device = 10) { @@ -171,6 +190,121 @@ class TsFileTableReaderBatchTest : public ::testing::Test { delete[] literal; return tablet; } + + static storage::Tablet gen_tablet_with_string_field( + TableSchema* table_schema, int num_rows) { + storage::Tablet tablet(table_schema->get_table_name(), + table_schema->get_measurement_names(), + table_schema->get_data_types(), + table_schema->get_column_categories(), num_rows); + for (int i = 0; i < num_rows; i++) { + tablet.add_timestamp(i, i); + tablet.add_value(i, "id0", "device_a"); + tablet.add_value(i, "s_text", "value_" + std::to_string(i)); + tablet.add_value(i, "s_num", static_cast(i * 10)); + } + return tablet; + } + + std::vector query_timestamps_in_batches(TableSchema* table_schema, + int64_t start_time, + int64_t end_time, + int batch_size) { + storage::TsFileReader reader; + int ret = reader.open(file_name_); + EXPECT_EQ(ret, common::E_OK); + + ResultSet* tmp_result_set = nullptr; + ret = reader.query(table_schema->get_table_name(), + table_schema->get_measurement_names(), start_time, + end_time, tmp_result_set, batch_size); + EXPECT_EQ(ret, common::E_OK); + EXPECT_NE(tmp_result_set, nullptr); + + auto* table_result_set = dynamic_cast(tmp_result_set); + EXPECT_NE(table_result_set, nullptr); + + std::vector timestamps; + common::TsBlock* block = nullptr; + while ((ret = table_result_set->get_next_tsblock(block)) == + common::E_OK) { + if (block == nullptr) { + ADD_FAILURE() << "Expected non-null TsBlock"; + break; + } + common::RowIterator row_iterator(block); + while (row_iterator.has_next()) { + uint32_t len = 0; + bool null = false; + int64_t timestamp = *reinterpret_cast( + row_iterator.read(0, &len, &null)); + EXPECT_FALSE(null); + timestamps.push_back(timestamp); + + for (uint32_t col_idx = 1; + col_idx < row_iterator.get_column_count(); ++col_idx) { + const char* value = row_iterator.read(col_idx, &len, &null); + EXPECT_FALSE(null); + if (row_iterator.get_data_type(col_idx) == + TSDataType::INT64) { + int64_t int_val = + *reinterpret_cast(value); + EXPECT_EQ(int_val, 0); + } + } + row_iterator.next(); + } + } + + reader.destroy_query_data_set(table_result_set); + EXPECT_EQ(reader.close(), common::E_OK); + return timestamps; + } + + std::vector> query_string_field_in_batches( + TableSchema* table_schema, int64_t start_time, int64_t end_time, + int batch_size) { + storage::TsFileReader reader; + int ret = reader.open(file_name_); + EXPECT_EQ(ret, common::E_OK); + + ResultSet* tmp_result_set = nullptr; + ret = reader.query(table_schema->get_table_name(), + table_schema->get_measurement_names(), start_time, + end_time, tmp_result_set, batch_size); + EXPECT_EQ(ret, common::E_OK); + EXPECT_NE(tmp_result_set, nullptr); + + auto* table_result_set = dynamic_cast(tmp_result_set); + EXPECT_NE(table_result_set, nullptr); + + std::vector> result; + common::TsBlock* block = nullptr; + while ((ret = table_result_set->get_next_tsblock(block)) == + common::E_OK) { + if (block == nullptr) { + ADD_FAILURE() << "Expected non-null TsBlock"; + break; + } + common::RowIterator row_iterator(block); + while (row_iterator.has_next()) { + uint32_t len = 0; + bool null = false; + int64_t timestamp = *reinterpret_cast( + row_iterator.read(0, &len, &null)); + EXPECT_FALSE(null); + + const char* value = row_iterator.read(2, &len, &null); + EXPECT_FALSE(null); + result.emplace_back(timestamp, std::string(value, len)); + row_iterator.next(); + } + } + + reader.destroy_query_data_set(table_result_set); + EXPECT_EQ(reader.close(), common::E_OK); + return result; + } }; TEST_F(TsFileTableReaderBatchTest, BatchQueryWithSmallBatchSize) { @@ -361,6 +495,89 @@ TEST_F(TsFileTableReaderBatchTest, BatchQueryVerifyDataCorrectness) { delete table_schema; } +TEST_F(TsFileTableReaderBatchTest, + BatchQueryKeepsStateAcrossTsBlocksWithinPage) { + auto table_schema = gen_table_schema(); + auto tsfile_table_writer_ = + std::make_shared(&write_file_, table_schema); + + const int prev_page_point_num = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 128; + + const int device_num = 1; + const int points_per_device = 35; + auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device); + ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + + const int batch_size = 8; + std::vector timestamps = query_timestamps_in_batches( + table_schema, 0, 1000000000000LL, batch_size); + + ASSERT_EQ(timestamps.size(), static_cast(points_per_device)); + for (int64_t i = 0; i < points_per_device; ++i) { + EXPECT_EQ(timestamps[i], i); + } + + g_config_value_.page_writer_max_point_num_ = prev_page_point_num; + delete table_schema; +} + +TEST_F(TsFileTableReaderBatchTest, BatchQueryTimeFilterAcrossBoundaryPages) { + auto table_schema = gen_table_schema(); + auto tsfile_table_writer_ = + std::make_shared(&write_file_, table_schema); + + const int prev_page_point_num = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 8; + + const int device_num = 1; + const int points_per_device = 25; + auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device); + ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + + const int batch_size = 4; + std::vector timestamps = + query_timestamps_in_batches(table_schema, 5, 18, batch_size); + + ASSERT_EQ(timestamps.size(), static_cast(14)); + for (int64_t i = 0; i < 14; ++i) { + EXPECT_EQ(timestamps[i], i + 5); + } + + g_config_value_.page_writer_max_point_num_ = prev_page_point_num; + delete table_schema; +} + +TEST_F(TsFileTableReaderBatchTest, + BatchQueryVariableLengthFieldAcrossTsBlocks) { + auto table_schema = gen_table_schema_with_string_field(); + auto tsfile_table_writer_ = + std::make_shared(&write_file_, table_schema); + + const int prev_page_point_num = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 8; + + const int num_rows = 23; + auto tablet = gen_tablet_with_string_field(table_schema, num_rows); + ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + + auto result = query_string_field_in_batches(table_schema, 0, INT64_MAX, 5); + ASSERT_EQ(result.size(), static_cast(num_rows)); + for (int i = 0; i < num_rows; ++i) { + EXPECT_EQ(result[i].first, i); + EXPECT_EQ(result[i].second, "value_" + std::to_string(i)); + } + + g_config_value_.page_writer_max_point_num_ = prev_page_point_num; + delete table_schema; +} + TEST_F(TsFileTableReaderBatchTest, PerformanceComparisonSinglePointVsBatch) { // Create table schema without tags (only fields) auto table_schema = gen_table_schema_no_tag(); diff --git a/cpp/test/reader/table_view/tsfile_reader_table_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_test.cc index e55f34c2a..be0a6f64c 100644 --- a/cpp/test/reader/table_view/tsfile_reader_table_test.cc +++ b/cpp/test/reader/table_view/tsfile_reader_table_test.cc @@ -209,6 +209,43 @@ class TsFileTableReaderTest : public ::testing::Test { TEST_F(TsFileTableReaderTest, TableModelQuery) { test_table_model_query(); } +// Regression: single_device_tsblock_reader used to initialise all_outside +// to true, then bail out when the per-device chunk-list loop didn't +// execute (e.g. time-only query where time_series_indexs is empty). The +// result was an empty resultset whenever a time filter was present, even +// though there might be rows that satisfy it. Verify that querying only +// the time column with a tight filter still returns the matching rows. +TEST_F(TsFileTableReaderTest, TimeOnlyQueryWithTimeFilterStillReturnsRows) { + auto table_schema = gen_table_schema(0); + auto tsfile_table_writer_ = + std::make_shared(&write_file_, table_schema); + auto tablet = gen_tablet(table_schema, /*start_ts=*/0, /*device_num=*/1, + /*per_device=*/10); + ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), common::E_OK); + ResultSet* tmp = nullptr; + // Query with an empty measurement list and a time window covering all + // 10 timestamps. Under the bug this returned 0 rows. + std::vector empty_cols; + ASSERT_EQ(reader.query(table_schema->get_table_name(), empty_cols, + /*start_time=*/0, /*end_time=*/9, tmp), + common::E_OK); + auto* rs = (TableResultSet*)tmp; + int rows = 0; + bool hn = false; + while (IS_SUCC(rs->next(hn)) && hn) { + rows++; + } + EXPECT_EQ(rows, 10); + reader.destroy_query_data_set(rs); + ASSERT_EQ(reader.close(), common::E_OK); + delete table_schema; +} + TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) { int prev_config = g_config_value_.page_writer_max_point_num_; g_config_value_.page_writer_max_point_num_ = 5; @@ -216,11 +253,13 @@ TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) { g_config_value_.page_writer_max_point_num_ = prev_config; } -// Triggers memory-based seal in aligned table: time page seals by size while -// value pages may not; ensure value pages are sealed together with time (no -// time-page-sealed / value-page-not-sealed inconsistency). -// Use 512 bytes so time seals by size before point count; 128 was too small -// and could produce misaligned time/value pages on some encodings. +TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) { + int prev_config = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 10000; + test_table_model_query(g_config_value_.page_writer_max_point_num_); + g_config_value_.page_writer_max_point_num_ = prev_config; +} + TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) { uint32_t prev_point_num = g_config_value_.page_writer_max_point_num_; uint32_t prev_mem_bytes = g_config_value_.page_writer_max_memory_bytes_; @@ -231,13 +270,6 @@ TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) { g_config_value_.page_writer_max_memory_bytes_ = prev_mem_bytes; } -TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) { - int prev_config = g_config_value_.page_writer_max_point_num_; - g_config_value_.page_writer_max_point_num_ = 10000; - test_table_model_query(g_config_value_.page_writer_max_point_num_); - g_config_value_.page_writer_max_point_num_ = prev_config; -} - TEST_F(TsFileTableReaderTest, TableModelQueryMultiLargePage) { int prev_config = g_config_value_.page_writer_max_point_num_; g_config_value_.page_writer_max_point_num_ = 10000; @@ -1221,4 +1253,4 @@ TEST_F(TsFileTableReaderTest, MultiTagColumnFilterOnSecondTag) { ASSERT_EQ(reader.close(), common::E_OK); delete table_schema; delete tag_filter; -} \ No newline at end of file +} diff --git a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc index 026f75b2d..9e3d9b562 100644 --- a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc +++ b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc @@ -27,7 +27,6 @@ #include "common/schema.h" #include "common/tablet.h" #include "file/write_file.h" -#include "reader/filter/tag_filter.h" #include "reader/table_result_set.h" #include "reader/tsfile_reader.h" #include "writer/tsfile_table_writer.h" @@ -103,6 +102,41 @@ class TableQueryByRowTest : public ::testing::Test { delete schema; } + void write_single_device_file_with_string_field(int num_rows) { + std::vector col_schemas = { + ColumnSchema("id1", TSDataType::STRING, + CompressionType::UNCOMPRESSED, TSEncoding::PLAIN, + ColumnCategory::TAG), + ColumnSchema("s_text", TSDataType::STRING, + CompressionType::UNCOMPRESSED, TSEncoding::PLAIN, + ColumnCategory::FIELD), + ColumnSchema("s_num", TSDataType::INT64, + CompressionType::UNCOMPRESSED, TSEncoding::PLAIN, + ColumnCategory::FIELD), + }; + auto* schema = new TableSchema("t_string", col_schemas); + auto* writer = new TsFileTableWriter(&write_file_, schema); + + Tablet tablet( + "t_string", {"id1", "s_text", "s_num"}, + {TSDataType::STRING, TSDataType::STRING, TSDataType::INT64}, + {ColumnCategory::TAG, ColumnCategory::FIELD, ColumnCategory::FIELD}, + num_rows); + + for (int i = 0; i < num_rows; i++) { + tablet.add_timestamp(i, static_cast(i)); + tablet.add_value(i, "id1", "device_a"); + tablet.add_value(i, "s_text", "value_" + std::to_string(i)); + tablet.add_value(i, "s_num", static_cast(i * 10)); + } + + ASSERT_EQ(writer->write_table(tablet), E_OK); + ASSERT_EQ(writer->flush(), E_OK); + ASSERT_EQ(writer->close(), E_OK); + delete writer; + delete schema; + } + void write_multi_device_file(int rows_per_device, int device_count) { std::vector col_schemas = { ColumnSchema("id1", TSDataType::STRING, @@ -341,6 +375,29 @@ class TableQueryByRowTest : public ::testing::Test { return manual; } + std::vector> query_by_row_time_and_text( + const std::string& table_name, const std::vector& cols, + int offset, int limit) { + TsFileReader reader; + EXPECT_EQ(reader.open(file_name_), E_OK); + ResultSet* rs = nullptr; + EXPECT_EQ(reader.queryByRow(table_name, cols, offset, limit, rs), E_OK); + EXPECT_NE(rs, nullptr); + + std::vector> result; + bool has_next = false; + while (IS_SUCC(rs->next(has_next)) && has_next) { + int64_t time = rs->get_value("time"); + common::String* text_val = rs->get_value("s_text"); + result.emplace_back(time, + std::string(text_val->buf_, text_val->len_)); + } + + reader.destroy_query_data_set(rs); + reader.close(); + return result; + } + std::string file_name_; WriteFile write_file_; }; @@ -356,6 +413,23 @@ TEST_F(TableQueryByRowTest, NoOffsetNoLimit) { ASSERT_EQ(result, all); } +TEST_F(TableQueryByRowTest, NoOffsetNoLimitWithSmallPages) { + int prev_page_config = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 8; + + int num_rows = 25; + write_single_device_file(num_rows); + + auto result = query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, 0, -1); + ASSERT_EQ(result.size(), static_cast(num_rows)); + for (int i = 0; i < num_rows; ++i) { + EXPECT_EQ(result[i].first, i); + EXPECT_EQ(result[i].second, i * 10); + } + + g_config_value_.page_writer_max_point_num_ = prev_page_config; +} + // Offset only: skip first N rows, return the rest; limit=-1 means no cap. TEST_F(TableQueryByRowTest, OffsetOnly) { int num_rows = 50; @@ -399,6 +473,43 @@ TEST_F(TableQueryByRowTest, OffsetAndLimit) { } } +TEST_F(TableQueryByRowTest, OffsetAndLimitWithSmallPages) { + int prev_page_config = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 8; + + int num_rows = 40; + write_single_device_file(num_rows); + + int offset = 7; + int limit = 19; + auto by_row = + query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit); + auto manual = + query_manual_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit); + + ASSERT_EQ(by_row, manual); + + g_config_value_.page_writer_max_point_num_ = prev_page_config; +} + +TEST_F(TableQueryByRowTest, VariableLengthFieldWithSmallPages) { + int prev_page_config = g_config_value_.page_writer_max_point_num_; + g_config_value_.page_writer_max_point_num_ = 8; + + int num_rows = 21; + write_single_device_file_with_string_field(num_rows); + + auto result = query_by_row_time_and_text("t_string", + {"id1", "s_text", "s_num"}, 0, -1); + ASSERT_EQ(result.size(), static_cast(num_rows)); + for (int i = 0; i < num_rows; ++i) { + EXPECT_EQ(result[i].first, i); + EXPECT_EQ(result[i].second, "value_" + std::to_string(i)); + } + + g_config_value_.page_writer_max_point_num_ = prev_page_config; +} + // Offset beyond total row count: returns empty result. TEST_F(TableQueryByRowTest, OffsetBeyondData) { int num_rows = 30; @@ -652,15 +763,16 @@ TEST_F(TableQueryByRowTest, DenseSingleDeviceSsiLevelPushdown) { // Pushdown is faster than full query + manual next: queryByRow(offset, limit) // skips at device/SSI/Chunk level; old query then manual next decodes every -// row. Timing tolerance 20% to allow measurement noise. +// row. Timing tolerance 5% to allow measurement noise. TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) { - const int num_rows = 8000; - const int offset = 3000; + const int num_rows = 80000; + const int offset = 30000; const int limit = 1000; write_single_device_file(num_rows); const int num_iters = 5; - const double tolerance = 0.2; + const double tolerance = + 0.5; // 50% tolerance for cross-platform timing noise auto run_query_by_row = [this, offset, limit]() { TsFileReader reader; @@ -725,47 +837,3 @@ TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) { "(min_by_row=" << min_by_row << " ms, min_manual=" << min_manual << " ms)"; } - -// queryByRow with tag filter: only rows matching the tag predicate are -// returned. -TEST_F(TableQueryByRowTest, TagFilterEq) { - int rows_per_device = 20; - int device_count = 3; - write_multi_device_file(rows_per_device, device_count); - - // Reconstruct the same schema used by write_multi_device_file. - std::vector col_schemas = { - ColumnSchema("id1", TSDataType::STRING, CompressionType::UNCOMPRESSED, - TSEncoding::PLAIN, ColumnCategory::TAG), - ColumnSchema("s1", TSDataType::INT64, CompressionType::UNCOMPRESSED, - TSEncoding::PLAIN, ColumnCategory::FIELD), - }; - TableSchema schema("t1", col_schemas); - - // Build tag filter: id1 == "dev1" - TagFilterBuilder builder(&schema); - Filter* tag_filter = builder.eq("id1", "dev1"); - - TsFileReader reader; - ASSERT_EQ(reader.open(file_name_), E_OK); - - ResultSet* rs = nullptr; - ASSERT_EQ(reader.queryByRow("t1", {"id1", "s1"}, 0, -1, rs, tag_filter), - E_OK); - ASSERT_NE(rs, nullptr); - - std::vector filtered_s1; - bool has_next = false; - while (IS_SUCC(rs->next(has_next)) && has_next) { - filtered_s1.push_back(rs->get_value("s1")); - } - reader.destroy_query_data_set(rs); - reader.close(); - delete tag_filter; - - // dev1 has rows_per_device rows with s1 = 1*1000+t for t in [0,20). - ASSERT_EQ(filtered_s1.size(), static_cast(rows_per_device)); - for (int t = 0; t < rows_per_device; t++) { - EXPECT_EQ(filtered_s1[t], static_cast(1 * 1000 + t)); - } -} diff --git a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc index 8181b6130..e4daed748 100644 --- a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc +++ b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc @@ -509,3 +509,48 @@ TEST_F(TsFileTreeReaderTest, QueryTableOnTreeMissingMeasurement) { } reader.close(); } + +// Regression: query_table_on_tree with an inverted time range (start > end) on +// a non-aligned tree device must yield zero rows, not E_NOT_SUPPORT. The chunk +// time span straddles both bounds and single-chunk timeseries carry no +// per-chunk statistic, so the device-level early-skip does NOT short-circuit; +// the empty value-column result previously fell through to the time-only +// fallback -> alloc_multi_ssi() (aligned-only) -> E_NOT_SUPPORT. +TEST_F(TsFileTreeReaderTest, QueryTableOnTreeInvertedTimeRange) { + std::string device_id = "root.Device1"; + std::vector measurement_ids = {"m1", "m2", "m3"}; + { + TsFileTreeWriter writer(&write_file_); + for (auto const& m : measurement_ids) { + auto* schema = new storage::MeasurementSchema(m, TSDataType::INT32); + ASSERT_EQ(E_OK, writer.register_timeseries(device_id, schema)); + delete schema; + } + for (int i = 0; i < 100; i++) { + TsRecord record(device_id, static_cast(i - 50)); + for (auto const& m : measurement_ids) { + record.add_point(m, static_cast(i)); + } + ASSERT_EQ(E_OK, writer.write(record)); + } + writer.flush(); + writer.close(); + } + + TsFileReader reader; + ASSERT_EQ(E_OK, reader.open(file_name_)); + ResultSet* result = nullptr; + int ret = reader.query_table_on_tree(measurement_ids, 10, -10, result); + ASSERT_EQ(E_OK, ret); + auto* trs = (storage::TableResultSet*)result; + bool has_next = false; + int row_cnt = 0; + int next_ret = E_OK; + while (IS_SUCC(next_ret = trs->next(has_next)) && has_next) { + row_cnt++; + } + EXPECT_EQ(E_OK, next_ret); + EXPECT_EQ(0, row_cnt); + reader.destroy_query_data_set(result); + reader.close(); +} diff --git a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc index a686b8998..9c47a9d4d 100644 --- a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc +++ b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc @@ -16,7 +16,6 @@ * specific language governing permissions and limitations * under the License. */ -#include #include #include @@ -25,12 +24,10 @@ #include "common/global.h" #include "common/record.h" #include "common/schema.h" -#include "common/tablet.h" #include "file/write_file.h" #include "reader/tsfile_reader.h" #include "reader/tsfile_tree_reader.h" #include "writer/tsfile_tree_writer.h" -#include "writer/tsfile_writer.h" using namespace storage; using namespace common; @@ -210,6 +207,90 @@ class TreeQueryByRowTest : public ::testing::Test { WriteFile write_file_; }; +// Regression: aligned value chunks store statistic_->count_ as the +// non-null row count, not the total row count. Whole-chunk offset skip +// used to apply value_cm's count, so a sparse aligned chunk with 100 rows +// and 10 non-nulls would jump over all 100 rows on offset=10 — leaving +// the next chunks completely unread. The fix only takes the whole-chunk +// shortcut when time and value statistics agree on the row count, falling +// through to per-row offset handling otherwise. +TEST_F(TreeQueryByRowTest, SparseAlignedChunkOffsetCrossesChunks) { + using namespace storage; + libtsfile_destroy(); + libtsfile_init(); + remove(file_name_.c_str()); + + // Tighten per-chunk capacity so two write_tablet_aligned calls produce + // two distinct aligned chunks (rather than being merged into one). + uint32_t prev_chunk_thresh = g_config_value_.chunk_group_size_threshold_; + g_config_value_.chunk_group_size_threshold_ = 64; + int64_t prev_record_check = + g_config_value_.record_count_for_next_mem_check_; + g_config_value_.record_count_for_next_mem_check_ = 1; + + { + TsFileWriter writer; + int flags = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + flags |= O_BINARY; +#endif + ASSERT_EQ(writer.open(file_name_, flags, 0666), E_OK); + const std::string device = "sparse_dev"; + std::vector reg; + reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED)); + writer.register_aligned_timeseries(device, reg); + + // First aligned chunk: 20 timestamps but only every 4th row has a + // non-null value column (5 non-nulls). Flush. + for (int i = 0; i < 20; i++) { + TsRecord r(static_cast(i), device); + DataPoint p("v0"); + if (i % 4 == 0) p.set_i64(static_cast(i)); + r.points_.push_back(p); + ASSERT_EQ(writer.write_record_aligned(r), E_OK); + } + ASSERT_EQ(writer.flush(), E_OK); + + // Second aligned chunk: 20 more timestamps, every value non-null + // (all 20 non-nulls). + for (int i = 20; i < 40; i++) { + TsRecord r(static_cast(i), device); + DataPoint p("v0"); + p.set_i64(static_cast(i)); + r.points_.push_back(p); + ASSERT_EQ(writer.write_record_aligned(r), E_OK); + } + ASSERT_EQ(writer.flush(), E_OK); + ASSERT_EQ(writer.close(), E_OK); + } + g_config_value_.chunk_group_size_threshold_ = prev_chunk_thresh; + g_config_value_.record_count_for_next_mem_check_ = prev_record_check; + + // Query with offset=10 — enough to fully cover the first chunk's 5 + // non-null statistic-reported rows, but NOT enough to cover the + // chunk's 20 actual rows. Under the bug the entire first chunk was + // skipped, and offset_=10-5=5 would land 5 rows into the second + // chunk, returning rows 25..39 (15 rows). With the fix the first + // chunk is decoded, 10 rows are eaten, leaving rows 10..39 (30 rows). + TsFileTreeReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + std::vector devices = {"sparse_dev"}; + std::vector measurements = {"v0"}; + ResultSet* result = nullptr; + ASSERT_EQ(reader.queryByRow(devices, measurements, 10, -1, result), E_OK); + ASSERT_NE(result, nullptr); + + auto timestamps = collect_timestamps(result); + EXPECT_EQ(timestamps.size(), static_cast(30)); + if (timestamps.size() == 30) { + for (size_t i = 0; i < timestamps.size(); i++) { + EXPECT_EQ(timestamps[i], static_cast(i + 10)); + } + } + reader.destroy_query_data_set(result); + reader.close(); +} + // Basic test: queryByRow returns correct total count with no offset/limit. TEST_F(TreeQueryByRowTest, NoOffsetNoLimit) { std::vector devices = {"d1"}; @@ -1310,7 +1391,8 @@ TEST_F(TreeQueryByRowTest, MultiPath_TimeHint_SkipsStaleChunk_WithOffset) { // Pushdown is faster than full query + manual next: queryByRow(offset, limit) // skips at Chunk/Page level; old query then manual next decodes every row. -// Timing tolerance 20% to allow measurement noise. +// Use the same 50% tolerance as the table-view sibling test for cross-platform +// timing noise; the test is DISABLED_ and intended for manual runs. TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) { std::vector devices = {"d1"}; std::vector measurements = {"s1"}; @@ -1320,7 +1402,8 @@ TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) { write_test_file(devices, measurements, num_rows); const int num_iters = 5; - const double tolerance = 0.2; + const double tolerance = + 0.5; // 50% tolerance for cross-platform timing noise auto run_query_by_row = [this, &devices, &measurements, offset, limit]() { TsFileTreeReader reader; diff --git a/cpp/test/reader/tsfile_reader_test.cc b/cpp/test/reader/tsfile_reader_test.cc index 08cda6e31..5f50724c4 100644 --- a/cpp/test/reader/tsfile_reader_test.cc +++ b/cpp/test/reader/tsfile_reader_test.cc @@ -29,9 +29,14 @@ #include "common/record.h" #include "common/schema.h" #include "common/tablet.h" +#include "common/tsblock/tsblock.h" +#include "file/tsfile_io_reader.h" #include "file/tsfile_io_writer.h" #include "file/write_file.h" +#include "reader/block/single_device_tsblock_reader.h" +#include "reader/filter/time_operator.h" #include "reader/qds_without_timegenerator.h" +#include "reader/tsfile_series_scan_iterator.h" #include "writer/tsfile_writer.h" using namespace storage; @@ -395,3 +400,596 @@ TEST_F(TsFileReaderTest, GetTimeseriesMetadataTableModelTypeAndDeviceFilter) { reader.close(); } + +static const int64_t kLargeFileNumRecords = 300000000; +static const int64_t kLargeFileFlushBatch = 100000; + +TEST_F(TsFileReaderTest, + DISABLED_LargeFileNoEncodingNoCompression_WriteAndRead) { + std::string device_path = "device1"; + std::string measurement_name = "temperature"; + common::TSDataType data_type = common::TSDataType::INT64; + common::TSEncoding encoding = common::TSEncoding::PLAIN; + common::CompressionType compression_type = + common::CompressionType::UNCOMPRESSED; + + tsfile_writer_->register_timeseries( + device_path, storage::MeasurementSchema(measurement_name, data_type, + encoding, compression_type)); + + const int64_t start_time = 1622505600000LL; + for (int64_t i = 0; i < kLargeFileNumRecords; ++i) { + TsRecord record(start_time + i * 1000, device_path); + record.add_point(measurement_name, static_cast(i)); + ASSERT_EQ(tsfile_writer_->write_record(record), E_OK); + if ((i + 1) % kLargeFileFlushBatch == 0) { + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + } + } + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + std::vector select_list = {"device1.temperature"}; + const int64_t end_time = start_time + (kLargeFileNumRecords - 1) * 1000 + 1; + + storage::TsFileReader reader; + int ret = reader.open(file_name_); + ASSERT_EQ(ret, common::E_OK); + + storage::ResultSet* tmp_qds = nullptr; + ret = reader.query(select_list, start_time, end_time, tmp_qds); + ASSERT_EQ(ret, common::E_OK); + ASSERT_NE(tmp_qds, nullptr); + + auto* qds = static_cast(tmp_qds); + std::shared_ptr meta = qds->get_metadata(); + ASSERT_NE(meta, nullptr); + ASSERT_EQ(meta->get_column_type(1), INT64); + ASSERT_EQ(meta->get_column_type(2), INT64); + + int64_t row_count = 0; + bool has_next = false; + + while (true) { + ret = qds->next(has_next); + ASSERT_EQ(ret, common::E_OK); + if (!has_next) break; + row_count++; + } + + ASSERT_EQ(row_count, kLargeFileNumRecords); + + reader.destroy_query_data_set(qds); + reader.close(); +} + +// Multi-value aligned chunk reader doesn't honour row_offset / row_limit / +// min_time_hint pushdown — silently dropping those args would hand the caller +// full-chunk data when it asked for a sub-range. The guard at the top of +// AlignedChunkReader::get_next_page must turn the unsupported combination +// into an explicit E_NOT_SUPPORT. +TEST_F(TsFileReaderTest, MultiValueAlignedRowOffsetReturnsNotSupport) { + const std::string device = "root.dev_multi_offset"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + const int N = 32; + Tablet tablet(device, + std::make_shared>(schema_vec), + N); + for (int i = 0; i < N; ++i) { + ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i * 2)), E_OK); + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + storage::TsFileIOReader io_reader; + ASSERT_EQ(io_reader.init(file_name_), E_OK); + + auto device_id = std::make_shared(device); + std::vector measurements = {"v0", "v1"}; + storage::TsFileSeriesScanIterator* ssi = nullptr; + common::PageArena pa; + pa.init(512, common::MOD_DEFAULT); + ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa, + /*time_filter=*/nullptr), + E_OK); + ASSERT_NE(ssi, nullptr); + + // row_offset > 0 hits the multi-value guard at the top of + // AlignedChunkReader::get_next_page; the SSI propagates the error code. + ssi->set_row_range(/*offset=*/5, /*limit=*/-1); + common::TsBlock* block = nullptr; + EXPECT_EQ(ssi->get_next(block, /*alloc_tsblock=*/true), + common::E_NOT_SUPPORT); + + if (block != nullptr) { + ssi->revert_tsblock(); + } + io_reader.revert_ssi(ssi); + // RAII handles io_reader teardown — explicit reset() would destroy the + // tsfile_meta page arena while tsfile_meta_ still holds shared_ptrs into + // it, then ~TsFileMeta would call self_deleter on freed memory. +} + +namespace storage { +// Subclass that lets the test (a) inject an error from the next-tsblock load +// and (b) wire a manually constructed TsBlock into the inherited iterator +// fields, so we can exercise the end-of-block branch of skip_rows() +// deterministically. The base destructor calls revert_ssi(nullptr), which +// short-circuits safely; we hand it a default-constructed (never-init'd) +// TsFileIOReader purely to satisfy the constructor. +class FaultySingleMeasurementColumnContext + : public SingleMeasurementColumnContext { + public: + using SingleMeasurementColumnContext::SingleMeasurementColumnContext; + int get_next_tsblock_ret_ = common::E_OK; + int get_next_tsblock_calls_ = 0; + int get_next_tsblock(bool /*alloc_mem*/) override { + ++get_next_tsblock_calls_; + return get_next_tsblock_ret_; + } + void prime_iters_for_block(common::TsBlock* tsb) { + tsblock_ = tsb; + time_iter_ = new common::ColIterator(0, tsb); + value_iter_ = new common::ColIterator(1, tsb); + } +}; +} // namespace storage + +// Regression: skip_rows() used to be a void method that called +// get_next_tsblock(false) for its side effects when the current block ran +// out. An IO/decode error from that call was silently swallowed and the +// outer reader treated the source as exhausted, returning fewer rows than +// requested with no error indication. skip_rows() now returns int and must +// surface hard errors (E_NO_MORE_DATA is the legitimate EOF and stays +// suppressed). +TEST_F(TsFileReaderTest, + SingleMeasurementSkipRowsPropagatesGetNextTsBlockError) { + common::TupleDesc desc; + desc.push_back(common::ColumnSchema("time", common::INT64, + common::UNCOMPRESSED, common::PLAIN)); + desc.push_back(common::ColumnSchema("v0", common::INT64, + common::UNCOMPRESSED, common::PLAIN)); + common::TsBlock tsb(&desc, 4); + ASSERT_EQ(tsb.init(), common::E_OK); + common::RowAppender ra(&tsb); + for (int i = 0; i < 2; i++) { + ASSERT_TRUE(ra.add_row()); + int64_t t = 1000 + i; + int64_t v = i; + ra.append(0, reinterpret_cast(&t), sizeof(int64_t)); + ra.append(1, reinterpret_cast(&v), sizeof(int64_t)); + } + + storage::TsFileIOReader io_reader_stub; + storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub); + ctx.prime_iters_for_block(&tsb); + + // Hard error: skip_rows must propagate. + ctx.get_next_tsblock_ret_ = common::E_INVALID_ARG; + EXPECT_EQ(ctx.skip_rows(2), common::E_INVALID_ARG); + EXPECT_EQ(ctx.get_next_tsblock_calls_, 1); +} + +TEST_F(TsFileReaderTest, SingleMeasurementSkipRowsSwallowsEndOfStream) { + common::TupleDesc desc; + desc.push_back(common::ColumnSchema("time", common::INT64, + common::UNCOMPRESSED, common::PLAIN)); + desc.push_back(common::ColumnSchema("v0", common::INT64, + common::UNCOMPRESSED, common::PLAIN)); + common::TsBlock tsb(&desc, 4); + ASSERT_EQ(tsb.init(), common::E_OK); + common::RowAppender ra(&tsb); + for (int i = 0; i < 2; i++) { + ASSERT_TRUE(ra.add_row()); + int64_t t = 1000 + i; + int64_t v = i; + ra.append(0, reinterpret_cast(&t), sizeof(int64_t)); + ra.append(1, reinterpret_cast(&v), sizeof(int64_t)); + } + + storage::TsFileIOReader io_reader_stub; + storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub); + ctx.prime_iters_for_block(&tsb); + + // EOF: skip_rows must squash to E_OK so the outer loop notices via + // available_rows() instead of bubbling the EOF up as a query failure. + ctx.get_next_tsblock_ret_ = common::E_NO_MORE_DATA; + EXPECT_EQ(ctx.skip_rows(2), common::E_OK); + EXPECT_EQ(ctx.get_next_tsblock_calls_, 1); +} + +// Regression: the multi-value aligned batch loop required the destination +// TsBlock to have >= BATCH (=129) rows of free capacity, otherwise it +// returned E_OVERFLOW immediately and the SSI surfaced that error to the +// caller. When tsblock_max_memory_ is small enough to land max_row_count_ +// below 129 (e.g. very small per-block memory in low-RAM configs) no rows +// could ever be decoded. The fix caps the batch by remaining capacity, +// matching ChunkReader's per-type batch loops. +TEST_F(TsFileReaderTest, MultiValueAlignedProgressesWithSmallTsBlock) { + const std::string device = "root.dev_multi_small_block"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + const int N = 200; // > BATCH (129) so the batch loop iterates twice + Tablet tablet(device, + std::make_shared>(schema_vec), + N); + for (int i = 0; i < N; ++i) { + ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i * 2)), E_OK); + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + // Force max_row_count_ below BATCH: ~2 KB / 24 B per row → ~85 rows. + // Also force the multi_DECODE_TV_BATCH path by disabling parallel reads: + // with a thread pool the chunk-level pre-decode shortcut would otherwise + // run for any multi-column query (no upper column-count cutoff anymore). + uint32_t prev_capacity = common::g_config_value_.tsblock_max_memory_; + bool prev_parallel = common::g_config_value_.parallel_read_enabled_; + struct Guard { + uint32_t cap; + bool par; + ~Guard() { + common::g_config_value_.tsblock_max_memory_ = cap; + common::g_config_value_.parallel_read_enabled_ = par; + } + } guard{prev_capacity, prev_parallel}; + common::g_config_value_.tsblock_max_memory_ = 2048; + common::g_config_value_.parallel_read_enabled_ = false; + + storage::TsFileIOReader io_reader; + ASSERT_EQ(io_reader.init(file_name_), E_OK); + + auto device_id = std::make_shared(device); + std::vector measurements = {"v0", "v1"}; + storage::TsFileSeriesScanIterator* ssi = nullptr; + common::PageArena pa; + pa.init(512, common::MOD_TSFILE_READER); + ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa, + /*time_filter=*/nullptr), + E_OK); + ASSERT_NE(ssi, nullptr); + + int collected = 0; + while (true) { + common::TsBlock* block = nullptr; + int ret = ssi->get_next(block, /*alloc_tsblock=*/true); + if (ret == common::E_NO_MORE_DATA) break; + ASSERT_EQ(ret, common::E_OK); + ASSERT_NE(block, nullptr); + ASSERT_GT(block->get_max_row_count(), 0u); + ASSERT_LT(block->get_max_row_count(), 129u); + collected += static_cast(block->get_row_count()); + ssi->revert_tsblock(); + } + EXPECT_EQ(collected, N); + + io_reader.revert_ssi(ssi); +} + +// Regression: when a whole batch is filtered out, multi_DECODE_TV_BATCH skips +// the non-null value bytes for each column. The old code ignored the skip +// return code and the `skipped` count, so a short/truncated page could leave +// the decoder mid-value; subsequent batches would then read garbage bytes as +// values. This test exercises an intact page: the filter rejects rows +// 0..127 (one full batch worth), then the rows after must come back with +// their *correct* values — proving the decoder advanced exactly nonnull_count +// values, not some smaller number that would shift the value alignment. +TEST_F(TsFileReaderTest, MultiValueAlignedSkipsBatchPreservesValueAlignment) { + const std::string device = "root.dev_multi_skip_align"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + // Two batches' worth of rows so the filter skips the first batch entirely + // and decodes the second. + const int N = 200; + Tablet tablet(device, + std::make_shared>(schema_vec), + N); + for (int i = 0; i < N; ++i) { + // Distinctive value pattern: i and 1000000 + i. If skip + // mis-advances the decoder by even one value, the v0/v1 read after + // the skip will land on the wrong row's bytes. + ASSERT_EQ(tablet.add_timestamp(i, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(1000000 + i)), + E_OK); + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + bool prev_parallel = common::g_config_value_.parallel_read_enabled_; + struct Guard { + bool par; + ~Guard() { common::g_config_value_.parallel_read_enabled_ = par; } + } guard{prev_parallel}; + // Force the multi_DECODE_TV_BATCH path (the chunk-level shortcut would + // bypass the skip branch we want to exercise). + common::g_config_value_.parallel_read_enabled_ = false; + + storage::TsFileIOReader io_reader; + ASSERT_EQ(io_reader.init(file_name_), E_OK); + + auto device_id = std::make_shared(device); + std::vector measurements = {"v0", "v1"}; + storage::TsFileSeriesScanIterator* ssi = nullptr; + common::PageArena pa; + pa.init(512, common::MOD_TSFILE_READER); + + // TimeIn filter selecting only rows 130..139 — entirely past the first + // 129-row batch, so the first batch hits the pass_count==0 skip branch + // for both value columns. + std::vector want; + for (int i = 130; i < 140; ++i) want.push_back(i); + storage::TimeIn time_filter(want, /*not_in=*/false); + + ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa, + &time_filter), + E_OK); + ASSERT_NE(ssi, nullptr); + + std::vector> got; + while (true) { + common::TsBlock* block = nullptr; + int ret = ssi->get_next(block, /*alloc_tsblock=*/true, &time_filter); + if (ret == common::E_NO_MORE_DATA) break; + ASSERT_EQ(ret, common::E_OK); + ASSERT_NE(block, nullptr); + // Columns: time, v0, v1. + common::ColIterator t_iter(0, block); + common::ColIterator v0_iter(1, block); + common::ColIterator v1_iter(2, block); + const uint32_t rows = block->get_row_count(); + for (uint32_t r = 0; r < rows; ++r) { + uint32_t len = 0; + int64_t t = *reinterpret_cast(t_iter.read(&len)); + int64_t v0 = *reinterpret_cast(v0_iter.read(&len)); + int64_t v1 = *reinterpret_cast(v1_iter.read(&len)); + got.push_back({t, v0}); + // The decoder must have advanced exactly nonnull_count values + // when it skipped batch #1. If it under-advanced (the latent + // bug), v1 would land on the wrong row's bytes here. + EXPECT_EQ(v1, 1000000 + t); + EXPECT_EQ(v0, t); + t_iter.next(); + v0_iter.next(); + v1_iter.next(); + } + ssi->revert_tsblock(); + } + + ASSERT_EQ(got.size(), want.size()); + for (size_t i = 0; i < got.size(); ++i) { + EXPECT_EQ(got[i].first, want[i]); + EXPECT_EQ(got[i].second, want[i]); + } + + io_reader.revert_ssi(ssi); +} + +// Coverage: an aligned read with > 6 value columns now takes the chunk-level +// parallel decode path (decode_all_planned_pages) exactly like the 2..6 column +// case — the old "<= 6 columns" dispatch cutoff that sent wide chunks down the +// per-page serial path is gone. With libtsfile_init() having built the global +// pool and parallel_read_enabled_ on by default, an 8-column query exercises +// that path end-to-end; each column carries a disjoint value range so any +// cross-column misalignment in the wide chunk-level decode would be caught. +TEST_F(TsFileReaderTest, MultiValueAlignedWideChunkParallelDecode) { + const std::string device = "root.dev_multi_wide"; + const uint32_t kCols = 8; // > 6: previously bypassed the chunk-level path + std::vector schema_vec; + for (uint32_t c = 0; c < kCols; ++c) { + schema_vec.emplace_back("v" + std::to_string(c), INT64, PLAIN, + UNCOMPRESSED); + } + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + const int N = 200; // > BATCH (129) so the decode loop iterates more once + Tablet tablet(device, + std::make_shared>(schema_vec), + N); + // Row i, column c carries c * 1000000 + i so each column's values occupy a + // disjoint range; a wide-chunk decode that crossed column boundaries would + // surface as a value landing in the wrong column's range. + for (int i = 0; i < N; ++i) { + ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK); + for (uint32_t c = 0; c < kCols; ++c) { + ASSERT_EQ( + tablet.add_value(i, c, static_cast(c * 1000000 + i)), + E_OK); + } + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + // parallel_read_enabled_ defaults to true and SetUp() ran libtsfile_init(), + // so the SSI hands the AlignedChunkReader the global pool; with 8 value + // columns (> 1) the reader takes the chunk-level decode path. + ASSERT_TRUE(common::g_config_value_.parallel_read_enabled_); + + storage::TsFileIOReader io_reader; + ASSERT_EQ(io_reader.init(file_name_), E_OK); + + auto device_id = std::make_shared(device); + std::vector measurements; + for (uint32_t c = 0; c < kCols; ++c) + measurements.push_back("v" + std::to_string(c)); + storage::TsFileSeriesScanIterator* ssi = nullptr; + common::PageArena pa; + pa.init(512, common::MOD_TSFILE_READER); + ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa, + /*time_filter=*/nullptr), + E_OK); + ASSERT_NE(ssi, nullptr); + + int collected = 0; + while (true) { + common::TsBlock* block = nullptr; + int ret = ssi->get_next(block, /*alloc_tsblock=*/true); + if (ret == common::E_NO_MORE_DATA) break; + ASSERT_EQ(ret, common::E_OK); + ASSERT_NE(block, nullptr); + const uint32_t rows = block->get_row_count(); + + common::ColIterator t_iter(0, block); + std::vector times; + times.reserve(rows); + for (uint32_t r = 0; r < rows; ++r) { + uint32_t len = 0; + times.push_back(*reinterpret_cast(t_iter.read(&len))); + t_iter.next(); + } + // One independent iterator per value column so we never rely on + // vector being movable. + for (uint32_t c = 0; c < kCols; ++c) { + common::ColIterator it(c + 1, block); + for (uint32_t r = 0; r < rows; ++r) { + uint32_t len = 0; + int64_t v = *reinterpret_cast(it.read(&len)); + int64_t i = times[r] - 1000; // timestamp == 1000 + i + EXPECT_EQ(v, static_cast(c) * 1000000 + i); + it.next(); + } + } + collected += static_cast(rows); + ssi->revert_tsblock(); + } + EXPECT_EQ(collected, N); + + io_reader.revert_ssi(ssi); +} + +// Regression: AlignedTimeseriesIndex::get_data_type() returns the time column +// type (VECTOR), which the schema accessor used to surface verbatim — every +// aligned column came back as VECTOR instead of its real INT32/FLOAT/etc. +// type. get_timeseries_schema() now unwraps AlignedTimeseriesIndex to read +// value_ts_idx_->get_data_type() like the develop branch did. +TEST_F(TsFileReaderTest, AlignedSchemaReportsValueDataType) { + const std::string device = "root.dev_aligned_schema"; + std::vector schema_vec; + schema_vec.emplace_back("v_i32", INT32, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v_dbl", DOUBLE, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + const int N = 8; + Tablet tablet(device, + std::make_shared>(schema_vec), + N); + for (int i = 0; i < N; ++i) { + ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i) * 0.5), E_OK); + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + + auto device_id = std::make_shared(device); + std::vector schemas; + ASSERT_EQ(reader.get_timeseries_schema(device_id, schemas), E_OK); + ASSERT_EQ(schemas.size(), 2u); + + // Match by name — IO reader iteration order isn't part of the contract. + common::TSDataType i32_type = common::INVALID_DATATYPE; + common::TSDataType dbl_type = common::INVALID_DATATYPE; + for (const auto& s : schemas) { + if (s.measurement_name_ == "v_i32") i32_type = s.data_type_; + if (s.measurement_name_ == "v_dbl") dbl_type = s.data_type_; + } + EXPECT_EQ(i32_type, INT32); + EXPECT_EQ(dbl_type, DOUBLE); + reader.close(); +} + +namespace storage { +class TsFileReaderMetaArenaTest { + public: + static int64_t arena_used(const storage::TsFileReader& r) { + return r.tsfile_reader_meta_pa_.get_total_used_bytes(); + } +}; +} // namespace storage + +// Regression: tsfile_reader_meta_pa_ used to be re-initialised at the start +// of each get_timeseries_metadata() call. When that reset was removed, +// every call accumulated another copy of the per-device meta into the same +// arena, so a long-lived reader that polled metadata kept growing memory +// without bound. Re-init now happens at the top of both overloads; verify +// arena usage stays flat across repeated calls instead of growing linearly. +TEST_F(TsFileReaderTest, RepeatedGetTimeseriesMetadataDoesNotLeakArena) { + const std::string device = "root.dev_arena_growth"; + { + std::vector reg; + reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), + E_OK); + } + TsRecord r(1000, device); + r.points_.emplace_back("v0", static_cast(0)); + ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + std::vector> ids = { + std::make_shared(device)}; + + // Prime the arena and capture the steady-state size. + (void)reader.get_timeseries_metadata(ids); + const int64_t after_one = + storage::TsFileReaderMetaArenaTest::arena_used(reader); + ASSERT_GT(after_one, 0); + + for (int i = 0; i < 10; ++i) { + (void)reader.get_timeseries_metadata(ids); + } + const int64_t after_eleven = + storage::TsFileReaderMetaArenaTest::arena_used(reader); + // Without the fix, after_eleven ≈ 11 × after_one. With the fix it + // should equal after_one (arena reset before each call). Allow a small + // slack for arena page rounding, but reject anything close to 2× growth. + EXPECT_LT(after_eleven, after_one * 2) + << "arena grew from " << after_one << " to " << after_eleven + << " across 11 calls — reset on entry is missing"; + reader.close(); +} diff --git a/cpp/test/writer/table_view/tsfile_writer_table_test.cc b/cpp/test/writer/table_view/tsfile_writer_table_test.cc index d1f3b92e4..0dfaccc06 100644 --- a/cpp/test/writer/table_view/tsfile_writer_table_test.cc +++ b/cpp/test/writer/table_view/tsfile_writer_table_test.cc @@ -20,7 +20,6 @@ #include -#include "common/global.h" #include "common/record.h" #include "common/schema.h" #include "common/tablet.h" @@ -32,11 +31,10 @@ using namespace storage; using namespace common; -class TsFileWriterTableTest : public ::testing::TestWithParam { +class TsFileWriterTableTest : public ::testing::Test { protected: void SetUp() override { libtsfile_init(); - set_parallel_write_enabled(GetParam()); file_name_ = std::string("tsfile_writer_table_test_") + generate_random_string(10) + std::string(".tsfile"); remove(file_name_.c_str()); @@ -135,7 +133,7 @@ class TsFileWriterTableTest : public ::testing::TestWithParam { } }; -TEST_P(TsFileWriterTableTest, WriteTableTest) { +TEST_F(TsFileWriterTableTest, WriteTableTest) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared(&write_file_, table_schema); @@ -146,7 +144,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTest) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) { +TEST_F(TsFileWriterTableTest, WithoutTagAndMultiPage) { std::vector measurement_schemas; std::vector column_categories; measurement_schemas.resize(1); @@ -194,7 +192,7 @@ TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteDisorderTest) { +TEST_F(TsFileWriterTableTest, WriteDisorderTest) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared(&write_file_, table_schema); @@ -239,12 +237,13 @@ TEST_P(TsFileWriterTableTest, WriteDisorderTest) { ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OUT_OF_ORDER); - ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK); - ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK); + ASSERT_EQ(tsfile_table_writer_->flush(), common::E_DATA_INCONSISTENCY); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY); + ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY); delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) { +TEST_F(TsFileWriterTableTest, WriteTableTestMultiFlush) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared( &write_file_, table_schema, 2 * 1024); @@ -257,7 +256,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) { +TEST_F(TsFileWriterTableTest, WriteNonExistColumnTest) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared(&write_file_, table_schema); @@ -285,7 +284,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) { +TEST_F(TsFileWriterTableTest, WriteNonExistTableTest) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared(&write_file_, table_schema); @@ -297,7 +296,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) { +TEST_F(TsFileWriterTableTest, WriterWithMemoryThreshold) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared( &write_file_, table_schema, 256 * 1024 * 1024); @@ -307,7 +306,7 @@ TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) { delete table_schema; } -TEST_P(TsFileWriterTableTest, EmptyTagWrite) { +TEST_F(TsFileWriterTableTest, EmptyTagWrite) { std::vector measurement_schemas; std::vector column_categories; measurement_schemas.resize(3); @@ -363,7 +362,7 @@ TEST_P(TsFileWriterTableTest, EmptyTagWrite) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) { +TEST_F(TsFileWriterTableTest, WritehDataTypeMisMatch) { auto table_schema = gen_table_schema(0); auto tsfile_table_writer_ = std::make_shared( &write_file_, table_schema, 256 * 1024 * 1024); @@ -414,7 +413,7 @@ TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) { tsfile_table_writer_->close(); } -TEST_P(TsFileWriterTableTest, WriteAndReadSimple) { +TEST_F(TsFileWriterTableTest, WriteAndReadSimple) { std::vector measurement_schemas; std::vector column_categories; measurement_schemas.resize(2); @@ -469,7 +468,7 @@ TEST_P(TsFileWriterTableTest, WriteAndReadSimple) { delete table_schema; } -TEST_P(TsFileWriterTableTest, DuplicateColumnName) { +TEST_F(TsFileWriterTableTest, DuplicateColumnName) { std::vector measurement_schemas; std::vector column_categories; measurement_schemas.resize(3); @@ -507,7 +506,7 @@ TEST_P(TsFileWriterTableTest, DuplicateColumnName) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) { +TEST_F(TsFileWriterTableTest, WriteWithNullAndEmptyTag) { std::vector measurement_schemas; std::vector column_categories; for (int i = 0; i < 3; i++) { @@ -639,7 +638,7 @@ TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) { ASSERT_EQ(reader.close(), common::E_OK); } -TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) { +TEST_F(TsFileWriterTableTest, MultiDeviceMultiFields) { common::config_set_max_degree_of_index_node(5); auto table_schema = gen_table_schema(0, 1, 100); auto tsfile_table_writer_ = @@ -698,7 +697,7 @@ TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) { delete table_schema; } -TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) { +TEST_F(TsFileWriterTableTest, WriteDataWithEmptyField) { std::vector measurement_schemas; std::vector column_categories; for (int i = 0; i < 3; i++) { @@ -775,7 +774,7 @@ TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) { ASSERT_EQ(reader.close(), common::E_OK); } -TEST_P(TsFileWriterTableTest, MultiDatatypes) { +TEST_F(TsFileWriterTableTest, MultiDatatypes) { std::vector measurement_schemas; std::vector column_categories; @@ -879,7 +878,7 @@ TEST_P(TsFileWriterTableTest, MultiDatatypes) { delete[] literal; } -TEST_P(TsFileWriterTableTest, DiffCodecTypes) { +TEST_F(TsFileWriterTableTest, DiffCodecTypes) { std::vector measurement_schemas; std::vector column_categories; @@ -987,7 +986,7 @@ TEST_P(TsFileWriterTableTest, DiffCodecTypes) { delete[] literal; } -TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) { +TEST_F(TsFileWriterTableTest, EncodingConfigIntegration) { // 1. Test setting global compression type ASSERT_EQ(E_OK, set_global_compression(SNAPPY)); @@ -1100,7 +1099,7 @@ TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) { } #ifdef ENABLE_MEM_STAT -TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) { +TEST_F(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) { TableSchema* table_schema = gen_table_schema(0, 2, 3); auto tsfile_table_writer = std::make_shared(&write_file_, table_schema); @@ -1175,8 +1174,3 @@ TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) { delete table_schema; } #endif - -INSTANTIATE_TEST_SUITE_P(Serial, TsFileWriterTableTest, - ::testing::Values(false)); -INSTANTIATE_TEST_SUITE_P(Parallel, TsFileWriterTableTest, - ::testing::Values(true)); \ No newline at end of file diff --git a/cpp/test/writer/tsfile_writer_test.cc b/cpp/test/writer/tsfile_writer_test.cc index 139761380..62d5167f3 100644 --- a/cpp/test/writer/tsfile_writer_test.cc +++ b/cpp/test/writer/tsfile_writer_test.cc @@ -20,12 +20,15 @@ #include +#include +#include #include #include "common/path.h" #include "common/record.h" #include "common/schema.h" #include "common/tablet.h" +#include "common/tsfile_common.h" #include "file/tsfile_io_writer.h" #include "file/write_file.h" #include "reader/qds_without_timegenerator.h" @@ -618,6 +621,74 @@ TEST_F(TsFileWriterTest, WriteMultipleTabletsDouble) { ASSERT_EQ(tsfile_writer_->close(), E_OK); } +// Regression: write_column() is the null fallback of the non-aligned batch +// path (write_column_batch -> has_null -> write_column). It used to handle +// only BOOLEAN/INT32/INT64/FLOAT/DOUBLE/STRING and ASSERT(false) otherwise; +// in NDEBUG that assert is a no-op, so a non-aligned TEXT/BLOB/DATE/TIMESTAMP +// column that contained a null silently dropped every row of that column. +// This writes a TEXT column with a null in the middle and verifies the two +// non-null rows survive the round trip. +TEST_F(TsFileWriterTest, NonAlignedTextColumnWithNullIsNotDropped) { + // Non-const: storage::Path's ctor takes non-const std::string&. + std::string device = "root.dev_text_null"; + std::string measure = "s_text"; + tsfile_writer_->register_timeseries( + device, MeasurementSchema(measure, common::TSDataType::TEXT, + common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED)); + + std::vector schema_vec; + schema_vec.emplace_back(measure, common::TSDataType::TEXT, + common::TSEncoding::PLAIN, + common::CompressionType::UNCOMPRESSED); + const int max_rows = 3; + storage::Tablet tablet( + device, std::make_shared>(schema_vec), + max_rows); + for (int row = 0; row < max_rows; row++) { + ASSERT_EQ(tablet.add_timestamp(row, 1000 + row), E_OK); + } + // Rows 0 and 2 get values; row 1 is left untouched, so its not-null bit + // stays set (default) — that is the null that forces the write_column + // fallback. + char buf0[] = "v0"; + char buf2[] = "v2"; + String s0(buf0, 2), s2(buf2, 2); + ASSERT_EQ(tablet.add_value(0, 0u, s0), E_OK); + ASSERT_EQ(tablet.add_value(2, 0u, s2), E_OK); + ASSERT_EQ(tsfile_writer_->write_tablet(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + std::vector select_list{storage::Path(device, measure)}; + storage::QueryExpression* query_expr = + storage::QueryExpression::create(select_list, nullptr); + storage::ResultSet* tmp_qds = nullptr; + ASSERT_EQ(reader.query(query_expr, tmp_qds), E_OK); + auto* qds = (QDSWithoutTimeGenerator*)tmp_qds; + + // The regression signal is row survival: before the fix write_column hit + // ASSERT(false) on TEXT (a no-op in NDEBUG), so the column was dropped and + // this query returned 0 rows. TEXT shares the identical (proven) string + // write path as STRING, so the two surviving rows at the right timestamps + // confirm the fix. field(1) is the value column, but field(0) is non-null + // here too — the result row carries the timestamp as field(0). + std::vector times; + bool has_next = false; + while (IS_SUCC(qds->next(has_next)) && has_next) { + storage::RowRecord* rec = qds->get_row_record(); + times.push_back(rec->get_timestamp()); + } + reader.destroy_query_data_set(qds); + reader.close(); + + ASSERT_EQ(times.size(), 2u); + EXPECT_EQ(times[0], 1000); + EXPECT_EQ(times[1], 1002); +} + TEST_F(TsFileWriterTest, FlushMultipleDevice) { const int device_num = 50; const int measurement_num = 50; @@ -699,6 +770,22 @@ TEST_F(TsFileWriterTest, FlushMultipleDevice) { } TEST_F(TsFileWriterTest, AnalyzeTsfileForload) { + // estimate_max_mem_size() now reflects the real 64 KiB-page footprint of + // each per-measurement output stream. 50 devices × 50 measurements × + // 2 streams × 64 KiB = ~320 MiB, well past the 128 MiB default + // chunk_group_size_threshold_ — without raising the cap the auto-flush + // would fire mid-write and the post-write hasData() check below would + // observe a freshly drained chunk writer. Lift the cap for the + // duration of this smoke test so the original semantics still apply. + uint32_t prev_threshold = + common::g_config_value_.chunk_group_size_threshold_; + struct Guard { + uint32_t prev; + ~Guard() { common::g_config_value_.chunk_group_size_threshold_ = prev; } + } guard{prev_threshold}; + common::g_config_value_.chunk_group_size_threshold_ = + 2ULL * 1024 * 1024 * 1024; + const int device_num = 50; const int measurement_num = 50; const int max_rows = 100; @@ -1070,6 +1157,214 @@ TEST_F(TsFileWriterTest, AlignedSealSync_ValueMemoryFirst) { ASSERT_EQ(reader.close(), E_OK); } +// Regression: write_tablet_aligned() writes the entire time column first and +// then each value column. With memory-based auto-seal still active, a large +// STRING value column hits the memory threshold mid-batch (say at row 5), +// while the INT64 time column does not seal until row page_writer_max_point +// is reached. Those divergent seals stamp misaligned page boundaries onto +// the file and read-back returns wrong values per row. Suppressing +// memory-driven seals during the batch should keep all pages count-aligned. +TEST_F(TsFileWriterTest, AlignedSealSync_TabletLargeStringValueMemoryFirst) { + uint32_t prev_pt = g_config_value_.page_writer_max_point_num_; + uint32_t prev_mem = g_config_value_.page_writer_max_memory_bytes_; + struct Guard { + uint32_t pt, mem; + ~Guard() { + g_config_value_.page_writer_max_point_num_ = pt; + g_config_value_.page_writer_max_memory_bytes_ = mem; + } + } guard{prev_pt, prev_mem}; + // Big point cap, tiny memory cap: time chunk (INT64 PLAIN, 8B/point) never + // hits memory before it reaches the point cap, while the STRING value + // chunk crosses the memory threshold within a handful of rows. + g_config_value_.page_writer_max_point_num_ = 10000; + g_config_value_.page_writer_max_memory_bytes_ = 512; + + std::string device_name = "device_tablet_str"; + std::vector schema_vec; + schema_vec.emplace_back("s0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("s1", STRING, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("s2", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + tsfile_writer_->register_aligned_timeseries(device_name, reg); + } + + const int row_num = 200; + Tablet tablet(device_name, + std::make_shared>(schema_vec), + row_num); + char* long_buf = new char[101]; + memset(long_buf, 'A', 100); + long_buf[100] = '\0'; + common::String str_val(long_buf, 100); + for (int i = 0; i < row_num; ++i) { + ASSERT_EQ(tablet.add_timestamp(i, 1622505600000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + // Sparse string column: every third row is null so we also exercise + // the bitmap path through the memory-pressured value page. + if (i % 3 != 0) { + ASSERT_EQ(tablet.add_value(i, 1u, str_val), E_OK); + } + ASSERT_EQ(tablet.add_value(i, 2u, static_cast(i * 10)), E_OK); + } + delete[] long_buf; + + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + std::string s0("s0"), s1("s1"), s2("s2"); + std::vector select_list; + select_list.emplace_back(device_name, s0); + select_list.emplace_back(device_name, s1); + select_list.emplace_back(device_name, s2); + storage::QueryExpression* qe = + storage::QueryExpression::create(select_list, nullptr); + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + storage::ResultSet* tmp_qds = nullptr; + ASSERT_EQ(reader.query(qe, tmp_qds), E_OK); + auto* qds = (QDSWithoutTimeGenerator*)tmp_qds; + + bool has_next = false; + int64_t cur_row = 0; + while (IS_SUCC(qds->next(has_next)) && has_next) { + auto* rec = qds->get_row_record(); + ASSERT_NE(rec, nullptr); + EXPECT_EQ(rec->get_timestamp(), 1622505600000 + cur_row); + EXPECT_EQ(field_to_string(rec->get_field(1)), std::to_string(cur_row)); + EXPECT_EQ(field_to_string(rec->get_field(3)), + std::to_string(cur_row * 10)); + cur_row++; + } + EXPECT_EQ(cur_row, row_num); + reader.destroy_query_data_set(qds); + ASSERT_EQ(reader.close(), E_OK); +} + +// Regression: write_tablet_aligned() used to discard time_write_column_batch +// errors and keep writing value columns. On an out-of-order tablet that left +// the time chunk with fewer rows than the value chunks (or with their seal +// flag still suppressed). The fix propagates the time-column error so no +// value column is touched and the page seal flags are restored. +TEST_F(TsFileWriterTest, AlignedTabletTimeBatchOutOfOrderAborts) { + std::string device_name = "device_aligned_out_of_order"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + tsfile_writer_->register_aligned_timeseries(device_name, reg); + } + + const int row_num = 16; + Tablet tablet(device_name, + std::make_shared>(schema_vec), + row_num); + // Non-monotonic timestamps trip TimePageWriter::write_batch's order check. + for (int i = 0; i < row_num; ++i) { + int64_t ts = (i == row_num - 1) ? 0 : 1000 + i; + ASSERT_EQ(tablet.add_timestamp(i, ts), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i * 2)), E_OK); + } + EXPECT_NE(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); +} + +// Regression: write_record_aligned used to ignore the time write return +// value, then unconditionally write each value column. An out-of-order +// timestamp would leave the time chunk one row short of every value chunk +// for the rest of the file. The fix propagates the time-write error and +// marks the writer unrecoverable when value-column writes diverge from +// time. +TEST_F(TsFileWriterTest, RecordAlignedOutOfOrderDoesNotAdvanceValueColumns) { + std::string device_name = "root.dev_aligned_record"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + tsfile_writer_->register_aligned_timeseries(device_name, reg); + } + + // First record at ts=1000 — should write cleanly. + TsRecord r1(1000, device_name); + r1.points_.emplace_back("v0", static_cast(0)); + r1.points_.emplace_back("v1", static_cast(0)); + ASSERT_EQ(tsfile_writer_->write_record_aligned(r1), E_OK); + + // Second record at the same timestamp 1000 — time_chunk_writer rejects + // it (E_OUT_OF_ORDER per TimePageWriter::write). The value columns + // must not advance. + TsRecord r2(1000, device_name); + r2.points_.emplace_back("v0", static_cast(99)); + r2.points_.emplace_back("v1", static_cast(99)); + EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_OUT_OF_ORDER); + // close() must succeed because the failure was caught before any value + // write — writer state is still consistent. + ASSERT_EQ(tsfile_writer_->close(), E_OK); +} + +// Regression: the aligned bulk-memcpy fast path in AlignedChunkReader only +// appended bytes to each Vector's value_data without calling add_row_nums(). +// Vector::row_num_ stayed at 0 while TsBlock::row_count_ jumped to N, so +// fill_trailling_nulls() then overwrote every just-written row as null +// (visible to the caller as all-null columns). +TEST_F(TsFileWriterTest, AlignedBulkMemcpyAdvancesVectorRowNum) { + std::string device_name = "device_bulk_rownum"; + std::vector schema_vec; + schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED); + schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED); + { + std::vector reg; + for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s)); + tsfile_writer_->register_aligned_timeseries(device_name, reg); + } + const int N = 64; + Tablet tablet(device_name, + std::make_shared>(schema_vec), + N); + for (int i = 0; i < N; i++) { + ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i * 2)), E_OK); + } + ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + // Read back via TsBlock — confirms the rows are visible. Under the + // bug Vector::row_num_ stayed at 0, fill_trailling_nulls() then + // marked every just-written row null; the iterator still reports + // them as rows so we check the non-null field for a real value. + std::vector select; + std::string s0("v0"), s1("v1"); + select.emplace_back(device_name, s0); + select.emplace_back(device_name, s1); + storage::QueryExpression* qe = + storage::QueryExpression::create(select, nullptr); + storage::TsFileReader reader; + ASSERT_EQ(reader.open(file_name_), E_OK); + storage::ResultSet* tmp = nullptr; + ASSERT_EQ(reader.query(qe, tmp), E_OK); + auto* qds = (QDSWithoutTimeGenerator*)tmp; + int got = 0; + bool has_next = false; + while (IS_SUCC(qds->next(has_next)) && has_next) { + auto* rec = qds->get_row_record(); + ASSERT_NE(rec, nullptr); + got++; + } + EXPECT_EQ(got, N); + reader.destroy_query_data_set(qds); + reader.close(); +} + TEST_F(TsFileWriterTest, WriteAlignedMultiFlush) { int measurement_num = 100, row_num = 100; std::string device_name = "device"; @@ -1256,4 +1551,145 @@ TEST_F(TsFileWriterTest, WriteTabletDataTypeMismatch) { ASSERT_EQ(E_TYPE_NOT_MATCH, tsfile_writer_->write_tablet_aligned(tablet)); ASSERT_EQ(tsfile_writer_->flush(), E_OK); ASSERT_EQ(tsfile_writer_->close(), E_OK); +} + +// Regression: partial-write failures (parallel aligned task failing mid-way, +// non-aligned column failing after earlier columns advanced, etc.) leave per- +// column chunk writers out of sync. The writer latches unrecoverable_ so +// subsequent flush/close/write must refuse rather than seal a corrupt file +// whose time and value chunks disagree on row count. Directly triggering +// the partial failure deterministically is hard, so this test asserts the +// downstream contract by flipping the flag through a friend hook. +namespace storage { +class TsFileWriterUnrecoverableTest { + public: + static void mark_unrecoverable(TsFileWriter& w) { w.unrecoverable_ = true; } +}; +} // namespace storage + +TEST_F(TsFileWriterTest, UnrecoverableLatchRefusesFlushCloseAndWrites) { + const std::string device = "root.dev_unrec"; + std::vector reg; + reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED)); + reg.push_back(new MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED)); + ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), E_OK); + + // Write one good row so a flush attempt would otherwise have data to emit. + TsRecord r(1000, device); + r.points_.emplace_back("v0", static_cast(0)); + r.points_.emplace_back("v1", static_cast(0)); + ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK); + + // Simulate the post-partial-failure state. + storage::TsFileWriterUnrecoverableTest::mark_unrecoverable(*tsfile_writer_); + + // Every public write/flush/close entry point must refuse. + EXPECT_EQ(tsfile_writer_->flush(), E_DATA_INCONSISTENCY); + EXPECT_EQ(tsfile_writer_->close(), E_DATA_INCONSISTENCY); + + TsRecord r2(1001, device); + r2.points_.emplace_back("v0", static_cast(1)); + r2.points_.emplace_back("v1", static_cast(1)); + EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_DATA_INCONSISTENCY); + + Tablet tablet(device, + std::make_shared>( + std::vector{ + MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED), + MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED)}), + 4); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(tablet.add_timestamp(i, 2000 + i), E_OK); + ASSERT_EQ(tablet.add_value(i, 0u, static_cast(i)), E_OK); + ASSERT_EQ(tablet.add_value(i, 1u, static_cast(i * 2)), E_OK); + } + EXPECT_EQ(tsfile_writer_->write_tablet_aligned(tablet), + E_DATA_INCONSISTENCY); + EXPECT_EQ(tsfile_writer_->write_tablet(tablet), E_DATA_INCONSISTENCY); +} + +namespace { + +WriteFile* OpenWriteFileFor(const std::string& path) { + int flags = O_WRONLY | O_CREAT | O_TRUNC; +#ifdef _WIN32 + flags |= O_BINARY; +#endif + auto* wf = new WriteFile; + if (wf->create(path, flags, 0666) != E_OK) { + delete wf; + return nullptr; + } + return wf; +} + +void WriteOneAlignedRow(TsFileWriter& w, const std::string& device, int64_t ts, + int64_t value) { + std::vector reg; + reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED)); + ASSERT_EQ(w.register_aligned_timeseries(device, reg), E_OK); + TsRecord r(ts, device); + r.points_.emplace_back("v0", value); + ASSERT_EQ(w.write_record_aligned(r), E_OK); +} + +} // namespace + +// Writing speed up: TsFileWriter must be reusable across a +// destroy() + init() cycle. +// - 1: TsFileIOWriter::destroy() left chunk_group_meta_list_ and +// chunk_group_meta_index_ pointing at meta_allocator_-owned memory that +// the next init() then re-armed; the next start_flush_chunk_group() +// linear scan would deref freed nodes. +// - 2: TsFileWriter::init() did not reset start_file_done_, so +// the second file's flush() skipped the magic/version header and +// produced a file the reader can't open. +// This test forces both code paths: destroy(), init() onto a fresh +// WriteFile, write data, close, then read the second file via the public +// TsFileReader API. +TEST_F(TsFileWriterTest, WriterReuseAfterDestroyProducesValidSecondFile) { + // First lifecycle uses the fixture-provided writer (already open()'d on + // file_name_). Write one row and close — this flushes the magic + + // version into file_name_ and flips start_file_done_ true. + WriteOneAlignedRow(*tsfile_writer_, "root.dev_first", 1000, 7); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + // Second lifecycle: tear down the previous writer state and re-init + // against a brand-new file. + tsfile_writer_->destroy(); + + const std::string second_path = std::string("tsfile_writer_reuse_test_") + + generate_random_string(10) + + std::string(".tsfile"); + remove(second_path.c_str()); + WriteFile* wf = OpenWriteFileFor(second_path); + ASSERT_NE(wf, nullptr); + ASSERT_EQ(tsfile_writer_->init(wf), E_OK); + + WriteOneAlignedRow(*tsfile_writer_, "root.dev_second", 2000, 9); + ASSERT_EQ(tsfile_writer_->flush(), E_OK); + ASSERT_EQ(tsfile_writer_->close(), E_OK); + + // The second file must start with the TsFile magic + version byte. + // The TsFileReader open path mostly indexes from the file tail, so a + // missing magic at offset 0 isn't caught by reader.open(). Inspect the + // raw header bytes instead — that's exactly what start_file_done_ guards. + { + std::ifstream in(second_path, std::ios::binary); + ASSERT_TRUE(in.is_open()); + char header[MAGIC_STRING_TSFILE_LEN + 1] = {0}; + in.read(header, MAGIC_STRING_TSFILE_LEN + 1); + EXPECT_EQ(in.gcount(), + static_cast(MAGIC_STRING_TSFILE_LEN + 1)); + EXPECT_EQ(memcmp(header, MAGIC_STRING_TSFILE, MAGIC_STRING_TSFILE_LEN), + 0) + << "second-file header is missing the TsFile magic — " + "start_file_done_ residual from the previous lifecycle"; + EXPECT_EQ(header[MAGIC_STRING_TSFILE_LEN], VERSION_NUM_BYTE); + } + + // wf was passed to init() but init() did not take ownership. + delete wf; + remove(second_path.c_str()); } \ No newline at end of file diff --git a/cpp/test/writer/value_page_writer_test.cc b/cpp/test/writer/value_page_writer_test.cc index 07666e189..be04586ee 100644 --- a/cpp/test/writer/value_page_writer_test.cc +++ b/cpp/test/writer/value_page_writer_test.cc @@ -106,3 +106,36 @@ TEST_F(ValuePageWriterTest, WritePageHeaderAndData) { common::E_OK); value_page_writer.destroy_page_data(); } + +// Regression: write_batch used to bump size_ and the page bitmap for every +// row in the batch *before* encoding the values. If the value encode failed +// mid-batch, the page would claim `count` rows had been written even though +// the encoder stream only held a prefix. The fix counts valid rows +// upfront, encodes, and only commits size_ / bitmap when the encode +// finishes cleanly. This test exercises the happy path on a mixed-null +// batch and asserts size_ and statistics agree with the row count — a +// subsequent code change that re-introduces premature size_ bumping +// without rolling back on failure would still pass this test, but it +// guards the encode-then-commit ordering contract against accidental +// rewrites. +TEST_F(ValuePageWriterTest, WriteBatchCommitsStateAfterEncode) { + ValuePageWriter w; + w.init(TSDataType::INT64, TSEncoding::PLAIN, UNCOMPRESSED); + + const uint32_t N = 5; + int64_t timestamps[N] = {100, 101, 102, 103, 104}; + int64_t values[N] = {10, 20, 30, 40, 50}; + common::BitMap nullmap; + ASSERT_EQ(nullmap.init(N), common::E_OK); + // bit=1 means null in the tablet bitmap convention. + nullmap.set(1); // row 1 (timestamp 101) is null + nullmap.set(3); // row 3 (timestamp 103) is null + ASSERT_EQ(w.write_batch(timestamps, values, nullmap, 0, N), common::E_OK); + + // size_ tracks every row regardless of nullness, statistic only the + // non-null subset. get_point_numer() returns size_ (rows incl. NULLs). + EXPECT_EQ(w.get_point_numer(), N); + auto* stat = static_cast(w.get_statistic()); + ASSERT_NE(stat, nullptr); + EXPECT_EQ(stat->count_, 3u); +} diff --git a/python/tests/test_tsfile_dataset.py b/python/tests/test_tsfile_dataset.py index f79a6d466..4e52a1b5f 100644 --- a/python/tests/test_tsfile_dataset.py +++ b/python/tests/test_tsfile_dataset.py @@ -688,10 +688,21 @@ def test_reader_catalog_shares_device_metadata_and_resolves_paths(tmp_path): def test_reader_read_series_by_row_retries_across_native_row_query_boundaries(): + """read_series_by_row pulls TsBlocks via read_arrow_batch and must keep + re-issuing query_table_by_row when the underlying native call stops at + an internal block boundary before the caller's window is filled.""" + + import pyarrow as pa + class _FakeResultSet: - def __init__(self, rows): - self._rows = rows - self._index = -1 + def __init__(self, times, values): + self._batch = pa.table( + { + "time": pa.array(times, type=pa.int64()), + "totalcloudcover": pa.array(values, type=pa.float64()), + } + ) + self._delivered = False def __enter__(self): return self @@ -699,12 +710,11 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): return False - def next(self): - self._index += 1 - return self._index < len(self._rows) - - def get_value_by_name(self, name): - return self._rows[self._index][name] + def read_arrow_batch(self): + if self._delivered or self._batch.num_rows == 0: + return None + self._delivered = True + return self._batch class _FakeNativeReader: def __init__(self, timestamps, values, boundary): @@ -713,28 +723,31 @@ def __init__(self, timestamps, values, boundary): self._boundary = boundary def query_table_by_row( - self, table_name, column_names, offset=0, limit=-1, tag_filter=None + self, + table_name, + column_names, + offset=0, + limit=-1, + tag_filter=None, + batch_size=0, ): assert table_name == "pvf" assert column_names == ["totalcloudcover"] assert tag_filter is None + assert batch_size > 0, "row reads should use batch (Arrow) mode" if limit < 0: stop = len(self._timestamps) else: stop = min(offset + limit, len(self._timestamps)) - # Simulate the current native bug: one row query cannot cross the - # next internal boundary, so callers must re-issue from the + # Simulate the native quirk where one query stops at the next + # internal block boundary; callers must re-issue from the # advanced offset to complete a large logical window. chunk_stop = min(stop, ((offset // self._boundary) + 1) * self._boundary) - rows = [ - { - "time": int(self._timestamps[idx]), - "totalcloudcover": float(self._values[idx]), - } - for idx in range(offset, chunk_stop) - ] - return _FakeResultSet(rows) + return _FakeResultSet( + self._timestamps[offset:chunk_stop], + self._values[offset:chunk_stop], + ) reader = object.__new__(TsFileSeriesReader) reader._reader = _FakeNativeReader( diff --git a/python/tsfile/dataset/reader.py b/python/tsfile/dataset/reader.py index 4899b2bf9..ffc38b07d 100644 --- a/python/tsfile/dataset/reader.py +++ b/python/tsfile/dataset/reader.py @@ -365,37 +365,44 @@ def read_series_by_row( tag_values = dict(zip(table_entry.tag_columns, device_entry.tag_values)) tag_filter = _build_exact_tag_filter(tag_values) if tag_values else None - # Some native row-query paths stop at an internal block boundary even - # when the requested window extends further. Re-issue from the advanced - # offset until we fill the caller's logical row window or reach EOF. + # Pull whole TsBlocks via the Arrow C-Data interface instead of + # iterating row-by-row in Python. Each result_set.next() + + # get_value_by_name() pair would be a Python<->C round-trip per row + # and dominates wall time on long slices; read_arrow_batch() returns + # a column-oriented batch in one call and lands directly in numpy. timestamp_parts = [] value_parts = [] remaining = limit next_offset = offset while remaining > 0: - batch_timestamps = [] - batch_values = [] + produced_this_call = 0 with self._reader.query_table_by_row( table_entry.table_name, [field_name], offset=next_offset, limit=remaining, tag_filter=tag_filter, + batch_size=65536, ) as result_set: - while result_set.next(): - batch_timestamps.append(result_set.get_value_by_name("time")) - value = result_set.get_value_by_name(field_name) - batch_values.append(np.nan if value is None else float(value)) - - if not batch_timestamps: + while True: + arrow_table = result_set.read_arrow_batch() + if arrow_table is None: + break + if arrow_table.num_rows == 0: + continue + timestamp_parts.append(arrow_table.column("time").to_numpy()) + raw_values = arrow_table.column(field_name).to_numpy( + zero_copy_only=False + ) + value_parts.append(np.asarray(raw_values, dtype=np.float64)) + produced_this_call += arrow_table.num_rows + + if produced_this_call == 0: break - timestamp_parts.append(np.asarray(batch_timestamps, dtype=np.int64)) - value_parts.append(np.asarray(batch_values, dtype=np.float64)) - read_count = len(batch_timestamps) - next_offset += read_count - remaining -= read_count + next_offset += produced_this_call + remaining -= produced_this_call if not timestamp_parts: return np.array([], dtype=np.int64), np.array([], dtype=np.float64) diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx index 9193e2c61..c9ecd78f7 100644 --- a/python/tsfile/tsfile_reader.pyx +++ b/python/tsfile/tsfile_reader.pyx @@ -199,7 +199,9 @@ cdef class ResultSetPy: if data_type == TSDataTypePy.INT32: return tsfile_result_set_get_value_by_index_int32_t(self.result, index) elif data_type == TSDataTypePy.DATE: - return parse_int_to_date(tsfile_result_set_get_value_by_index_int64_t(self.result, index)) + # DATE is physically stored as int32 (yyyymmdd), so read it through + # the int32 accessor that matches the underlying storage width. + return parse_int_to_date(tsfile_result_set_get_value_by_index_int32_t(self.result, index)) elif data_type == TSDataTypePy.INT64 or data_type == TSDataTypePy.TIMESTAMP: return tsfile_result_set_get_value_by_index_int64_t(self.result, index) elif data_type == TSDataTypePy.FLOAT: